From 954f8dce42be092ab3828295e14364bcfb19d9ff Mon Sep 17 00:00:00 2001 From: Toni Verbeiren Date: Fri, 26 Jan 2024 14:07:23 +0100 Subject: [PATCH 01/32] First commit, clone of cutadapt in htrnaseq + help.txt --- src/cutadapt/config.vsh.yaml | 50 ++++++++ src/cutadapt/help.txt | 218 +++++++++++++++++++++++++++++++++++ src/cutadapt/script.sh | 40 +++++++ 3 files changed, 308 insertions(+) create mode 100644 src/cutadapt/config.vsh.yaml create mode 100644 src/cutadapt/help.txt create mode 100644 src/cutadapt/script.sh diff --git a/src/cutadapt/config.vsh.yaml b/src/cutadapt/config.vsh.yaml new file mode 100644 index 00000000..18285219 --- /dev/null +++ b/src/cutadapt/config.vsh.yaml @@ -0,0 +1,50 @@ +functionality: + name: cutadapt + description: | + Demultiplexing using cutadapt + arguments: + # - name: "--input" + # type: file + # required: true + # default: file.fastq.gz + # multiple: true + # multiple_sep: " " + # direction: input + # must_exist: false + # - name: "--barcodesFile" + # type: file + # description: "Trim R1, the `file:/` part is automatically added later" + # default: barcodes.txt + # - name: "-e" + # alternatives: ["--error-rate"] + # type: string + # default: "0.10" + # - name: "-j" + # alternatives: ["--cores"] + # type: integer + # default: 0 + # description: Use the specified amount of CPU cores, default is for autodetect + # - name: "--outputDir" + # type: file + # default: demux + # direction: output + # description: Folder to write the output to, this is used in the -p and -o arguments of cutadapt + # - name: "--report" + # required: false + # type: file + # direction: output + # description: Filename for the report + # default: report.txt + resources: + - type: bash_script + path: script.sh +platforms: + - type: docker + image: python:3.8 + setup: + - type: python + pip: + - cutadapt + - type: nextflow + # directives: + # label: fat diff --git a/src/cutadapt/help.txt b/src/cutadapt/help.txt new file mode 100644 index 00000000..2280c3e2 --- /dev/null +++ b/src/cutadapt/help.txt @@ -0,0 +1,218 @@ +cutadapt version 4.6 + +Copyright (C) 2010 Marcel Martin and contributors + +Cutadapt removes adapter sequences from high-throughput sequencing reads. + +Usage: + cutadapt -a ADAPTER [options] [-o output.fastq] input.fastq + +For paired-end reads: + cutadapt -a ADAPT1 -A ADAPT2 [options] -o out1.fastq -p out2.fastq in1.fastq in2.fastq + +Replace "ADAPTER" with the actual sequence of your 3' adapter. IUPAC wildcard +characters are supported. All reads from input.fastq will be written to +output.fastq with the adapter sequence removed. Adapter matching is +error-tolerant. Multiple adapter sequences can be given (use further -a +options), but only the best-matching adapter will be removed. + +Input may also be in FASTA format. Compressed input and output is supported and +auto-detected from the file name (.gz, .xz, .bz2). Use the file name '-' for +standard input/output. Without the -o option, output is sent to standard output. + +Citation: + +Marcel Martin. Cutadapt removes adapter sequences from high-throughput +sequencing reads. EMBnet.Journal, 17(1):10-12, May 2011. +http://dx.doi.org/10.14806/ej.17.1.200 + +Run "cutadapt --help" to see all command-line options. +See https://cutadapt.readthedocs.io/ for full documentation. + +Options: + -h, --help Show this help message and exit + --version Show version number and exit + --debug Print debug log. Use twice to also print DP matrices + -j CORES, --cores CORES + Number of CPU cores to use. Use 0 to auto-detect. Default: + 1 + +Finding adapters: + Parameters -a, -g, -b specify adapters to be removed from each read (or from + R1 if data is paired-end. If specified multiple times, only the best matching + adapter is trimmed (but see the --times option). Use notation 'file:FILE' to + read adapter sequences from a FASTA file. + + -a ADAPTER, --adapter ADAPTER + Sequence of an adapter ligated to the 3' end (paired data: + of the first read). The adapter and subsequent bases are + trimmed. If a '$' character is appended ('anchoring'), the + adapter is only found if it is a suffix of the read. + -g ADAPTER, --front ADAPTER + Sequence of an adapter ligated to the 5' end (paired data: + of the first read). The adapter and any preceding bases + are trimmed. Partial matches at the 5' end are allowed. If + a '^' character is prepended ('anchoring'), the adapter is + only found if it is a prefix of the read. + -b ADAPTER, --anywhere ADAPTER + Sequence of an adapter that may be ligated to the 5' or 3' + end (paired data: of the first read). Both types of + matches as described under -a and -g are allowed. If the + first base of the read is part of the match, the behavior + is as with -g, otherwise as with -a. This option is mostly + for rescuing failed library preparations - do not use if + you know which end your adapter was ligated to! + -e E, --error-rate E, --errors E + Maximum allowed error rate (if 0 <= E < 1), or absolute + number of errors for full-length adapter match (if E is an + integer >= 1). Error rate = no. of errors divided by + length of matching region. Default: 0.1 (10%) + --no-indels Allow only mismatches in alignments. Default: allow both + mismatches and indels + -n COUNT, --times COUNT + Remove up to COUNT adapters from each read. Default: 1 + -O MINLENGTH, --overlap MINLENGTH + Require MINLENGTH overlap between read and adapter for an + adapter to be found. Default: 3 + --match-read-wildcards + Interpret IUPAC wildcards in reads. Default: False + -N, --no-match-adapter-wildcards + Do not interpret IUPAC wildcards in adapters. + --action {trim,retain,mask,lowercase,none} + What to do if a match was found. trim: trim adapter and + up- or downstream sequence; retain: trim, but retain + adapter; mask: replace with 'N' characters; lowercase: + convert to lowercase; none: leave unchanged. Default: trim + --rc, --revcomp Check both the read and its reverse complement for adapter + matches. If match is on reverse-complemented version, + output that one. Default: check only read + +Additional read modifications: + -u LEN, --cut LEN Remove LEN bases from each read (or R1 if paired; use -U + option for R2). If LEN is positive, remove bases from the + beginning. If LEN is negative, remove bases from the end. + Can be used twice if LENs have different signs. Applied + *before* adapter trimming. + --nextseq-trim 3'CUTOFF + NextSeq-specific quality trimming (each read). Trims also + dark cycles appearing as high-quality G bases. + -q [5'CUTOFF,]3'CUTOFF, --quality-cutoff [5'CUTOFF,]3'CUTOFF + Trim low-quality bases from 5' and/or 3' ends of each read + before adapter removal. Applied to both reads if data is + paired. If one value is given, only the 3' end is trimmed. + If two comma-separated cutoffs are given, the 5' end is + trimmed with the first cutoff, the 3' end with the second. + --quality-base N Assume that quality values in FASTQ are encoded as + ascii(quality + N). This needs to be set to 64 for some + old Illumina FASTQ files. Default: 33 + --poly-a Trim poly-A tails + --length LENGTH, -l LENGTH + Shorten reads to LENGTH. Positive values remove bases at + the end while negative ones remove bases at the beginning. + This and the following modifications are applied after + adapter trimming. + --trim-n Trim N's on ends of reads. + --length-tag TAG Search for TAG followed by a decimal number in the + description field of the read. Replace the decimal number + with the correct length of the trimmed read. For example, + use --length-tag 'length=' to correct fields like + 'length=123'. + --strip-suffix STRIP_SUFFIX + Remove this suffix from read names if present. Can be + given multiple times. + -x PREFIX, --prefix PREFIX + Add this prefix to read names. Use {name} to insert the + name of the matching adapter. + -y SUFFIX, --suffix SUFFIX + Add this suffix to read names; can also include {name} + --rename TEMPLATE Rename reads using TEMPLATE containing variables such as + {id}, {adapter_name} etc. (see documentation) + --zero-cap, -z Change negative quality values to zero. + +Filtering of processed reads: + Filters are applied after above read modifications. Paired-end reads are + always discarded pairwise (see also --pair-filter). + + -m LEN[:LEN2], --minimum-length LEN[:LEN2] + Discard reads shorter than LEN. Default: 0 + -M LEN[:LEN2], --maximum-length LEN[:LEN2] + Discard reads longer than LEN. Default: no limit + --max-n COUNT Discard reads with more than COUNT 'N' bases. If COUNT is + a number between 0 and 1, it is interpreted as a fraction + of the read length. + --max-expected-errors ERRORS, --max-ee ERRORS + Discard reads whose expected number of errors (computed + from quality values) exceeds ERRORS. + --max-average-error-rate ERROR_RATE, --max-aer ERROR_RATE + as --max-expected-errors (see above), but divided by + length to account for reads of varying length. + --discard-trimmed, --discard + Discard reads that contain an adapter. Use also -O to + avoid discarding too many randomly matching reads. + --discard-untrimmed, --trimmed-only + Discard reads that do not contain an adapter. + --discard-casava Discard reads that did not pass CASAVA filtering (header + has :Y:). + +Output: + --quiet Print only error messages. + --report {full,minimal} + Which type of report to print: 'full' or 'minimal'. + Default: full + --json FILE Dump report in JSON format to FILE + -o FILE, --output FILE + Write trimmed reads to FILE. FASTQ or FASTA format is + chosen depending on input. Summary report is sent to + standard output. Use '{name}' for demultiplexing (see + docs). Default: write to standard output + --fasta Output FASTA to standard output even on FASTQ input. + -Z Use compression level 1 for gzipped output files (faster, + but uses more space) + --info-file FILE Write information about each read and its adapter matches + into FILE. See the documentation for the file format. + -r FILE, --rest-file FILE + When the adapter matches in the middle of a read, write + the rest (after the adapter) to FILE. + --wildcard-file FILE When the adapter has N wildcard bases, write adapter bases + matching wildcard positions to FILE. (Inaccurate with + indels.) + --too-short-output FILE + Write reads that are too short (according to length + specified by -m) to FILE. Default: discard reads + --too-long-output FILE + Write reads that are too long (according to length + specified by -M) to FILE. Default: discard reads + --untrimmed-output FILE + Write reads that do not contain any adapter to FILE. + Default: output to same file as trimmed reads + +Paired-end options: + The -A/-G/-B/-U/-Q options work like their lowercase counterparts, but are + applied to R2 (second read in pair) + + -A ADAPTER 3' adapter to be removed from R2 + -G ADAPTER 5' adapter to be removed from R2 + -B ADAPTER 5'/3 adapter to be removed from R2 + -U LENGTH Remove LENGTH bases from R2 + -Q [5'CUTOFF,]3'CUTOFF + Quality-trimming cutoff for R2. Default: same as for R1 + -p FILE, --paired-output FILE + Write R2 to FILE. + --pair-adapters Treat adapters given with -a/-A etc. as pairs. Either both + or none are removed from each read pair. + --pair-filter {any,both,first} + Which of the reads in a paired-end read have to match the + filtering criterion in order for the pair to be filtered. + Default: any + --interleaved Read and/or write interleaved paired-end reads. + --untrimmed-paired-output FILE + Write second read in a pair to this FILE when no adapter + was found. Use with --untrimmed-output. Default: output to + same file as trimmed reads + --too-short-paired-output FILE + Write second read in a pair to this file if pair is too + short. + --too-long-paired-output FILE + Write second read in a pair to this file if pair is too + long. + diff --git a/src/cutadapt/script.sh b/src/cutadapt/script.sh new file mode 100644 index 00000000..39eb3b70 --- /dev/null +++ b/src/cutadapt/script.sh @@ -0,0 +1,40 @@ +#!/bin/bash + +if [ -z $par_outputDir ]; then + par_outputDir=. +else + mkdir -p "$par_outputDir" +fi + +for f in $par_input; do + [ ! -f "$f" ] && echo "The input file $f does not exist" && exit 1 +done + +barcodesFasta="barcodes.fasta" + +awk '{print ">"$1"\n""^"$1}' $par_barcodesFile >$barcodesFasta + +fastqFiles=$(echo $par_input | tr " " "\n") +for file in $fastqFiles; do + if echo "$file" | grep -q R1; then + input_R1=$(echo $file | grep R1) + fi + if echo "$file" | grep -q R2; then + input_R2=$(echo $file | grep R2) + fi +done +demuxFilesIn="$input_R1 $input_R2" + +# Note to self: +# The eval is here to expand shell globs, this way it is possible to use +# for instance pointers to ".../...R?....fastq", but please use the double +# quotes and an absolute path! +eval /usr/local/bin/cutadapt \ + -e "$par_e" \ + --no-indels \ + --action=none \ + --cores=0 \ + -g "file:$barcodesFasta" \ + -o "$par_outputDir/{name}_R1_001.fastq" \ + -p "$par_outputDir/{name}_R2_001.fastq" \ + "$demuxFilesIn" >"$par_report" From be3fb5a16a5ef78aace7a54ad576372b6db233e5 Mon Sep 17 00:00:00 2001 From: Toni Verbeiren Date: Tue, 30 Jan 2024 10:26:20 +0100 Subject: [PATCH 02/32] Add config --- src/cutadapt/config.vsh.yaml | 461 ++++++++++++++++++++++++++++++++--- src/cutadapt/script.sh | 120 ++++++--- 2 files changed, 511 insertions(+), 70 deletions(-) diff --git a/src/cutadapt/config.vsh.yaml b/src/cutadapt/config.vsh.yaml index 18285219..a1a950df 100644 --- a/src/cutadapt/config.vsh.yaml +++ b/src/cutadapt/config.vsh.yaml @@ -1,40 +1,431 @@ functionality: name: cutadapt description: | - Demultiplexing using cutadapt - arguments: - # - name: "--input" - # type: file - # required: true - # default: file.fastq.gz - # multiple: true - # multiple_sep: " " - # direction: input - # must_exist: false - # - name: "--barcodesFile" - # type: file - # description: "Trim R1, the `file:/` part is automatically added later" - # default: barcodes.txt - # - name: "-e" - # alternatives: ["--error-rate"] - # type: string - # default: "0.10" - # - name: "-j" - # alternatives: ["--cores"] - # type: integer - # default: 0 - # description: Use the specified amount of CPU cores, default is for autodetect - # - name: "--outputDir" - # type: file - # default: demux - # direction: output - # description: Folder to write the output to, this is used in the -p and -o arguments of cutadapt - # - name: "--report" - # required: false - # type: file - # direction: output - # description: Filename for the report - # default: report.txt + Cutadapt removes adapter sequences from high-throughput sequencing reads. + info: + keywords: [RNA-seq, scRNA-seq, high-throughput] + homepage: https://cutadapt.readthedocs.io/ + documentation: https://cutadapt.readthedocs.io/ + repository: https://github.com/marcelm/cutadapt + reference: http://dx.doi.org/10.14806/ej.17.1.200 + license: MIT + argument_groups: + #################################################################### + - name: Specify Adapters for R1 + arguments: + - name: --adapter + alternatives: [-a] + type: string + multiple: true + description: | + Sequence of an adapter ligated to the 3' end (paired data: + of the first read). The adapter and subsequent bases are + trimmed. If a '$' character is appended ('anchoring'), the + adapter is only found if it is a suffix of the read. + required: false + - name: --front + alternatives: [-g] + type: string + multiple: true + description: | + Sequence of an adapter ligated to the 5' end (paired data: + of the first read). The adapter and any preceding bases + are trimmed. Partial matches at the 5' end are allowed. If + a '^' character is prepended ('anchoring'), the adapter is + only found if it is a prefix of the read. + required: false + - name: --anywhere + alternatives: [-b] + type: string + multiple: true + description: | + Sequence of an adapter that may be ligated to the 5' or 3' + end (paired data: of the first read). Both types of + matches as described under -a and -g are allowed. If the + first base of the read is part of the match, the behavior + is as with -g, otherwise as with -a. This option is mostly + for rescuing failed library preparations - do not use if + you know which end your adapter was ligated to! + required: false + + #################################################################### + - name: Specify Adapters using Fasta files for R1 + arguments: + - name: --adapter_fasta + type: string + multiple: true + description: | + Fasta file containing sequences of an adapter ligated to the 3' end (paired data: + of the first read). The adapter and subsequent bases are + trimmed. If a '$' character is appended ('anchoring'), the + adapter is only found if it is a suffix of the read. + required: false + - name: --front_fasta + type: string + multiple: true + description: | + Fasta file containing sequences of an adapter ligated to the 5' end (paired data: + of the first read). The adapter and any preceding bases + are trimmed. Partial matches at the 5' end are allowed. If + a '^' character is prepended ('anchoring'), the adapter is + only found if it is a prefix of the read. + required: false + - name: --anywhere_fasta + type: string + multiple: true + description: | + Fasta file containing sequences of an adapter that may be ligated to the 5' or 3' + end (paired data: of the first read). Both types of + matches as described under -a and -g are allowed. If the + first base of the read is part of the match, the behavior + is as with -g, otherwise as with -a. This option is mostly + for rescuing failed library preparations - do not use if + you know which end your adapter was ligated to! + required: false + + #################################################################### + - name: Specify Adapters for R2 + arguments: + - name: --adapterR2 + alternatives: [-A] + type: string + multiple: true + description: | + Sequence of an adapter ligated to the 3' end (paired data: + of the first read). The adapter and subsequent bases are + trimmed. If a '$' character is appended ('anchoring'), the + adapter is only found if it is a suffix of the read. + required: false + - name: --frontR2 + alternatives: [-G] + type: string + multiple: true + description: | + Sequence of an adapter ligated to the 5' end (paired data: + of the first read). The adapter and any preceding bases + are trimmed. Partial matches at the 5' end are allowed. If + a '^' character is prepended ('anchoring'), the adapter is + only found if it is a prefix of the read. + required: false + - name: --anywhereR2 + alternatives: [-B] + type: string + multiple: true + description: | + Sequence of an adapter that may be ligated to the 5' or 3' + end (paired data: of the first read). Both types of + matches as described under -a and -g are allowed. If the + first base of the read is part of the match, the behavior + is as with -g, otherwise as with -a. This option is mostly + for rescuing failed library preparations - do not use if + you know which end your adapter was ligated to! + required: false + + #################################################################### + - name: Specify Adapters using Fasta files for R2 + arguments: + - name: --adapterR2_fasta + type: string + multiple: true + description: | + Fasta file containing sequences of an adapter ligated to the 3' end (paired data: + of the first read). The adapter and subsequent bases are + trimmed. If a '$' character is appended ('anchoring'), the + adapter is only found if it is a suffix of the read. + required: false + - name: --frontR2_fasta + type: string + multiple: true + description: | + Fasta file containing sequences of an adapter ligated to the 5' end (paired data: + of the first read). The adapter and any preceding bases + are trimmed. Partial matches at the 5' end are allowed. If + a '^' character is prepended ('anchoring'), the adapter is + only found if it is a prefix of the read. + required: false + - name: --anywhereR2_fasta + type: string + multiple: true + description: | + Fasta file containing sequences of an adapter that may be ligated to the 5' or 3' + end (paired data: of the first read). Both types of + matches as described under -a and -g are allowed. If the + first base of the read is part of the match, the behavior + is as with -g, otherwise as with -a. This option is mostly + for rescuing failed library preparations - do not use if + you know which end your adapter was ligated to! + required: false + + #################################################################### + - name: Paired-end options + arguments: + - name: --pair_adapters + type: boolean_true + description: | + Treat adapters given with -a/-A etc. as pairs. Either both + or none are removed from each read pair. + - name: --pair_filter + type: string + choices: [any, both, first] + description: | + Which of the reads in a paired-end read have to match the + filtering criterion in order for the pair to be filtered. + default: any + - name: --interleaved + type: boolean_true + description: | + Read and/or write interleaved paired-end reads. + + #################################################################### + - name: Input parameters + arguments: + - name: --error_rate + alternatives: [-E, --errors] + type: double + description: | + Maximum allowed error rate (if 0 <= E < 1), or absolute + number of errors for full-length adapter match (if E is an + integer >= 1). Error rate = no. of errors divided by + length of matching region. + default: 0.1 + - name: --no_indels + type: boolean_false + description: | + Allow only mismatches in alignments. + - name: --times + type: integer + alternatives: [-n] + description: | + Remove up to COUNT adapters from each read. + default: 1 + - name: --overlap + alternatives: [-O] + type: integer + description: | + Require MINLENGTH overlap between read and adapter for an + adapter to be found. + default: 3 + - name: --match_read_wildcards + type: boolean_false + description: | + Interpret IUPAC wildcards in reads. + - name: --no_match_adapter_wildcards + type: boolean_true + description: | + Do not interpret IUPAC wildcards in adapters. + - name: --action + type: string + choices: + - trim + - retain + - mask + - lowercase + - none + description: | + What to do if a match was found. trim: trim adapter and + up- or downstream sequence; retain: trim, but retain + adapter; mask: replace with 'N' characters; lowercase: + convert to lowercase; none: leave unchanged. + default: trim + - name: --revcomp + alternatives: [--rc] + type: boolean_true + description: | + Check both the read and its reverse complement for adapter + matches. If match is on reverse-complemented version, + output that one. + + #################################################################### + - name: Read modifications + arguments: + - name: --cut + alternatives: [-u] + type: integer + multiple: true + description: | + Remove LEN bases from each read (or R1 if paired; use --cutR2 + option for R2). If LEN is positive, remove bases from the + beginning. If LEN is negative, remove bases from the end. + Can be used twice if LENs have different signs. Applied + *before* adapter trimming. + - name: --cutR2 + type: integer + multiple: true + description: | + Remove LEN bases from each read (for R2). If LEN is positive, remove bases from the + beginning. If LEN is negative, remove bases from the end. + Can be used twice if LENs have different signs. Applied + *before* adapter trimming. + - name: --nextseq_trim + type: string + description: | + NextSeq-specific quality trimming (each read). Trims also + dark cycles appearing as high-quality G bases. + - name: --quality_cutoff + alternatives: [-q] + type: string + description: | + Trim low-quality bases from 5' and/or 3' ends of each read + before adapter removal. Applied to both reads if data is + paired. If one value is given, only the 3' end is trimmed. + If two comma-separated cutoffs are given, the 5' end is + trimmed with the first cutoff, the 3' end with the second. + - name: --quality_cutoffR2 + alternatives: [-Q] + type: string + description: | + Quality-trimming cutoff for R2. Default: same as for R1 + - name: --quality_base + type: integer + description: | + Assume that quality values in FASTQ are encoded as + ascii(quality + N). This needs to be set to 64 for some + old Illumina FASTQ files. + default: 33 + - name: --poly_a + type: boolean_true + description: Trim poly-A tails + - name: --length + alternatives: [-l] + type: integer + description: | + Shorten reads to LENGTH. Positive values remove bases at + the end while negative ones remove bases at the beginning. + This and the following modifications are applied after + adapter trimming. + - name: --trim_n + type: boolean_true + description: Trim N's on ends of reads. + - name: --length_tag + type: string + description: | + Search for TAG followed by a decimal number in the + description field of the read. Replace the decimal number + with the correct length of the trimmed read. For example, + use --length-tag 'length=' to correct fields like + 'length=123'. + example: "length=" + - name: --strip_suffix + type: string + description: | + Remove this suffix from read names if present. Can be + given multiple times. + - name: --prefix + alternatives: [-x] + type: string + description: | + Add this prefix to read names. Use {name} to insert the + name of the matching adapter. + - name: --suffix + alternatives: [-y] + type: string + description: | + Add this suffix to read names; can also include {name} + - name: --rename + type: string + description: | + Rename reads using TEMPLATE containing variables such as + {id}, {adapter_name} etc. (see documentation) + - name: --zero_cap + alternatives: [-z] + type: boolean_true + description: Change negative quality values to zero. + + #################################################################### + - name: Filtering of processed reads + description: | + Filters are applied after above read modifications. Paired-end reads are + always discarded pairwise (see also --pair-filter). + arguments: + - name: --minimum_length + alternatives: [-m] + type: string + description: | + Discard reads shorter than LEN. + default: "0" + - name: --maximum_length + alternatives: [-M] + type: string + description: | + Discard reads longer than LEN. Default: no limit + - name: --max_n + type: string + description: | + Discard reads with more than COUNT 'N' bases. If COUNT is + a number between 0 and 1, it is interpreted as a fraction + of the read length. + - name: --max_expected_errors + alternatives: [--max_ee] + type: long + description: | + Discard reads whose expected number of errors (computed + from quality values) exceeds ERRORS. + - name: --max_average_error_rate + alternatives: [--max_aer] + type: long + description: | + as --max_expected_errors (see above), but divided by + length to account for reads of varying length. + - name: --discard_trimmed + alternatives: [--discard] + type: boolean_true + description: | + Discard reads that contain an adapter. Use also -O to + avoid discarding too many randomly matching reads. + - name: --discard_untrimmed + alternatives: [--trimmed_only] + type: boolean_true + description: | + Discard reads that do not contain an adapter. + - name: --discard_casava + type: boolean_true + description: | + Discard reads that did not pass CASAVA filtering (header + has :Y:). + + #################################################################### + - name: Output parameters + arguments: + - name: --report + type: string + choices: [full, minimal] + description: | + Which type of report to print: 'full' or 'minimal'. + default: full + - name: --json + type: boolean_true + description: | + Write report in JSON format to report.json in + the output directory. + - name: --output + type: file + description: | + Write trimmed reads to this directory and name the files using {name}. + FASTQ or FASTA format is chosen depending on input. + Summary report is sent to standard output. + default: output/ + direction: output + required: true + must_exist: true + - name: --fasta + type: boolean_true + description: | + Output FASTA to standard output even on FASTQ input. + - name: --info_file + type: boolean_true + description: | + Write information about each read and its adapter matches + into info.txt in the output directory. + See the documentation for the file format. + # - name: -Z + # - name: --rest_file + # - name: --wildcard-file + # - name: --too_short_output + # - name: --too_long_output + # - name: --untrimmed_output + # - name: --untrimmed_paired_output + # - name: too_short_paired_output + # - name: too_long_paired_output resources: - type: bash_script path: script.sh @@ -46,5 +437,3 @@ platforms: pip: - cutadapt - type: nextflow - # directives: - # label: fat diff --git a/src/cutadapt/script.sh b/src/cutadapt/script.sh index 39eb3b70..aba0cfca 100644 --- a/src/cutadapt/script.sh +++ b/src/cutadapt/script.sh @@ -1,40 +1,92 @@ #!/bin/bash -if [ -z $par_outputDir ]; then - par_outputDir=. +if [ -z $par_output ]; then + par_output=. else - mkdir -p "$par_outputDir" + mkdir -p "$par_output" fi -for f in $par_input; do - [ ! -f "$f" ] && echo "The input file $f does not exist" && exit 1 -done +echo "par_adapter: $par_adapter" +echo "par_front: $par_front" +echo "par_anywhere: $par_anywhere" +echo "par_adapter_fasta: $par_adapter_fasta" +echo "par_front_fasta: $par_front_fasta" +echo "par_anywhere_fasta: $par_anywhere_fasta" +echo "par_adapterR2: $par_adapterR2" +echo "par_frontR2: $par_frontR2" +echo "par_anywhereR2: $par_anywhereR2" +echo "par_adapterR2_fasta: $par_adapterR2_fasta" +echo "par_frontR2_fasta: $par_frontR2_fasta" +echo "par_anywhereR2_fasta: $par_anywhereR2_fasta" +echo "par_pair_adapters: $par_pair_adapters" +echo "par_pair_filter: $par_pair_filter" +echo "par_interleaved: $par_interleaved" +echo "par_error_rate: $par_error_rate" +echo "par_no_indels: $par_no_indels" +echo "par_times: $par_times" +echo "par_overlap: $par_overlap" +echo "par_match_read_wildcards: $par_match_read_wildcards" +echo "no_match_adapter_wildcards: $no_match_adapter_wildcards" +echo "par_action: $par_action" +echo "par_revcomp: $par_revcomp" +echo "par_cut: $par_cut" +echo "par_cutR2: $par_cutR2" +echo "par_nextseq_trim: $par_nextseq_trim" +echo "par_quality_cutoff: $par_quality_cutoff" +echo "par_quality_cutoffR2: $par_quality_cutoffR2" +echo "par_quality_base: $par_quality_base" +echo "par_poly_a: $par_poly_a" +echo "par_length: $par_length" +echo "par_trim_n: $par_trim_n" +echo "par_length_tag: $par_length_tag" +echo "par_strip_suffix: $par_strip_suffix" +echo "par_prefix: $par_prefix" +echo "par_suffix: $par_suffix" +echo "par_rename: $par_rename" +echo "par_zero_cap: $par_zero_cap" +echo "par_minimum_length: $par_minimum_length" +echo "par_maximum_length: $par_maximum_length" +echo "par_max_n: $par_max_n" +echo "par_max_expected_errors: $par_max_expected_errors" +echo "par_max_average_error_rate: $par_max_average_error_rate" +echo "par_discard_trimmed: $par_discard_trimmed" +echo "par_discard_untrimmed: $par_discard_untrimmed" +echo "par_discard_casava: $par_discard_casava" +echo "par_report: $par_report" +echo "par_json: $par_json" +echo "par_output: $par_output" +echo "par_fasta: $par_fasta" +echo "par_info_file: $par_info_file" -barcodesFasta="barcodes.fasta" - -awk '{print ">"$1"\n""^"$1}' $par_barcodesFile >$barcodesFasta - -fastqFiles=$(echo $par_input | tr " " "\n") -for file in $fastqFiles; do - if echo "$file" | grep -q R1; then - input_R1=$(echo $file | grep R1) - fi - if echo "$file" | grep -q R2; then - input_R2=$(echo $file | grep R2) - fi -done -demuxFilesIn="$input_R1 $input_R2" - -# Note to self: -# The eval is here to expand shell globs, this way it is possible to use -# for instance pointers to ".../...R?....fastq", but please use the double -# quotes and an absolute path! -eval /usr/local/bin/cutadapt \ - -e "$par_e" \ - --no-indels \ - --action=none \ - --cores=0 \ - -g "file:$barcodesFasta" \ - -o "$par_outputDir/{name}_R1_001.fastq" \ - -p "$par_outputDir/{name}_R2_001.fastq" \ - "$demuxFilesIn" >"$par_report" +# for f in $par_input; do +# [ ! -f "$f" ] && echo "The input file $f does not exist" && exit 1 +# done +# +# barcodesFasta="barcodes.fasta" +# +# awk '{print ">"$1"\n""^"$1}' $par_barcodesFile >$barcodesFasta +# +# fastqFiles=$(echo $par_input | tr " " "\n") +# for file in $fastqFiles; do +# if echo "$file" | grep -q R1; then +# input_R1=$(echo $file | grep R1) +# fi +# if echo "$file" | grep -q R2; then +# input_R2=$(echo $file | grep R2) +# fi +# done +# demuxFilesIn="$input_R1 $input_R2" +# +# # Note to self: +# # The eval is here to expand shell globs, this way it is possible to use +# # for instance pointers to ".../...R?....fastq", but please use the double +# # quotes and an absolute path! +# eval /usr/local/bin/cutadapt \ +# -e "$par_e" \ +# --no-indels \ +# --action=none \ +# --cores=0 \ +# -g "file:$barcodesFasta" \ +# -o "$par_outputDir/{name}_R1_001.fastq" \ +# -p "$par_outputDir/{name}_R2_001.fastq" \ +# "$demuxFilesIn" >"$par_report" From 0de2a361d8d6f921c03a41a66a97fe23329ad5a6 Mon Sep 17 00:00:00 2001 From: Toni Verbeiren Date: Tue, 6 Feb 2024 15:02:42 +0100 Subject: [PATCH 03/32] Don't allow multiple: true when providing a FASTA file with adapters --- src/cutadapt/config.vsh.yaml | 6 ------ 1 file changed, 6 deletions(-) diff --git a/src/cutadapt/config.vsh.yaml b/src/cutadapt/config.vsh.yaml index a1a950df..72b0a7fc 100644 --- a/src/cutadapt/config.vsh.yaml +++ b/src/cutadapt/config.vsh.yaml @@ -53,7 +53,6 @@ functionality: arguments: - name: --adapter_fasta type: string - multiple: true description: | Fasta file containing sequences of an adapter ligated to the 3' end (paired data: of the first read). The adapter and subsequent bases are @@ -62,7 +61,6 @@ functionality: required: false - name: --front_fasta type: string - multiple: true description: | Fasta file containing sequences of an adapter ligated to the 5' end (paired data: of the first read). The adapter and any preceding bases @@ -72,7 +70,6 @@ functionality: required: false - name: --anywhere_fasta type: string - multiple: true description: | Fasta file containing sequences of an adapter that may be ligated to the 5' or 3' end (paired data: of the first read). Both types of @@ -126,7 +123,6 @@ functionality: arguments: - name: --adapterR2_fasta type: string - multiple: true description: | Fasta file containing sequences of an adapter ligated to the 3' end (paired data: of the first read). The adapter and subsequent bases are @@ -135,7 +131,6 @@ functionality: required: false - name: --frontR2_fasta type: string - multiple: true description: | Fasta file containing sequences of an adapter ligated to the 5' end (paired data: of the first read). The adapter and any preceding bases @@ -145,7 +140,6 @@ functionality: required: false - name: --anywhereR2_fasta type: string - multiple: true description: | Fasta file containing sequences of an adapter that may be ligated to the 5' or 3' end (paired data: of the first read). Both types of From e0685a52ed812cf2510175fed08b455fa52e6c44 Mon Sep 17 00:00:00 2001 From: Toni Verbeiren Date: Wed, 7 Feb 2024 14:17:22 +0100 Subject: [PATCH 04/32] First version of script --- src/cutadapt/config.vsh.yaml | 16 +-- src/cutadapt/script.sh | 243 +++++++++++++++++++++++++++++------ 2 files changed, 209 insertions(+), 50 deletions(-) diff --git a/src/cutadapt/config.vsh.yaml b/src/cutadapt/config.vsh.yaml index 72b0a7fc..0b0573bf 100644 --- a/src/cutadapt/config.vsh.yaml +++ b/src/cutadapt/config.vsh.yaml @@ -52,7 +52,7 @@ functionality: - name: Specify Adapters using Fasta files for R1 arguments: - name: --adapter_fasta - type: string + type: file description: | Fasta file containing sequences of an adapter ligated to the 3' end (paired data: of the first read). The adapter and subsequent bases are @@ -60,7 +60,7 @@ functionality: adapter is only found if it is a suffix of the read. required: false - name: --front_fasta - type: string + type: file description: | Fasta file containing sequences of an adapter ligated to the 5' end (paired data: of the first read). The adapter and any preceding bases @@ -69,7 +69,7 @@ functionality: only found if it is a prefix of the read. required: false - name: --anywhere_fasta - type: string + type: file description: | Fasta file containing sequences of an adapter that may be ligated to the 5' or 3' end (paired data: of the first read). Both types of @@ -122,7 +122,7 @@ functionality: - name: Specify Adapters using Fasta files for R2 arguments: - name: --adapterR2_fasta - type: string + type: file description: | Fasta file containing sequences of an adapter ligated to the 3' end (paired data: of the first read). The adapter and subsequent bases are @@ -130,7 +130,7 @@ functionality: adapter is only found if it is a suffix of the read. required: false - name: --frontR2_fasta - type: string + type: file description: | Fasta file containing sequences of an adapter ligated to the 5' end (paired data: of the first read). The adapter and any preceding bases @@ -139,7 +139,7 @@ functionality: only found if it is a prefix of the read. required: false - name: --anywhereR2_fasta - type: string + type: file description: | Fasta file containing sequences of an adapter that may be ligated to the 5' or 3' end (paired data: of the first read). Both types of @@ -200,11 +200,11 @@ functionality: adapter to be found. default: 3 - name: --match_read_wildcards - type: boolean_false + type: boolean_true description: | Interpret IUPAC wildcards in reads. - name: --no_match_adapter_wildcards - type: boolean_true + type: boolean_false description: | Do not interpret IUPAC wildcards in adapters. - name: --action diff --git a/src/cutadapt/script.sh b/src/cutadapt/script.sh index aba0cfca..47ed61f1 100644 --- a/src/cutadapt/script.sh +++ b/src/cutadapt/script.sh @@ -6,18 +6,6 @@ else mkdir -p "$par_output" fi -echo "par_adapter: $par_adapter" -echo "par_front: $par_front" -echo "par_anywhere: $par_anywhere" -echo "par_adapter_fasta: $par_adapter_fasta" -echo "par_front_fasta: $par_front_fasta" -echo "par_anywhere_fasta: $par_anywhere_fasta" -echo "par_adapterR2: $par_adapterR2" -echo "par_frontR2: $par_frontR2" -echo "par_anywhereR2: $par_anywhereR2" -echo "par_adapterR2_fasta: $par_adapterR2_fasta" -echo "par_frontR2_fasta: $par_frontR2_fasta" -echo "par_anywhereR2_fasta: $par_anywhereR2_fasta" echo "par_pair_adapters: $par_pair_adapters" echo "par_pair_filter: $par_pair_filter" echo "par_interleaved: $par_interleaved" @@ -58,35 +46,206 @@ echo "par_output: $par_output" echo "par_fasta: $par_fasta" echo "par_info_file: $par_info_file" -# for f in $par_input; do -# [ ! -f "$f" ] && echo "The input file $f does not exist" && exit 1 -# done -# -# barcodesFasta="barcodes.fasta" -# -# awk '{print ">"$1"\n""^"$1}' $par_barcodesFile >$barcodesFasta -# -# fastqFiles=$(echo $par_input | tr " " "\n") -# for file in $fastqFiles; do -# if echo "$file" | grep -q R1; then -# input_R1=$(echo $file | grep R1) -# fi -# if echo "$file" | grep -q R2; then -# input_R2=$(echo $file | grep R2) -# fi -# done -# demuxFilesIn="$input_R1 $input_R2" -# -# # Note to self: -# # The eval is here to expand shell globs, this way it is possible to use -# # for instance pointers to ".../...R?....fastq", but please use the double -# # quotes and an absolute path! -# eval /usr/local/bin/cutadapt \ -# -e "$par_e" \ -# --no-indels \ -# --action=none \ -# --cores=0 \ -# -g "file:$barcodesFasta" \ +# Do we get explicit adapter sequences or a FASTA file? +# Let the underlying tool deal with inconsistant states. +adapter_mode_R1="" +if [ ! -z "${par_adapter_fasta+set}" ]; then + adapter_mode_R1="fasta" +else + adapter_mode_R1="plain" +fi + +front_mode_R1="" +if [ ! -z "${par_front_fasta+set}" ]; then + front_mode_R1="fasta" +else + front_mode_R1="plain" +fi + +anywhere_mode_R1="" +if [ ! -z "${par_anywhere_fasta+set}" ]; then + anywhere_mode_R1="fasta" +else + anywhere_mode_R1="plain" +fi + +adapter_mode_R2="" +if [ ! -z "${par_adapter_fastaR2+set}" ]; then + adapter_mode_R2="fasta" +else + adapter_mode_R2="plain" +fi + +front_mode_R2="" +if [ ! -z "${par_front_fastaR2+set}" ]; then + front_mode_R2="fasta" +else + front_mode_R2="plain" +fi + +anywhere_mode_R2="" +if [ ! -z "${par_anywhere_fastaR2+set}" ]; then + anywhere_mode_R2="fasta" +else + anywhere_mode_R2="plain" +fi + +echo "Running cutadapt" +echo +echo "Adapter settings" +echo "----------------" +echo "Adapter Mode R1 : $adapter_mode_R1" +echo "Front Mode R1 : $front_mode_R1" +echo "Anywhere Mode R1 : $anywhere_mode_R1" +echo "Adapter Mode R2 : $adapter_mode_R2" +echo "Front Mode R2 : $front_mode_R2" +echo "Anywhere Mode R2 : $anywhere_mode_R2" +echo + +# Adapter arguments +# - paired and single-end +# - string and fasta +########################################################### +echo ">> Parsing arguments dealing with adapters" +adapter_args=$(echo \ + ${par_adapter:+--adapter "${par_adapter}"} \ + ${par_adapter_fasta:+--adapter "file:${par_adapter_fasta}"} \ + ${par_front:+--front "${par_front}"} \ + ${par_front_fasta:+--front "file:${par_front_fasta}"} \ + ${par_anywhere:+--anywhere "${par_anywhere}"} \ + ${par_anywhere_fasta:+--anywhere "file:${par_anywhere_fasta}"} \ + ${par_adapterR2:+--adapterR2 "${par_adapterR2}"} \ + ${par_adapterR2_fasta:+--adapterR2 "file:${par_adapterR2_fasta}"} \ + ${par_frontR2:+--frontR2 "${par_frontR2}"} \ + ${par_frontR2_fasta:+--frontR2 "file:${par_frontR2_fasta}"} \ + ${par_anywhereR2:+--anywhereR2 "${par_anywhereR2}"} \ + ${par_anywhereR2_fasta:+--anywhereR2 "file:${par_anywhereR2_fasta}"} +) +echo "Arguments to cutadapt:" +echo "$adapter_args" +echo + +# Paired-end options +########################################################### +echo ">> Parsing arguments for paired-end reads" +[[ "$par_pair_adapters" == "false" ]] && unset par_pair_adapters +[[ "$par_interleaved" == "false" ]] && unset par_interleaved + +paired_args=$(echo \ + ${par_pair_adapters:+--pair-adapters} \ + ${par_pair_filter:+--pair-filter "${par_pair_filter}"} \ + ${par_interleaved:+--interleaved} \ + +) +echo "Arguments to cutadapt:" +echo $paired_args +echo + +# Input arguments +########################################################### +echo ">> Parsing input arguments" +[[ "$par_no_indels" == "true" ]] && unset par_no_indels +[[ "$par_match_read_wildcards" == "false" ]] && unset par_match_read_wildcards +[[ "$par_no_match_adapter_wildcards" == "true" ]] && unset par_no_match_adapter_wildcards +[[ "$par_revcomp" == "false" ]] && unset par_revcomp + +input_args=$(echo \ + ${par_error_rate:+-error-rate "${par_error_rate}"} \ + ${par_no_indels:+--no-indels} \ + ${par_times:+--times "${par_times}"} \ + ${par_overlap:+--overlap "${par_overlap}"} \ + ${par_match_read_wildcards:+--match-read-wildcards} \ + ${par_no_match_adapter_wildcards:+--no-match-adapter-wildcards} \ + ${par_action:+--action "${par_action}"} \ + ${par_revcomp:+--revcomp} \ +) +echo "Arguments to cutadapt:" +echo $input_args +echo + +# Read modifications +########################################################### +echo ">> Parsing read modification arguments" +[[ "$par_poly_a" == "false" ]] && unset par_poly_a +[[ "$par_trim_n" == "false" ]] && unset par_trim_n +[[ "$par_zero_cap" == "false" ]] && unset par_zero_cap + +mod_args=$(echo \ + ${par_cut:+--cut "${par_cut}"} \ + ${par_cutR2:+--cutR2 "${par_cutR2}"} \ + ${par_nextseq_trim:+--nextseq-trim "${par_nextseq_trim}"} \ + ${par_quality_cutoff:+--quality-cutoff "${par_quality_cutoff}"} \ + ${par_quality_cutoffR2:+--quality-cutoffR2 "${par_quality_cutoffR2}"} \ + ${par_quality_base:+--quality-base "${par_quality_base}"} \ + ${par_poly_a:+--poly-a} \ + ${par_length:+--length "${par_length}"} \ + ${par_trim_n:+--trim-n} \ + ${par_length_tag:+--length-tag "${par_length_tag}"} \ + ${par_strip_suffix:+--strip-suffix "${par_strip_suffix}"} \ + ${par_prefix:+--prefix "${par_prefix}"} \ + ${par_suffix:+--suffix "${par_suffix}"} \ + ${par_rename:+--rename "${par_rename}"} \ + ${par_zero_cap:+--zero-cap} \ +) +echo "Arguments to cutadapt:" +echo $mod_args +echo + +# Filtering of processed reads arguments +########################################################### +echo ">> Filtering of processed reads arguments" +[[ "$par_discard_trimmed" == "false" ]] && unset par_discard_trimmed +[[ "$par_discard_untrimmed" == "false" ]] && unset par_discard_untrimmed +[[ "$par_discard_casava" == "false" ]] && unset par_discard_casava + +filter_args=$(echo \ + ${par_minimum_length:+--minimum-length "${par_minimum_length}"} \ + ${par_maximum_length:+--maximum-length "${par_maximum_length}"} \ + ${par_max_n:+--max-n "${par_max_n}"} \ + ${par_max_expected_errors:+--max-expected-errors "${par_max_expected_errors}"} \ + ${par_max_average_error_rate:+--max-average-error-rate "${par_max_average_error_rate}"} \ + ${par_discard_trimmed:+--discard-trimmed} \ + ${par_discard_untrimmed:+--discard-untrimmed} \ + ${par_discard_casava:+--discard-casava} \ +) +echo "Arguments to cutadapt:" +echo $filter_args +echo + +# Output arguments +# We write the output to a directory rather than +# individual files. +########################################################### +echo ">> Output arguments" +[[ "$par_json" == "false" ]] && unset par_json +[[ "$par_fasta" == "false" ]] && unset par_fasta +[[ "$par_info_file" == "false" ]] && unset par_info_file + # -o "$par_outputDir/{name}_R1_001.fastq" \ # -p "$par_outputDir/{name}_R2_001.fastq" \ -# "$demuxFilesIn" >"$par_report" + +output_args=$(echo \ + ${par_report:+--report "${par_report}"} \ + ${par_json:+--json} \ + -o "$par_output/{name}_R1_001.fastq" \ + -p "$par_output/{name}_R1_001.fastq" \ + ${par_fasta:+--fasta} \ + ${par_info_file:+--info-file} \ +) +echo "Arguments to cutadapt:" +echo $output_args +echo + +echo ">> Full CLI to be run:" +cli=$(echo "cutadapt" \ + $adapter_args \ + $paired_args \ + $input_args \ + $mod_args \ + $filter_args \ + $output_args +) + +echo $cli + +# $( "$cli" ) > $par_output/report.txt From b93f95cee8c6d6a3a66e52d217eb1f444e2411ca Mon Sep 17 00:00:00 2001 From: Toni Verbeiren Date: Wed, 7 Feb 2024 15:47:15 +0100 Subject: [PATCH 05/32] Updates and fixes - se/pe --- src/cutadapt/config.vsh.yaml | 11 ++- src/cutadapt/script.sh | 144 +++++++++-------------------------- 2 files changed, 43 insertions(+), 112 deletions(-) diff --git a/src/cutadapt/config.vsh.yaml b/src/cutadapt/config.vsh.yaml index 0b0573bf..6aa30f38 100644 --- a/src/cutadapt/config.vsh.yaml +++ b/src/cutadapt/config.vsh.yaml @@ -164,7 +164,6 @@ functionality: description: | Which of the reads in a paired-end read have to match the filtering criterion in order for the pair to be filtered. - default: any - name: --interleaved type: boolean_true description: | @@ -173,6 +172,12 @@ functionality: #################################################################### - name: Input parameters arguments: + - name: --input + type: file + multiple: true + required: true + description: | + Input fastq files. Paired reads are delimited with a space. - name: --error_rate alternatives: [-E, --errors] type: double @@ -329,7 +334,7 @@ functionality: - name: Filtering of processed reads description: | Filters are applied after above read modifications. Paired-end reads are - always discarded pairwise (see also --pair-filter). + always discarded pairwise (see also --pair_filter). arguments: - name: --minimum_length alternatives: [-m] @@ -397,7 +402,7 @@ functionality: Write trimmed reads to this directory and name the files using {name}. FASTQ or FASTA format is chosen depending on input. Summary report is sent to standard output. - default: output/ + default: output direction: output required: true must_exist: true diff --git a/src/cutadapt/script.sh b/src/cutadapt/script.sh index 47ed61f1..c73d2d9a 100644 --- a/src/cutadapt/script.sh +++ b/src/cutadapt/script.sh @@ -6,101 +6,20 @@ else mkdir -p "$par_output" fi -echo "par_pair_adapters: $par_pair_adapters" -echo "par_pair_filter: $par_pair_filter" -echo "par_interleaved: $par_interleaved" -echo "par_error_rate: $par_error_rate" -echo "par_no_indels: $par_no_indels" -echo "par_times: $par_times" -echo "par_overlap: $par_overlap" -echo "par_match_read_wildcards: $par_match_read_wildcards" -echo "no_match_adapter_wildcards: $no_match_adapter_wildcards" -echo "par_action: $par_action" -echo "par_revcomp: $par_revcomp" -echo "par_cut: $par_cut" -echo "par_cutR2: $par_cutR2" -echo "par_nextseq_trim: $par_nextseq_trim" -echo "par_quality_cutoff: $par_quality_cutoff" -echo "par_quality_cutoffR2: $par_quality_cutoffR2" -echo "par_quality_base: $par_quality_base" -echo "par_poly_a: $par_poly_a" -echo "par_length: $par_length" -echo "par_trim_n: $par_trim_n" -echo "par_length_tag: $par_length_tag" -echo "par_strip_suffix: $par_strip_suffix" -echo "par_prefix: $par_prefix" -echo "par_suffix: $par_suffix" -echo "par_rename: $par_rename" -echo "par_zero_cap: $par_zero_cap" -echo "par_minimum_length: $par_minimum_length" -echo "par_maximum_length: $par_maximum_length" -echo "par_max_n: $par_max_n" -echo "par_max_expected_errors: $par_max_expected_errors" -echo "par_max_average_error_rate: $par_max_average_error_rate" -echo "par_discard_trimmed: $par_discard_trimmed" -echo "par_discard_untrimmed: $par_discard_untrimmed" -echo "par_discard_casava: $par_discard_casava" -echo "par_report: $par_report" -echo "par_json: $par_json" -echo "par_output: $par_output" -echo "par_fasta: $par_fasta" -echo "par_info_file: $par_info_file" - -# Do we get explicit adapter sequences or a FASTA file? -# Let the underlying tool deal with inconsistant states. -adapter_mode_R1="" -if [ ! -z "${par_adapter_fasta+set}" ]; then - adapter_mode_R1="fasta" -else - adapter_mode_R1="plain" -fi - -front_mode_R1="" -if [ ! -z "${par_front_fasta+set}" ]; then - front_mode_R1="fasta" -else - front_mode_R1="plain" -fi - -anywhere_mode_R1="" -if [ ! -z "${par_anywhere_fasta+set}" ]; then - anywhere_mode_R1="fasta" -else - anywhere_mode_R1="plain" -fi - -adapter_mode_R2="" -if [ ! -z "${par_adapter_fastaR2+set}" ]; then - adapter_mode_R2="fasta" -else - adapter_mode_R2="plain" -fi - -front_mode_R2="" -if [ ! -z "${par_front_fastaR2+set}" ]; then - front_mode_R2="fasta" -else - front_mode_R2="plain" -fi - -anywhere_mode_R2="" -if [ ! -z "${par_anywhere_fastaR2+set}" ]; then - anywhere_mode_R2="fasta" -else - anywhere_mode_R2="plain" -fi +# Init +########################################################### echo "Running cutadapt" echo -echo "Adapter settings" -echo "----------------" -echo "Adapter Mode R1 : $adapter_mode_R1" -echo "Front Mode R1 : $front_mode_R1" -echo "Anywhere Mode R1 : $anywhere_mode_R1" -echo "Adapter Mode R2 : $adapter_mode_R2" -echo "Front Mode R2 : $front_mode_R2" -echo "Anywhere Mode R2 : $anywhere_mode_R2" -echo +echo ">> Paired-end data or not?" +IFS=':' read -a inputs <<< "$par_input" +input=$(echo $par_input | tr ':' ' ') + +nr_inputs="${#inputs[@]}" + +[[ $nr_inputs = 1 ]] && echo " Single end" && mode="se" +[[ $nr_inputs = 2 ]] && echo " Paired end" && mode="pe" +[[ $nr_inputs = 3 ]] && echo " Too much input !!!" && exit 1 # Adapter arguments # - paired and single-end @@ -134,8 +53,7 @@ echo ">> Parsing arguments for paired-end reads" paired_args=$(echo \ ${par_pair_adapters:+--pair-adapters} \ ${par_pair_filter:+--pair-filter "${par_pair_filter}"} \ - ${par_interleaved:+--interleaved} \ - + ${par_interleaved:+--interleaved} ) echo "Arguments to cutadapt:" echo $paired_args @@ -150,7 +68,7 @@ echo ">> Parsing input arguments" [[ "$par_revcomp" == "false" ]] && unset par_revcomp input_args=$(echo \ - ${par_error_rate:+-error-rate "${par_error_rate}"} \ + ${par_error_rate:+--error-rate "${par_error_rate}"} \ ${par_no_indels:+--no-indels} \ ${par_times:+--times "${par_times}"} \ ${par_overlap:+--overlap "${par_overlap}"} \ @@ -221,23 +139,31 @@ echo ">> Output arguments" [[ "$par_fasta" == "false" ]] && unset par_fasta [[ "$par_info_file" == "false" ]] && unset par_info_file -# -o "$par_outputDir/{name}_R1_001.fastq" \ -# -p "$par_outputDir/{name}_R2_001.fastq" \ - -output_args=$(echo \ - ${par_report:+--report "${par_report}"} \ - ${par_json:+--json} \ - -o "$par_output/{name}_R1_001.fastq" \ - -p "$par_output/{name}_R1_001.fastq" \ - ${par_fasta:+--fasta} \ - ${par_info_file:+--info-file} \ -) +if [ $mode = "se" ]; then + output_args=$(echo \ + ${par_report:+--report "${par_report}"} \ + ${par_json:+--json} \ + --output "$par_output/{name}_R1_001.fastq" \ + ${par_fasta:+--fasta} \ + ${par_info_file:+--info-file} \ + ) +else + output_args=$(echo \ + ${par_report:+--report "${par_report}"} \ + ${par_json:+--json} \ + --output "$par_output/{name}_R1_001.fastq" \ + --paired-output "$par_output/{name}_R2_001.fastq" \ + ${par_fasta:+--fasta} \ + ${par_info_file:+--info-file} \ + ) +fi echo "Arguments to cutadapt:" echo $output_args echo echo ">> Full CLI to be run:" -cli=$(echo "cutadapt" \ +cli=$(echo \ + $input \ $adapter_args \ $paired_args \ $input_args \ @@ -246,6 +172,6 @@ cli=$(echo "cutadapt" \ $output_args ) -echo $cli +echo cutadapt $cli | sed -e 's/--/\r\n --/g' -# $( "$cli" ) > $par_output/report.txt +cutadapt $cli | tee $par_output/report.txt From ac6e1c25ee0182e8b3a0e39578cf75eb041c1e19 Mon Sep 17 00:00:00 2001 From: Toni Verbeiren Date: Fri, 9 Feb 2024 17:42:41 +0100 Subject: [PATCH 06/32] Add tests and fix --json argument --- src/cutadapt/config.vsh.yaml | 5 +++ src/cutadapt/script.sh | 4 +-- src/cutadapt/test.sh | 55 +++++++++++++++++++++++++++++ src/cutadapt/test_data/pe/a.1.fastq | 4 +++ src/cutadapt/test_data/pe/a.2.fastq | 4 +++ src/cutadapt/test_data/script.sh | 15 ++++++++ src/cutadapt/test_data/se/a.fastq | 4 +++ 7 files changed, 89 insertions(+), 2 deletions(-) create mode 100644 src/cutadapt/test.sh create mode 100644 src/cutadapt/test_data/pe/a.1.fastq create mode 100644 src/cutadapt/test_data/pe/a.2.fastq create mode 100755 src/cutadapt/test_data/script.sh create mode 100644 src/cutadapt/test_data/se/a.fastq diff --git a/src/cutadapt/config.vsh.yaml b/src/cutadapt/config.vsh.yaml index 6aa30f38..cf2f800a 100644 --- a/src/cutadapt/config.vsh.yaml +++ b/src/cutadapt/config.vsh.yaml @@ -428,6 +428,11 @@ functionality: resources: - type: bash_script path: script.sh + test_resources: + - type: bash_script + path: test.sh + - type: file + path: test_data platforms: - type: docker image: python:3.8 diff --git a/src/cutadapt/script.sh b/src/cutadapt/script.sh index c73d2d9a..ef99ade4 100644 --- a/src/cutadapt/script.sh +++ b/src/cutadapt/script.sh @@ -142,7 +142,7 @@ echo ">> Output arguments" if [ $mode = "se" ]; then output_args=$(echo \ ${par_report:+--report "${par_report}"} \ - ${par_json:+--json} \ + ${par_json:+--json "${par_output}/report.json"} \ --output "$par_output/{name}_R1_001.fastq" \ ${par_fasta:+--fasta} \ ${par_info_file:+--info-file} \ @@ -150,7 +150,7 @@ if [ $mode = "se" ]; then else output_args=$(echo \ ${par_report:+--report "${par_report}"} \ - ${par_json:+--json} \ + ${par_json:+--json "${par_output}/report.json"} \ --output "$par_output/{name}_R1_001.fastq" \ --paired-output "$par_output/{name}_R2_001.fastq" \ ${par_fasta:+--fasta} \ diff --git a/src/cutadapt/test.sh b/src/cutadapt/test.sh new file mode 100644 index 00000000..0d362ff1 --- /dev/null +++ b/src/cutadapt/test.sh @@ -0,0 +1,55 @@ +#!/bin/bash + +set -e + +dir_in="$meta_resources_dir/test_data" + +echo "> Run cutadapt on single-end data" +"$meta_executable" \ + --report minimal \ + --output output-dir \ + --adapter AGATCGGAAGAGCACACGTCTGAACTCCAGTCAC \ + --input $dir_in/se/a.fastq \ + --quality_cutoff 20 \ + --json + +echo ">> Checking output" +[ ! -f "output-dir/report.txt" ] && echo "report.txt does not exist" && exit 1 +[ ! -f "output-dir/report.json" ] && echo "report.json does not exist" && exit 1 +[ ! -f "output-dir/1_R1_001.fastq" ] && echo "1_R1_001.fastq does not exist" && exit 1 +[ ! -f "output-dir/unknown_R1_001.fastq" ] && echo "1_R1_001.fastq does not exist" && exit 1 + +echo ">> Check if output is empty" +[ -s "output-dir/1_R1_001.fastq" ] && echo "1_R1_001.fastq should be empty" && exit 1 +[ ! -s "output-dir/unknown_R1_001.fastq" ] && echo "unkown_R1_001.fastq is empty" && exit 1 + +rm -r output-dir + +echo "> Run cutadapt on paired-end data" +"$meta_executable" \ + --report minimal \ + --output output-dir \ + --adapter AGATCGGAAGAGCACACGTCTGAACTCCAGTCAC \ + --input $dir_in/pe/a.1.fastq \ + --input $dir_in/pe/a.2.fastq \ + --quality_cutoff 20 \ + --json + +echo ">> Checking output" +[ ! -f "output-dir/report.txt" ] && echo "report.txt does not exist" && exit 1 +[ ! -f "output-dir/report.json" ] && echo "report.json does not exist" && exit 1 +[ ! -f "output-dir/1_R1_001.fastq" ] && echo "1_R1_001.fastq does not exist" && exit 1 +[ ! -f "output-dir/1_R2_001.fastq" ] && echo "1_R1_001.fastq does not exist" && exit 1 +[ ! -f "output-dir/unknown_R1_001.fastq" ] && echo "1_R1_001.fastq does not exist" && exit 1 +[ ! -f "output-dir/unknown_R2_001.fastq" ] && echo "1_R1_001.fastq does not exist" && exit 1 + +echo ">> Check if output is empty" +[ -s "output-dir/1_R1_001.fastq" ] && echo "1_R1_001.fastq should be empty" && exit 1 +[ -s "output-dir/1_R2_001.fastq" ] && echo "1_R2_001.fastq should be empty" && exit 1 +[ ! -s "output-dir/unknown_R1_001.fastq" ] && echo "unkown_R1_001.fastq is empty" && exit 1 +[ ! -s "output-dir/unknown_R2_001.fastq" ] && echo "unkown_R2_001.fastq is empty" && exit 1 + +rm -r output-dir + +echo "> Test successful" + diff --git a/src/cutadapt/test_data/pe/a.1.fastq b/src/cutadapt/test_data/pe/a.1.fastq new file mode 100644 index 00000000..42735560 --- /dev/null +++ b/src/cutadapt/test_data/pe/a.1.fastq @@ -0,0 +1,4 @@ +@1 +ACGGCAT ++ +!!!!!!! diff --git a/src/cutadapt/test_data/pe/a.2.fastq b/src/cutadapt/test_data/pe/a.2.fastq new file mode 100644 index 00000000..42735560 --- /dev/null +++ b/src/cutadapt/test_data/pe/a.2.fastq @@ -0,0 +1,4 @@ +@1 +ACGGCAT ++ +!!!!!!! diff --git a/src/cutadapt/test_data/script.sh b/src/cutadapt/test_data/script.sh new file mode 100755 index 00000000..87dc085e --- /dev/null +++ b/src/cutadapt/test_data/script.sh @@ -0,0 +1,15 @@ +# cutadapt test data + +# Test data was obtained from https://github.com/snakemake/snakemake-wrappers/tree/master/bio/cutadapt/test + +if [ ! -d /tmp/snakemake-wrappers ]; then + git clone --depth 1 --single-branch --branch master https://github.com/snakemake/snakemake-wrappers /tmp/snakemake-wrappers +fi + +mkdir -p src/cutadapt/test_data/pe +mkdir src/cutadapt/test_data/se + +cp -r /tmp/snakemake-wrappers/bio/cutadapt/se/test/reads/* src/cutadapt/test_data/se +cp -r /tmp/snakemake-wrappers/bio/cutadapt/pe/test/reads/* src/cutadapt/test_data/pe + +rm -rf /tmp/snakemake-wrappers diff --git a/src/cutadapt/test_data/se/a.fastq b/src/cutadapt/test_data/se/a.fastq new file mode 100644 index 00000000..42735560 --- /dev/null +++ b/src/cutadapt/test_data/se/a.fastq @@ -0,0 +1,4 @@ +@1 +ACGGCAT ++ +!!!!!!! From e68a03856de8a4b744c6ca89559bed6c6cf7ced5 Mon Sep 17 00:00:00 2001 From: Toni Verbeiren Date: Fri, 9 Feb 2024 17:47:52 +0100 Subject: [PATCH 07/32] Add software version --- src/cutadapt/config.vsh.yaml | 3 +++ 1 file changed, 3 insertions(+) diff --git a/src/cutadapt/config.vsh.yaml b/src/cutadapt/config.vsh.yaml index cf2f800a..98f59be2 100644 --- a/src/cutadapt/config.vsh.yaml +++ b/src/cutadapt/config.vsh.yaml @@ -440,4 +440,7 @@ platforms: - type: python pip: - cutadapt + - type: docker + run: | + cutadapt --version | sed 's/\(.*\)/cutadapt: "\1"/' > /var/software_versions.txt - type: nextflow From b2d39140812855ec643ec8ee97d863adc1a214fb Mon Sep 17 00:00:00 2001 From: Toni Verbeiren Date: Tue, 13 Feb 2024 17:01:19 +0100 Subject: [PATCH 08/32] Better consistency in using snake_case --- src/cutadapt/config.vsh.yaml | 18 +++++++++--------- src/cutadapt/script.sh | 16 ++++++++-------- 2 files changed, 17 insertions(+), 17 deletions(-) diff --git a/src/cutadapt/config.vsh.yaml b/src/cutadapt/config.vsh.yaml index 98f59be2..ebfe5a4a 100644 --- a/src/cutadapt/config.vsh.yaml +++ b/src/cutadapt/config.vsh.yaml @@ -83,7 +83,7 @@ functionality: #################################################################### - name: Specify Adapters for R2 arguments: - - name: --adapterR2 + - name: --adapter_r2 alternatives: [-A] type: string multiple: true @@ -93,7 +93,7 @@ functionality: trimmed. If a '$' character is appended ('anchoring'), the adapter is only found if it is a suffix of the read. required: false - - name: --frontR2 + - name: --front_r2 alternatives: [-G] type: string multiple: true @@ -104,7 +104,7 @@ functionality: a '^' character is prepended ('anchoring'), the adapter is only found if it is a prefix of the read. required: false - - name: --anywhereR2 + - name: --anywhere_r2 alternatives: [-B] type: string multiple: true @@ -121,7 +121,7 @@ functionality: #################################################################### - name: Specify Adapters using Fasta files for R2 arguments: - - name: --adapterR2_fasta + - name: --adapter_r2_fasta type: file description: | Fasta file containing sequences of an adapter ligated to the 3' end (paired data: @@ -129,7 +129,7 @@ functionality: trimmed. If a '$' character is appended ('anchoring'), the adapter is only found if it is a suffix of the read. required: false - - name: --frontR2_fasta + - name: --front_r2_fasta type: file description: | Fasta file containing sequences of an adapter ligated to the 5' end (paired data: @@ -138,7 +138,7 @@ functionality: a '^' character is prepended ('anchoring'), the adapter is only found if it is a prefix of the read. required: false - - name: --anywhereR2_fasta + - name: --anywhere_r2_fasta type: file description: | Fasta file containing sequences of an adapter that may be ligated to the 5' or 3' @@ -242,12 +242,12 @@ functionality: type: integer multiple: true description: | - Remove LEN bases from each read (or R1 if paired; use --cutR2 + Remove LEN bases from each read (or R1 if paired; use --cut_r2 option for R2). If LEN is positive, remove bases from the beginning. If LEN is negative, remove bases from the end. Can be used twice if LENs have different signs. Applied *before* adapter trimming. - - name: --cutR2 + - name: --cut_r2 type: integer multiple: true description: | @@ -269,7 +269,7 @@ functionality: paired. If one value is given, only the 3' end is trimmed. If two comma-separated cutoffs are given, the 5' end is trimmed with the first cutoff, the 3' end with the second. - - name: --quality_cutoffR2 + - name: --quality_cutoff_r2 alternatives: [-Q] type: string description: | diff --git a/src/cutadapt/script.sh b/src/cutadapt/script.sh index ef99ade4..f0879130 100644 --- a/src/cutadapt/script.sh +++ b/src/cutadapt/script.sh @@ -33,12 +33,12 @@ adapter_args=$(echo \ ${par_front_fasta:+--front "file:${par_front_fasta}"} \ ${par_anywhere:+--anywhere "${par_anywhere}"} \ ${par_anywhere_fasta:+--anywhere "file:${par_anywhere_fasta}"} \ - ${par_adapterR2:+--adapterR2 "${par_adapterR2}"} \ - ${par_adapterR2_fasta:+--adapterR2 "file:${par_adapterR2_fasta}"} \ - ${par_frontR2:+--frontR2 "${par_frontR2}"} \ - ${par_frontR2_fasta:+--frontR2 "file:${par_frontR2_fasta}"} \ - ${par_anywhereR2:+--anywhereR2 "${par_anywhereR2}"} \ - ${par_anywhereR2_fasta:+--anywhereR2 "file:${par_anywhereR2_fasta}"} + ${par_adapter_r2:+--adapter_r2 "${par_adapter_r2}"} \ + ${par_adapter_r2_fasta:+--adapter_r2 "file:${par_adapter_r2_fasta}"} \ + ${par_front_r2:+--front_r2 "${par_front_r2}"} \ + ${par_front_r2_fasta:+--front_r2 "file:${par_front_r2_fasta}"} \ + ${par_anywhere_r2:+--anywhere_r2 "${par_anywhere_r2}"} \ + ${par_anywhere_r2_fasta:+--anywhere_r2 "file:${par_anywhere_r2_fasta}"} ) echo "Arguments to cutadapt:" echo "$adapter_args" @@ -90,10 +90,10 @@ echo ">> Parsing read modification arguments" mod_args=$(echo \ ${par_cut:+--cut "${par_cut}"} \ - ${par_cutR2:+--cutR2 "${par_cutR2}"} \ + ${par_cut_r2:+--cut_r2 "${par_cut_r2}"} \ ${par_nextseq_trim:+--nextseq-trim "${par_nextseq_trim}"} \ ${par_quality_cutoff:+--quality-cutoff "${par_quality_cutoff}"} \ - ${par_quality_cutoffR2:+--quality-cutoffR2 "${par_quality_cutoffR2}"} \ + ${par_quality_cutoff_r2:+--quality-cutoff_r2 "${par_quality_cutoff_r2}"} \ ${par_quality_base:+--quality-base "${par_quality_base}"} \ ${par_poly_a:+--poly-a} \ ${par_length:+--length "${par_length}"} \ From da46e66bdec235429ca28d3b2a1d22cd8c2d21cf Mon Sep 17 00:00:00 2001 From: Toni Verbeiren Date: Tue, 13 Feb 2024 17:13:35 +0100 Subject: [PATCH 09/32] Update src/cutadapt/config.vsh.yaml Co-authored-by: Robrecht Cannoodt --- src/cutadapt/config.vsh.yaml | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/cutadapt/config.vsh.yaml b/src/cutadapt/config.vsh.yaml index ebfe5a4a..bc8a3091 100644 --- a/src/cutadapt/config.vsh.yaml +++ b/src/cutadapt/config.vsh.yaml @@ -185,8 +185,8 @@ functionality: Maximum allowed error rate (if 0 <= E < 1), or absolute number of errors for full-length adapter match (if E is an integer >= 1). Error rate = no. of errors divided by - length of matching region. - default: 0.1 + length of matching region. Default: 0.1 (10%). + example: 0.1 - name: --no_indels type: boolean_false description: | From e714e676d8b18d873e4247a34ede28a884f8f2e9 Mon Sep 17 00:00:00 2001 From: Toni Verbeiren Date: Tue, 13 Feb 2024 17:15:27 +0100 Subject: [PATCH 10/32] Update src/cutadapt/config.vsh.yaml Co-authored-by: Robrecht Cannoodt --- src/cutadapt/config.vsh.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/cutadapt/config.vsh.yaml b/src/cutadapt/config.vsh.yaml index bc8a3091..4ee516cf 100644 --- a/src/cutadapt/config.vsh.yaml +++ b/src/cutadapt/config.vsh.yaml @@ -435,7 +435,7 @@ functionality: path: test_data platforms: - type: docker - image: python:3.8 + image: python:3.12 setup: - type: python pip: From 365215d6dbf81bd592a1df1f98b7cc832a08eff4 Mon Sep 17 00:00:00 2001 From: Toni Verbeiren Date: Tue, 13 Feb 2024 17:16:12 +0100 Subject: [PATCH 11/32] Update src/cutadapt/config.vsh.yaml Co-authored-by: Robrecht Cannoodt --- src/cutadapt/config.vsh.yaml | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/src/cutadapt/config.vsh.yaml b/src/cutadapt/config.vsh.yaml index 4ee516cf..ec241050 100644 --- a/src/cutadapt/config.vsh.yaml +++ b/src/cutadapt/config.vsh.yaml @@ -191,12 +191,13 @@ functionality: type: boolean_false description: | Allow only mismatches in alignments. + - name: --times type: integer alternatives: [-n] description: | - Remove up to COUNT adapters from each read. - default: 1 + Remove up to COUNT adapters from each read. Default: 1. + example: 1 - name: --overlap alternatives: [-O] type: integer From 94ad54dcb751def6f5e0f3fca540bc04334d4ccd Mon Sep 17 00:00:00 2001 From: Toni Verbeiren Date: Tue, 13 Feb 2024 17:11:58 +0100 Subject: [PATCH 12/32] Specify --input and --input_r2 as separate arguments --- src/cutadapt/config.vsh.yaml | 8 ++++++-- src/cutadapt/script.sh | 17 ++++++++++------- src/cutadapt/test.sh | 2 +- 3 files changed, 17 insertions(+), 10 deletions(-) diff --git a/src/cutadapt/config.vsh.yaml b/src/cutadapt/config.vsh.yaml index ec241050..d2b42d36 100644 --- a/src/cutadapt/config.vsh.yaml +++ b/src/cutadapt/config.vsh.yaml @@ -174,10 +174,14 @@ functionality: arguments: - name: --input type: file - multiple: true required: true description: | - Input fastq files. Paired reads are delimited with a space. + Input fastq file for single-end reads or R1 for paired-end reads. + - name: --input_r2 + type: file + required: false + description: | + Input fastq file for R2 in the case of paired-end reads. - name: --error_rate alternatives: [-E, --errors] type: double diff --git a/src/cutadapt/script.sh b/src/cutadapt/script.sh index f0879130..8ebb9bd9 100644 --- a/src/cutadapt/script.sh +++ b/src/cutadapt/script.sh @@ -12,14 +12,17 @@ fi echo "Running cutadapt" echo echo ">> Paired-end data or not?" -IFS=':' read -a inputs <<< "$par_input" -input=$(echo $par_input | tr ':' ' ') -nr_inputs="${#inputs[@]}" - -[[ $nr_inputs = 1 ]] && echo " Single end" && mode="se" -[[ $nr_inputs = 2 ]] && echo " Paired end" && mode="pe" -[[ $nr_inputs = 3 ]] && echo " Too much input !!!" && exit 1 +mode="" +if [[ -z $par_input_r2 ]]; then + mode="se" + echo " Single end" + input="$par_input" +else + echo " Paired end" + mode="pe" + input="$par_input $par_input_r2" +fi # Adapter arguments # - paired and single-end diff --git a/src/cutadapt/test.sh b/src/cutadapt/test.sh index 0d362ff1..cf342ca1 100644 --- a/src/cutadapt/test.sh +++ b/src/cutadapt/test.sh @@ -31,7 +31,7 @@ echo "> Run cutadapt on paired-end data" --output output-dir \ --adapter AGATCGGAAGAGCACACGTCTGAACTCCAGTCAC \ --input $dir_in/pe/a.1.fastq \ - --input $dir_in/pe/a.2.fastq \ + --input_r2 $dir_in/pe/a.2.fastq \ --quality_cutoff 20 \ --json From d5af57e19551cf9358ea8e628910e41b0aa94ce6 Mon Sep 17 00:00:00 2001 From: Toni Verbeiren Date: Tue, 13 Feb 2024 17:21:54 +0100 Subject: [PATCH 13/32] Avoid specifying default arg values --- src/cutadapt/config.vsh.yaml | 19 ++++++++++--------- 1 file changed, 10 insertions(+), 9 deletions(-) diff --git a/src/cutadapt/config.vsh.yaml b/src/cutadapt/config.vsh.yaml index d2b42d36..4204362a 100644 --- a/src/cutadapt/config.vsh.yaml +++ b/src/cutadapt/config.vsh.yaml @@ -207,8 +207,8 @@ functionality: type: integer description: | Require MINLENGTH overlap between read and adapter for an - adapter to be found. - default: 3 + adapter to be found. The default is 3. + example: 3 - name: --match_read_wildcards type: boolean_true description: | @@ -230,7 +230,8 @@ functionality: up- or downstream sequence; retain: trim, but retain adapter; mask: replace with 'N' characters; lowercase: convert to lowercase; none: leave unchanged. - default: trim + The default is trim. + example: trim - name: --revcomp alternatives: [--rc] type: boolean_true @@ -284,8 +285,8 @@ functionality: description: | Assume that quality values in FASTQ are encoded as ascii(quality + N). This needs to be set to 64 for some - old Illumina FASTQ files. - default: 33 + old Illumina FASTQ files. The default is 33. + example: 33 - name: --poly_a type: boolean_true description: Trim poly-A tails @@ -345,8 +346,8 @@ functionality: alternatives: [-m] type: string description: | - Discard reads shorter than LEN. - default: "0" + Discard reads shorter than LEN. Default is 0. + example: "0" - name: --maximum_length alternatives: [-M] type: string @@ -394,8 +395,8 @@ functionality: type: string choices: [full, minimal] description: | - Which type of report to print: 'full' or 'minimal'. - default: full + Which type of report to print: 'full' (default) or 'minimal'. + example: full - name: --json type: boolean_true description: | From 8b6aa30af2edbd61ba26e0f547a548b56786ed5d Mon Sep 17 00:00:00 2001 From: Toni Verbeiren Date: Fri, 16 Feb 2024 10:46:29 +0100 Subject: [PATCH 14/32] Add more information to `--minimum_length` and `maximum_length` --- src/cutadapt/config.vsh.yaml | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/src/cutadapt/config.vsh.yaml b/src/cutadapt/config.vsh.yaml index 4204362a..7d63206d 100644 --- a/src/cutadapt/config.vsh.yaml +++ b/src/cutadapt/config.vsh.yaml @@ -347,12 +347,17 @@ functionality: type: string description: | Discard reads shorter than LEN. Default is 0. + When trimming paired-end reads, the minimum lengths for R1 and R2 can be specified separately by separating them with a colon (:). + If the colon syntax is not used, the same minimum length applies to both reads, as discussed above. + Also, one of the values can be omitted to impose no restrictions. + For example, with -m 17:, the length of R1 must be at least 17, but the length of R2 is ignored. example: "0" - name: --maximum_length alternatives: [-M] type: string description: | - Discard reads longer than LEN. Default: no limit + Discard reads longer than LEN. Default: no limit. + For paired reads, see the remark for --minimum_length - name: --max_n type: string description: | From 99b4f0139bfb0584ca8da586a985d6a16ae5e1af Mon Sep 17 00:00:00 2001 From: Toni Verbeiren Date: Fri, 16 Feb 2024 11:07:07 +0100 Subject: [PATCH 15/32] Add --cpus by means of $meta_cpus and set proper default --- src/cutadapt/script.sh | 13 ++++++++++++- src/cutadapt/test.sh | 3 ++- 2 files changed, 14 insertions(+), 2 deletions(-) diff --git a/src/cutadapt/script.sh b/src/cutadapt/script.sh index 8ebb9bd9..93a3fa5e 100644 --- a/src/cutadapt/script.sh +++ b/src/cutadapt/script.sh @@ -119,6 +119,9 @@ echo ">> Filtering of processed reads arguments" [[ "$par_discard_untrimmed" == "false" ]] && unset par_discard_untrimmed [[ "$par_discard_casava" == "false" ]] && unset par_discard_casava +# Parse and transform the minimum and maximum length arguments +[[ -z $par_minimum_length ]] + filter_args=$(echo \ ${par_minimum_length:+--minimum-length "${par_minimum_length}"} \ ${par_maximum_length:+--maximum-length "${par_maximum_length}"} \ @@ -164,7 +167,14 @@ echo "Arguments to cutadapt:" echo $output_args echo +# Full CLI +# Set the --cores argument to 0 unless meta_cpus is set +########################################################### echo ">> Full CLI to be run:" + +par_cpus=0 +[[ ! -z $meta_cpus ]] && par_cpus=$meta_cpus + cli=$(echo \ $input \ $adapter_args \ @@ -172,7 +182,8 @@ cli=$(echo \ $input_args \ $mod_args \ $filter_args \ - $output_args + $output_args \ + --cores $par_cpus ) echo cutadapt $cli | sed -e 's/--/\r\n --/g' diff --git a/src/cutadapt/test.sh b/src/cutadapt/test.sh index cf342ca1..77094990 100644 --- a/src/cutadapt/test.sh +++ b/src/cutadapt/test.sh @@ -33,7 +33,8 @@ echo "> Run cutadapt on paired-end data" --input $dir_in/pe/a.1.fastq \ --input_r2 $dir_in/pe/a.2.fastq \ --quality_cutoff 20 \ - --json + --json \ + ---cpus 1 echo ">> Checking output" [ ! -f "output-dir/report.txt" ] && echo "report.txt does not exist" && exit 1 From c864e5e054c6f63d774a1bf0bdb81e96e1b5cf5c Mon Sep 17 00:00:00 2001 From: Toni Verbeiren Date: Fri, 16 Feb 2024 14:17:14 +0100 Subject: [PATCH 16/32] Allow multiple for adapters/fasta and add test --- src/cutadapt/config.vsh.yaml | 1 + src/cutadapt/script.sh | 86 +++++++++++++++++++++++++++----- src/cutadapt/test.sh | 33 ++++++++++++ src/cutadapt/test_data/script.sh | 1 + 4 files changed, 109 insertions(+), 12 deletions(-) diff --git a/src/cutadapt/config.vsh.yaml b/src/cutadapt/config.vsh.yaml index 7d63206d..88636952 100644 --- a/src/cutadapt/config.vsh.yaml +++ b/src/cutadapt/config.vsh.yaml @@ -53,6 +53,7 @@ functionality: arguments: - name: --adapter_fasta type: file + multiple: true description: | Fasta file containing sequences of an adapter ligated to the 3' end (paired data: of the first read). The adapter and subsequent bases are diff --git a/src/cutadapt/script.sh b/src/cutadapt/script.sh index 93a3fa5e..2ae29a3c 100644 --- a/src/cutadapt/script.sh +++ b/src/cutadapt/script.sh @@ -28,20 +28,82 @@ fi # - paired and single-end # - string and fasta ########################################################### + +multi_adapter="" +for adapter in `echo $par_adapter | tr ':' ' '`; do + multi_adapter="$multi_adapter --adapter $adapter" +done + +multi_adapter_fasta="" +for adapter_fasta in `echo $par_adapter_fasta | tr ':' ' '`; do + multi_adapter_fasta="$multi_adapter_fasta --adapter file:$adapter_fasta" +done + +multi_adapter_r2="" +for adapter_r2 in `echo $par_adapter_r2 | tr ':' ' '`; do + multi_adapter_r2="$multi_adapter_r2 --adapter_r2 $adapter_r2" +done + +multi_adapter_fasta_r2="" +for adapter_fasta_r2 in `echo $par_adapter_fasta_r2 | tr ':' ' '`; do + multi_adapter_fasta_r2="$multi_adapter_fasta_r2 --adapter file:$adapter_fasta_r2" +done + +multi_front="" +for front in `echo $par_front | tr ':' ' '`; do + multi_front="$multi_front --front $front" +done + +multi_front_fasta="" +for front_fasta in `echo $par_front_fasta | tr ':' ' '`; do + multi_front_fasta="$multi_front_fasta --front file:$front_fasta" +done + +multi_front_r2="" +for front_r2 in `echo $par_front_r2 | tr ':' ' '`; do + multi_front_r2="$multi_front_r2 --front_r2 $front_r2" +done + +multi_front_fasta_r2="" +for front_fasta_r2 in `echo $par_front_fasta_r2 | tr ':' ' '`; do + multi_front_fasta_r2="$multi_front_fasta_r2 --front file:$front_fasta_r2" +done + +multi_anywhere="" +for anywhere in `echo $par_anywhere | tr ':' ' '`; do + multi_anywhere="$multi_anywhere --anywhere $anywhere" +done + +multi_anywhere_fasta="" +for anywhere_fasta in `echo $par_anywhere_fasta | tr ':' ' '`; do + multi_anywhere_fasta="$multi_anywhere_fasta --anywhere file:$anywhere_fasta" +done + +multi_anywhere_r2="" +for anywhere_r2 in `echo $par_anywhere_r2 | tr ':' ' '`; do + multi_anywhere_r2="$multi_anywhere_r2 --anywhere_r2 $anywhere_r2" +done + +multi_anywhere_fasta_r2="" +for anywhere_fasta_r2 in `echo $par_anywhere_fasta_r2 | tr ':' ' '`; do + multi_anywhere_fasta_r2="$multi_anywhere_fasta_r2 --anywhere file:$anywhere_fasta_r2" +done + echo ">> Parsing arguments dealing with adapters" adapter_args=$(echo \ - ${par_adapter:+--adapter "${par_adapter}"} \ - ${par_adapter_fasta:+--adapter "file:${par_adapter_fasta}"} \ - ${par_front:+--front "${par_front}"} \ - ${par_front_fasta:+--front "file:${par_front_fasta}"} \ - ${par_anywhere:+--anywhere "${par_anywhere}"} \ - ${par_anywhere_fasta:+--anywhere "file:${par_anywhere_fasta}"} \ - ${par_adapter_r2:+--adapter_r2 "${par_adapter_r2}"} \ - ${par_adapter_r2_fasta:+--adapter_r2 "file:${par_adapter_r2_fasta}"} \ - ${par_front_r2:+--front_r2 "${par_front_r2}"} \ - ${par_front_r2_fasta:+--front_r2 "file:${par_front_r2_fasta}"} \ - ${par_anywhere_r2:+--anywhere_r2 "${par_anywhere_r2}"} \ - ${par_anywhere_r2_fasta:+--anywhere_r2 "file:${par_anywhere_r2_fasta}"} + ${par_adapter:+${multi_adapter}} \ + ${par_adapter_fasta:+${multi_adapter_fasta}} \ + ${par_front:+${multi_front}} \ + ${par_front_fasta:+${multi_front_fasta}} \ + ${par_anywhere:+${multi_anywhere}} \ + ${par_anywhere_fasta:+${multi_anywhere_fasta}} \ + + ${par_adapter_r2:+${multi_adapter_r2}} \ + ${par_adapter_fasta_r2:+${multi_adapter_fasta_r2}} \ + ${par_front_r2:+${multi_front_r2}} \ + ${par_front_fasta_r2:+${multi_front_fasta_r2}} \ + ${par_anywhere_r2:+${multi_anywhere_r2}} \ + ${par_anywhere_fasta_r2:+${multi_anywhere_fasta_r2}} \ ) echo "Arguments to cutadapt:" echo "$adapter_args" diff --git a/src/cutadapt/test.sh b/src/cutadapt/test.sh index 77094990..14e2e6fe 100644 --- a/src/cutadapt/test.sh +++ b/src/cutadapt/test.sh @@ -25,11 +25,44 @@ echo ">> Check if output is empty" rm -r output-dir +echo "> Run with a combination of inputs" + +echo ">adapter1" > adapters1.fasta +echo "AGATCGGAAGAGCACACGTCTGAACTCCAGTCAC" >> adapters1.fasta + +echo ">adapter1" > adapters2.fasta +echo "TGATCGGAAGAGCACACGTCTGAACTCCAGTCAC" >> adapters2.fasta + +"$meta_executable" \ + --report minimal \ + --output output-dir \ + --adapter AGATCGGAAGAGCACACGTCTGAACTCCAGTCAC \ + --adapter GGATCGGAAGAGCACACGTCTGAACTCCAGTCAC \ + --adapter_fasta adapters1.fasta \ + --adapter_fasta adapters2.fasta \ + --input $dir_in/se/a.fastq \ + --quality_cutoff 20 \ + --json + +echo ">> Checking output" +[ ! -f "output-dir/report.txt" ] && echo "report.txt does not exist" && exit 1 +[ ! -f "output-dir/report.json" ] && echo "report.json does not exist" && exit 1 +[ ! -f "output-dir/1_R1_001.fastq" ] && echo "1_R1_001.fastq does not exist" && exit 1 +[ ! -f "output-dir/unknown_R1_001.fastq" ] && echo "1_R1_001.fastq does not exist" && exit 1 + +echo ">> Check if output is empty" +[ -s "output-dir/1_R1_001.fastq" ] && echo "1_R1_001.fastq should be empty" && exit 1 +[ ! -s "output-dir/unknown_R1_001.fastq" ] && echo "unkown_R1_001.fastq is empty" && exit 1 + +rm -r output-dir +rm adapters?.fasta + echo "> Run cutadapt on paired-end data" "$meta_executable" \ --report minimal \ --output output-dir \ --adapter AGATCGGAAGAGCACACGTCTGAACTCCAGTCAC \ + --adapter AGATCGGAAGAGCACACGTCTGAACTCCAGTCAB \ --input $dir_in/pe/a.1.fastq \ --input_r2 $dir_in/pe/a.2.fastq \ --quality_cutoff 20 \ diff --git a/src/cutadapt/test_data/script.sh b/src/cutadapt/test_data/script.sh index 87dc085e..3251b59c 100755 --- a/src/cutadapt/test_data/script.sh +++ b/src/cutadapt/test_data/script.sh @@ -13,3 +13,4 @@ cp -r /tmp/snakemake-wrappers/bio/cutadapt/se/test/reads/* src/cutadapt/test_dat cp -r /tmp/snakemake-wrappers/bio/cutadapt/pe/test/reads/* src/cutadapt/test_data/pe rm -rf /tmp/snakemake-wrappers + From 44d98ba3525f9efa11aef1d4e9e26fbd20567233 Mon Sep 17 00:00:00 2001 From: Robrecht Cannoodt Date: Thu, 22 Feb 2024 06:51:23 +0100 Subject: [PATCH 17/32] change multiple_sep to ';' --- _viash.yaml | 7 ++++++- src/cutadapt/script.sh | 24 ++++++++++++------------ 2 files changed, 18 insertions(+), 13 deletions(-) diff --git a/_viash.yaml b/_viash.yaml index c59f8543..0f38a97f 100644 --- a/_viash.yaml +++ b/_viash.yaml @@ -1 +1,6 @@ -viash_version: 0.8.4 \ No newline at end of file +viash_version: 0.8.4 + +# these config mods will be added by PR #25 +config_mods: | + .functionality.arguments[.multiple == true].multiple_sep := ";" + .functionality.argument_groups[true].arguments[.multiple == true].multiple_sep := ";" \ No newline at end of file diff --git a/src/cutadapt/script.sh b/src/cutadapt/script.sh index 2ae29a3c..013a917d 100644 --- a/src/cutadapt/script.sh +++ b/src/cutadapt/script.sh @@ -30,62 +30,62 @@ fi ########################################################### multi_adapter="" -for adapter in `echo $par_adapter | tr ':' ' '`; do +for adapter in `echo $par_adapter | tr ';' ' '`; do multi_adapter="$multi_adapter --adapter $adapter" done multi_adapter_fasta="" -for adapter_fasta in `echo $par_adapter_fasta | tr ':' ' '`; do +for adapter_fasta in `echo $par_adapter_fasta | tr ';' ' '`; do multi_adapter_fasta="$multi_adapter_fasta --adapter file:$adapter_fasta" done multi_adapter_r2="" -for adapter_r2 in `echo $par_adapter_r2 | tr ':' ' '`; do +for adapter_r2 in `echo $par_adapter_r2 | tr ';' ' '`; do multi_adapter_r2="$multi_adapter_r2 --adapter_r2 $adapter_r2" done multi_adapter_fasta_r2="" -for adapter_fasta_r2 in `echo $par_adapter_fasta_r2 | tr ':' ' '`; do +for adapter_fasta_r2 in `echo $par_adapter_fasta_r2 | tr ';' ' '`; do multi_adapter_fasta_r2="$multi_adapter_fasta_r2 --adapter file:$adapter_fasta_r2" done multi_front="" -for front in `echo $par_front | tr ':' ' '`; do +for front in `echo $par_front | tr ';' ' '`; do multi_front="$multi_front --front $front" done multi_front_fasta="" -for front_fasta in `echo $par_front_fasta | tr ':' ' '`; do +for front_fasta in `echo $par_front_fasta | tr ';' ' '`; do multi_front_fasta="$multi_front_fasta --front file:$front_fasta" done multi_front_r2="" -for front_r2 in `echo $par_front_r2 | tr ':' ' '`; do +for front_r2 in `echo $par_front_r2 | tr ';' ' '`; do multi_front_r2="$multi_front_r2 --front_r2 $front_r2" done multi_front_fasta_r2="" -for front_fasta_r2 in `echo $par_front_fasta_r2 | tr ':' ' '`; do +for front_fasta_r2 in `echo $par_front_fasta_r2 | tr ';' ' '`; do multi_front_fasta_r2="$multi_front_fasta_r2 --front file:$front_fasta_r2" done multi_anywhere="" -for anywhere in `echo $par_anywhere | tr ':' ' '`; do +for anywhere in `echo $par_anywhere | tr ';' ' '`; do multi_anywhere="$multi_anywhere --anywhere $anywhere" done multi_anywhere_fasta="" -for anywhere_fasta in `echo $par_anywhere_fasta | tr ':' ' '`; do +for anywhere_fasta in `echo $par_anywhere_fasta | tr ';' ' '`; do multi_anywhere_fasta="$multi_anywhere_fasta --anywhere file:$anywhere_fasta" done multi_anywhere_r2="" -for anywhere_r2 in `echo $par_anywhere_r2 | tr ':' ' '`; do +for anywhere_r2 in `echo $par_anywhere_r2 | tr ';' ' '`; do multi_anywhere_r2="$multi_anywhere_r2 --anywhere_r2 $anywhere_r2" done multi_anywhere_fasta_r2="" -for anywhere_fasta_r2 in `echo $par_anywhere_fasta_r2 | tr ':' ' '`; do +for anywhere_fasta_r2 in `echo $par_anywhere_fasta_r2 | tr ';' ' '`; do multi_anywhere_fasta_r2="$multi_anywhere_fasta_r2 --anywhere file:$anywhere_fasta_r2" done From 550d026d785812824f0307a17099506505e62013 Mon Sep 17 00:00:00 2001 From: Robrecht Cannoodt Date: Thu, 22 Feb 2024 06:52:26 +0100 Subject: [PATCH 18/32] add example --- src/cutadapt/script.sh | 10 ++++++++++ 1 file changed, 10 insertions(+) diff --git a/src/cutadapt/script.sh b/src/cutadapt/script.sh index 013a917d..2f7d16bb 100644 --- a/src/cutadapt/script.sh +++ b/src/cutadapt/script.sh @@ -1,5 +1,15 @@ #!/bin/bash +## VIASH START +par_adapter='AGATCGGAAGAGCACACGTCTGAACTCCAGTCAC;GGATCGGAAGAGCACACGTCTGAACTCCAGTCAC' +par_input='src/cutadapt/test_data/se/a.fastq' +par_report='full' +par_json='false' +par_output='output' +par_fasta='false' +par_info_file='false' +## VIASH END + if [ -z $par_output ]; then par_output=. else From 65b47de17cb0efd5472455ca46d9ececf554ca15 Mon Sep 17 00:00:00 2001 From: Robrecht Cannoodt Date: Thu, 22 Feb 2024 06:52:37 +0100 Subject: [PATCH 19/32] simplify code with a helper function --- src/cutadapt/script.sh | 105 +++++++++++++---------------------------- 1 file changed, 33 insertions(+), 72 deletions(-) diff --git a/src/cutadapt/script.sh b/src/cutadapt/script.sh index 2f7d16bb..1b5b325e 100644 --- a/src/cutadapt/script.sh +++ b/src/cutadapt/script.sh @@ -39,82 +39,43 @@ fi # - string and fasta ########################################################### -multi_adapter="" -for adapter in `echo $par_adapter | tr ';' ' '`; do - multi_adapter="$multi_adapter --adapter $adapter" -done - -multi_adapter_fasta="" -for adapter_fasta in `echo $par_adapter_fasta | tr ';' ' '`; do - multi_adapter_fasta="$multi_adapter_fasta --adapter file:$adapter_fasta" -done - -multi_adapter_r2="" -for adapter_r2 in `echo $par_adapter_r2 | tr ';' ' '`; do - multi_adapter_r2="$multi_adapter_r2 --adapter_r2 $adapter_r2" -done - -multi_adapter_fasta_r2="" -for adapter_fasta_r2 in `echo $par_adapter_fasta_r2 | tr ';' ' '`; do - multi_adapter_fasta_r2="$multi_adapter_fasta_r2 --adapter file:$adapter_fasta_r2" -done - -multi_front="" -for front in `echo $par_front | tr ';' ' '`; do - multi_front="$multi_front --front $front" -done - -multi_front_fasta="" -for front_fasta in `echo $par_front_fasta | tr ';' ' '`; do - multi_front_fasta="$multi_front_fasta --front file:$front_fasta" -done - -multi_front_r2="" -for front_r2 in `echo $par_front_r2 | tr ';' ' '`; do - multi_front_r2="$multi_front_r2 --front_r2 $front_r2" -done - -multi_front_fasta_r2="" -for front_fasta_r2 in `echo $par_front_fasta_r2 | tr ';' ' '`; do - multi_front_fasta_r2="$multi_front_fasta_r2 --front file:$front_fasta_r2" -done - -multi_anywhere="" -for anywhere in `echo $par_anywhere | tr ';' ' '`; do - multi_anywhere="$multi_anywhere --anywhere $anywhere" -done - -multi_anywhere_fasta="" -for anywhere_fasta in `echo $par_anywhere_fasta | tr ';' ' '`; do - multi_anywhere_fasta="$multi_anywhere_fasta --anywhere file:$anywhere_fasta" -done - -multi_anywhere_r2="" -for anywhere_r2 in `echo $par_anywhere_r2 | tr ';' ' '`; do - multi_anywhere_r2="$multi_anywhere_r2 --anywhere_r2 $anywhere_r2" -done - -multi_anywhere_fasta_r2="" -for anywhere_fasta_r2 in `echo $par_anywhere_fasta_r2 | tr ';' ' '`; do - multi_anywhere_fasta_r2="$multi_anywhere_fasta_r2 --anywhere file:$anywhere_fasta_r2" -done +function add_flags { + local arg=$1 + local flag=$2 + local prefix=$3 + [[ -z $prefix ]] && prefix="" + + # This function should not be called if the input is empty + # but check for it just in case + if [[ -z $arg ]]; then + return + fi + + local output="" + IFS=';' read -r -a array <<< "$arg" + for a in "${array[@]}"; do + output="$output $flag $prefix$a" + done + echo $output +} echo ">> Parsing arguments dealing with adapters" adapter_args=$(echo \ - ${par_adapter:+${multi_adapter}} \ - ${par_adapter_fasta:+${multi_adapter_fasta}} \ - ${par_front:+${multi_front}} \ - ${par_front_fasta:+${multi_front_fasta}} \ - ${par_anywhere:+${multi_anywhere}} \ - ${par_anywhere_fasta:+${multi_anywhere_fasta}} \ - - ${par_adapter_r2:+${multi_adapter_r2}} \ - ${par_adapter_fasta_r2:+${multi_adapter_fasta_r2}} \ - ${par_front_r2:+${multi_front_r2}} \ - ${par_front_fasta_r2:+${multi_front_fasta_r2}} \ - ${par_anywhere_r2:+${multi_anywhere_r2}} \ - ${par_anywhere_fasta_r2:+${multi_anywhere_fasta_r2}} \ + ${par_adapter:+$(add_flags "$par_adapter" "--adapter")} \ + ${par_adapter_fasta:+$(add_flags "$par_adapter_fasta" "--adapter" "file:")} \ + ${par_front:+$(add_flags "$par_front" "--front")} \ + ${par_front_fasta:+$(add_flags "$par_front_fasta" "--front" "file:")} \ + ${par_anywhere:+$(add_flags "$par_anywhere" "--anywhere")} \ + ${par_anywhere_fasta:+$(add_flags "$par_anywhere_fasta" "--anywhere" "file:")} \ + + ${par_adapter_r2:+$(add_flags "$par_adapter_r2" "--adapter_r2")} \ + ${par_adapter_fasta_r2:+$(add_flags "$par_adapter_fasta_r2" "--adapter_r2" "file:")} \ + ${par_front_r2:+$(add_flags "$par_front_r2" "--front_r2")} \ + ${par_front_fasta_r2:+$(add_flags "$par_front_fasta_r2" "--front_r2" "file:")} \ + ${par_anywhere_r2:+$(add_flags "$par_anywhere_r2" "--anywhere_r2")} \ + ${par_anywhere_fasta_r2:+$(add_flags "$par_anywhere_fasta_r2" "--anywhere_r2" "file:")} \ ) + echo "Arguments to cutadapt:" echo "$adapter_args" echo From 2f187ce1d15947959679fe97b91d0e044030b094 Mon Sep 17 00:00:00 2001 From: Robrecht Cannoodt Date: Thu, 22 Feb 2024 06:53:05 +0100 Subject: [PATCH 20/32] create directories in test --- src/cutadapt/test.sh | 28 ++++++++++++++++++++++++---- 1 file changed, 24 insertions(+), 4 deletions(-) diff --git a/src/cutadapt/test.sh b/src/cutadapt/test.sh index 14e2e6fe..9059b6f5 100644 --- a/src/cutadapt/test.sh +++ b/src/cutadapt/test.sh @@ -4,6 +4,11 @@ set -e dir_in="$meta_resources_dir/test_data" +############################################# +mkdir test_simple_single_end +cd test_simple_single_end + +echo "#############################################" echo "> Run cutadapt on single-end data" "$meta_executable" \ --report minimal \ @@ -23,8 +28,14 @@ echo ">> Check if output is empty" [ -s "output-dir/1_R1_001.fastq" ] && echo "1_R1_001.fastq should be empty" && exit 1 [ ! -s "output-dir/unknown_R1_001.fastq" ] && echo "unkown_R1_001.fastq is empty" && exit 1 -rm -r output-dir +cd .. +echo + +############################################# +mkdir test_multiple_single_end +cd test_multiple_single_end +echo "#############################################" echo "> Run with a combination of inputs" echo ">adapter1" > adapters1.fasta @@ -54,9 +65,14 @@ echo ">> Check if output is empty" [ -s "output-dir/1_R1_001.fastq" ] && echo "1_R1_001.fastq should be empty" && exit 1 [ ! -s "output-dir/unknown_R1_001.fastq" ] && echo "unkown_R1_001.fastq is empty" && exit 1 -rm -r output-dir -rm adapters?.fasta +cd .. +echo +############################################# +mkdir test_simple_paired_end +cd test_simple_paired_end + +echo "#############################################" echo "> Run cutadapt on paired-end data" "$meta_executable" \ --report minimal \ @@ -83,7 +99,11 @@ echo ">> Check if output is empty" [ ! -s "output-dir/unknown_R1_001.fastq" ] && echo "unkown_R1_001.fastq is empty" && exit 1 [ ! -s "output-dir/unknown_R2_001.fastq" ] && echo "unkown_R2_001.fastq is empty" && exit 1 -rm -r output-dir +cd .. +echo + +############################################# +echo "#############################################" echo "> Test successful" From 986a0901c65fbda4bbf5b7f30de64c4c153b8bd3 Mon Sep 17 00:00:00 2001 From: Robrecht Cannoodt Date: Thu, 22 Feb 2024 09:02:13 +0100 Subject: [PATCH 21/32] use a different output extension if --fasta is provided --- src/cutadapt/script.sh | 12 +++++++++--- 1 file changed, 9 insertions(+), 3 deletions(-) diff --git a/src/cutadapt/script.sh b/src/cutadapt/script.sh index 1b5b325e..839761ac 100644 --- a/src/cutadapt/script.sh +++ b/src/cutadapt/script.sh @@ -178,11 +178,17 @@ echo ">> Output arguments" [[ "$par_fasta" == "false" ]] && unset par_fasta [[ "$par_info_file" == "false" ]] && unset par_info_file +if [[ -z $par_fasta ]]; then + ext="fastq" +else + ext="fa" +fi + if [ $mode = "se" ]; then output_args=$(echo \ ${par_report:+--report "${par_report}"} \ ${par_json:+--json "${par_output}/report.json"} \ - --output "$par_output/{name}_R1_001.fastq" \ + --output "$par_output/{name}_001.$ext" \ ${par_fasta:+--fasta} \ ${par_info_file:+--info-file} \ ) @@ -190,8 +196,8 @@ else output_args=$(echo \ ${par_report:+--report "${par_report}"} \ ${par_json:+--json "${par_output}/report.json"} \ - --output "$par_output/{name}_R1_001.fastq" \ - --paired-output "$par_output/{name}_R2_001.fastq" \ + --output "$par_output/{name}_R1_001.$ext" \ + --paired-output "$par_output/{name}_R2_001.$ext" \ ${par_fasta:+--fasta} \ ${par_info_file:+--info-file} \ ) From 6b76604cb419d3abd80e0c222b96a744831ebb0e Mon Sep 17 00:00:00 2001 From: Robrecht Cannoodt Date: Thu, 22 Feb 2024 09:03:47 +0100 Subject: [PATCH 22/32] decrease code duplication by separating optional outputs from paired/unpaired output arguments --- src/cutadapt/script.sh | 33 +++++++++++++++++++++------------ 1 file changed, 21 insertions(+), 12 deletions(-) diff --git a/src/cutadapt/script.sh b/src/cutadapt/script.sh index 839761ac..bf7bc5d5 100644 --- a/src/cutadapt/script.sh +++ b/src/cutadapt/script.sh @@ -10,6 +10,7 @@ par_fasta='false' par_info_file='false' ## VIASH END +# TODO: change this? if [ -z $par_output ]; then par_output=. else @@ -169,15 +170,29 @@ echo "Arguments to cutadapt:" echo $filter_args echo -# Output arguments -# We write the output to a directory rather than -# individual files. +# Optional output arguments ########################################################### -echo ">> Output arguments" +echo ">> Optional arguments" [[ "$par_json" == "false" ]] && unset par_json [[ "$par_fasta" == "false" ]] && unset par_fasta [[ "$par_info_file" == "false" ]] && unset par_info_file +optional_output_args=$(echo \ + ${par_report:+--report "${par_report}"} \ + ${par_json:+--json "${par_output}/report.json"} \ + ${par_fasta:+--fasta} \ + ${par_info_file:+--info-file "$par_output/info.txt"} \ +) + +echo "Arguments to cutadapt:" +echo $optional_output_args +echo + +# Output arguments +# We write the output to a directory rather than +# individual files. +########################################################### + if [[ -z $par_fasta ]]; then ext="fastq" else @@ -186,22 +201,15 @@ fi if [ $mode = "se" ]; then output_args=$(echo \ - ${par_report:+--report "${par_report}"} \ - ${par_json:+--json "${par_output}/report.json"} \ --output "$par_output/{name}_001.$ext" \ - ${par_fasta:+--fasta} \ - ${par_info_file:+--info-file} \ ) else output_args=$(echo \ - ${par_report:+--report "${par_report}"} \ - ${par_json:+--json "${par_output}/report.json"} \ --output "$par_output/{name}_R1_001.$ext" \ --paired-output "$par_output/{name}_R2_001.$ext" \ - ${par_fasta:+--fasta} \ - ${par_info_file:+--info-file} \ ) fi + echo "Arguments to cutadapt:" echo $output_args echo @@ -221,6 +229,7 @@ cli=$(echo \ $input_args \ $mod_args \ $filter_args \ + $optional_output_args \ $output_args \ --cores $par_cpus ) From 4601f21613d121b0272d9466296b6a313271fcca Mon Sep 17 00:00:00 2001 From: Robrecht Cannoodt Date: Thu, 22 Feb 2024 09:34:55 +0100 Subject: [PATCH 23/32] write custom tests for cutadapt --- src/cutadapt/config.vsh.yaml | 2 - src/cutadapt/script.sh | 2 +- src/cutadapt/test.sh | 192 ++++++++++++++++++++++------ src/cutadapt/test_data/pe/a.1.fastq | 4 - src/cutadapt/test_data/pe/a.2.fastq | 4 - src/cutadapt/test_data/script.sh | 16 --- src/cutadapt/test_data/se/a.fastq | 4 - 7 files changed, 151 insertions(+), 73 deletions(-) delete mode 100644 src/cutadapt/test_data/pe/a.1.fastq delete mode 100644 src/cutadapt/test_data/pe/a.2.fastq delete mode 100755 src/cutadapt/test_data/script.sh delete mode 100644 src/cutadapt/test_data/se/a.fastq diff --git a/src/cutadapt/config.vsh.yaml b/src/cutadapt/config.vsh.yaml index 88636952..b9918b88 100644 --- a/src/cutadapt/config.vsh.yaml +++ b/src/cutadapt/config.vsh.yaml @@ -443,8 +443,6 @@ functionality: test_resources: - type: bash_script path: test.sh - - type: file - path: test_data platforms: - type: docker image: python:3.12 diff --git a/src/cutadapt/script.sh b/src/cutadapt/script.sh index bf7bc5d5..0808ddeb 100644 --- a/src/cutadapt/script.sh +++ b/src/cutadapt/script.sh @@ -196,7 +196,7 @@ echo if [[ -z $par_fasta ]]; then ext="fastq" else - ext="fa" + ext="fasta" fi if [ $mode = "se" ]; then diff --git a/src/cutadapt/test.sh b/src/cutadapt/test.sh index 9059b6f5..d36e6798 100644 --- a/src/cutadapt/test.sh +++ b/src/cutadapt/test.sh @@ -2,7 +2,26 @@ set -e -dir_in="$meta_resources_dir/test_data" +############################################# +# helper functions +assert_file_exists() { + [ -f "$1" ] || (echo "File '$1' does not exist" && exit 1) +} +assert_file_doesnt_exist() { + [ ! -f "$1" ] || (echo "File '$1' exists but shouldn't" && exit 1) +} +assert_file_empty() { + [ ! -s "$1" ] || (echo "File '$1' is not empty but should be" && exit 1) +} +assert_file_not_empty() { + [ -s "$1" ] || (echo "File '$1' is empty but shouldn't be" && exit 1) +} +assert_file_contains() { + grep -q "$2" "$1" || (echo "File '$1' does not contain '$2'" && exit 1) +} +assert_file_not_contains() { + grep -q "$2" "$1" && (echo "File '$1' contains '$2' but shouldn't" && exit 1) +} ############################################# mkdir test_simple_single_end @@ -10,23 +29,56 @@ cd test_simple_single_end echo "#############################################" echo "> Run cutadapt on single-end data" + +cat > example.fa <<'EOF' +>read1 +MYSEQUENCEADAPTER +>read2 +MYSEQUENCEADAP +>read3 +MYSEQUENCEADAPTERSOMETHINGELSE +>read4 +MYSEQUENCEADABTER +>read5 +MYSEQUENCEADAPTR +>read6 +MYSEQUENCEADAPPTER +>read7 +ADAPTERMYSEQUENCE +>read8 +PTERMYSEQUENCE +>read9 +SOMETHINGADAPTERMYSEQUENCE +EOF + "$meta_executable" \ --report minimal \ - --output output-dir \ - --adapter AGATCGGAAGAGCACACGTCTGAACTCCAGTCAC \ - --input $dir_in/se/a.fastq \ - --quality_cutoff 20 \ + --output out_test1 \ + --adapter ADAPTER \ + --input example.fa \ + --fasta \ + --no_match_adapter_wildcards \ --json echo ">> Checking output" -[ ! -f "output-dir/report.txt" ] && echo "report.txt does not exist" && exit 1 -[ ! -f "output-dir/report.json" ] && echo "report.json does not exist" && exit 1 -[ ! -f "output-dir/1_R1_001.fastq" ] && echo "1_R1_001.fastq does not exist" && exit 1 -[ ! -f "output-dir/unknown_R1_001.fastq" ] && echo "1_R1_001.fastq does not exist" && exit 1 +assert_file_exists "out_test1/report.txt" +assert_file_exists "out_test1/report.json" +assert_file_exists "out_test1/1_001.fasta" +assert_file_exists "out_test1/unknown_001.fasta" echo ">> Check if output is empty" -[ -s "output-dir/1_R1_001.fastq" ] && echo "1_R1_001.fastq should be empty" && exit 1 -[ ! -s "output-dir/unknown_R1_001.fastq" ] && echo "unkown_R1_001.fastq is empty" && exit 1 +assert_file_not_empty "out_test1/report.txt" +assert_file_not_empty "out_test1/report.json" +assert_file_not_empty "out_test1/1_001.fasta" +assert_file_not_empty "out_test1/unknown_001.fasta" + +echo ">> Check contents" +for i in 1 2 3 7 9; do + assert_file_contains "out_test1/1_001.fasta" ">read$i" +done +for i in 4 5 6 8; do + assert_file_contains "out_test1/unknown_001.fasta" ">read$i" +done cd .. echo @@ -38,32 +90,58 @@ cd test_multiple_single_end echo "#############################################" echo "> Run with a combination of inputs" -echo ">adapter1" > adapters1.fasta -echo "AGATCGGAAGAGCACACGTCTGAACTCCAGTCAC" >> adapters1.fasta - -echo ">adapter1" > adapters2.fasta -echo "TGATCGGAAGAGCACACGTCTGAACTCCAGTCAC" >> adapters2.fasta +cat > example.fa <<'EOF' +>read1 +ACGTACGTACGTAAAAA +>read2 +ACGTACGTACGTCCCCC +>read3 +ACGTACGTACGTGGGGG +>read4 +ACGTACGTACGTTTTTT +EOF + +cat > adapters1.fasta <<'EOF' +>adapter1 +CCCCC +EOF + +cat > adapters2.fasta <<'EOF' +>adapter2 +GGGGG +EOF "$meta_executable" \ --report minimal \ - --output output-dir \ - --adapter AGATCGGAAGAGCACACGTCTGAACTCCAGTCAC \ - --adapter GGATCGGAAGAGCACACGTCTGAACTCCAGTCAC \ + --output out_test2 \ + --adapter AAAAA \ --adapter_fasta adapters1.fasta \ --adapter_fasta adapters2.fasta \ - --input $dir_in/se/a.fastq \ - --quality_cutoff 20 \ + --input example.fa \ + --fasta \ --json echo ">> Checking output" -[ ! -f "output-dir/report.txt" ] && echo "report.txt does not exist" && exit 1 -[ ! -f "output-dir/report.json" ] && echo "report.json does not exist" && exit 1 -[ ! -f "output-dir/1_R1_001.fastq" ] && echo "1_R1_001.fastq does not exist" && exit 1 -[ ! -f "output-dir/unknown_R1_001.fastq" ] && echo "1_R1_001.fastq does not exist" && exit 1 +assert_file_exists "out_test2/report.txt" +assert_file_exists "out_test2/report.json" +assert_file_exists "out_test2/1_001.fasta" +assert_file_exists "out_test2/adapter1_001.fasta" +assert_file_exists "out_test2/adapter2_001.fasta" +assert_file_exists "out_test2/unknown_001.fasta" echo ">> Check if output is empty" -[ -s "output-dir/1_R1_001.fastq" ] && echo "1_R1_001.fastq should be empty" && exit 1 -[ ! -s "output-dir/unknown_R1_001.fastq" ] && echo "unkown_R1_001.fastq is empty" && exit 1 +assert_file_not_empty "out_test2/report.txt" +assert_file_not_empty "out_test2/report.json" +assert_file_not_empty "out_test2/1_001.fasta" +assert_file_not_empty "out_test2/adapter1_001.fasta" +assert_file_not_empty "out_test2/adapter2_001.fasta" +assert_file_not_empty "out_test2/unknown_001.fasta" + +echo ">> Check contents" +assert_file_contains "out_test2/1_001.fasta" ">read1" +assert_file_contains "out_test2/adapter1_001.fasta" ">read2" +assert_file_contains "out_test2/adapter2_001.fasta" ">read3" +assert_file_contains "out_test2/unknown_001.fasta" ">read4" cd .. echo @@ -74,30 +152,60 @@ cd test_simple_paired_end echo "#############################################" echo "> Run cutadapt on paired-end data" + +cat > example_R1.fastq <<'EOF' +@read1 +ACGTACGTACGTAAAAA ++ +IIIIIIIIIIIIIIIII +@read2 +ACGTACGTACGTCCCCC ++ +IIIIIIIIIIIIIIIII +EOF + +cat > example_R2.fastq <<'EOF' +@read1 +ACGTACGTACGTGGGGG ++ +IIIIIIIIIIIIIIIII +@read2 +ACGTACGTACGTTTTTT ++ +IIIIIIIIIIIIIIIII +EOF + "$meta_executable" \ --report minimal \ - --output output-dir \ - --adapter AGATCGGAAGAGCACACGTCTGAACTCCAGTCAC \ - --adapter AGATCGGAAGAGCACACGTCTGAACTCCAGTCAB \ - --input $dir_in/pe/a.1.fastq \ - --input_r2 $dir_in/pe/a.2.fastq \ + --output out_test3 \ + --adapter AAAAA \ + --adapter_r2 GGGGG \ + --input example_R1.fastq \ + --input_r2 example_R2.fastq \ --quality_cutoff 20 \ --json \ ---cpus 1 echo ">> Checking output" -[ ! -f "output-dir/report.txt" ] && echo "report.txt does not exist" && exit 1 -[ ! -f "output-dir/report.json" ] && echo "report.json does not exist" && exit 1 -[ ! -f "output-dir/1_R1_001.fastq" ] && echo "1_R1_001.fastq does not exist" && exit 1 -[ ! -f "output-dir/1_R2_001.fastq" ] && echo "1_R1_001.fastq does not exist" && exit 1 -[ ! -f "output-dir/unknown_R1_001.fastq" ] && echo "1_R1_001.fastq does not exist" && exit 1 -[ ! -f "output-dir/unknown_R2_001.fastq" ] && echo "1_R1_001.fastq does not exist" && exit 1 +assert_file_exists "out_test3/report.txt" +assert_file_exists "out_test3/report.json" +assert_file_exists "out_test3/1_R1_001.fastq" +assert_file_exists "out_test3/1_R2_001.fastq" +assert_file_exists "out_test3/unknown_R1_001.fastq" +assert_file_exists "out_test3/unknown_R2_001.fastq" echo ">> Check if output is empty" -[ -s "output-dir/1_R1_001.fastq" ] && echo "1_R1_001.fastq should be empty" && exit 1 -[ -s "output-dir/1_R2_001.fastq" ] && echo "1_R2_001.fastq should be empty" && exit 1 -[ ! -s "output-dir/unknown_R1_001.fastq" ] && echo "unkown_R1_001.fastq is empty" && exit 1 -[ ! -s "output-dir/unknown_R2_001.fastq" ] && echo "unkown_R2_001.fastq is empty" && exit 1 +assert_file_not_empty "out_test3/report.txt" +assert_file_not_empty "out_test3/report.json" +assert_file_not_empty "out_test3/1_R1_001.fastq" +assert_file_not_empty "out_test3/1_R2_001.fastq" +assert_file_not_empty "out_test3/unknown_R1_001.fastq" + +echo ">> Check contents" +assert_file_contains "out_test3/1_R1_001.fastq" "@read1" +assert_file_contains "out_test3/1_R2_001.fastq" "@read1" +assert_file_contains "out_test3/unknown_R1_001.fastq" "@read2" +assert_file_contains "out_test3/unknown_R2_001.fastq" "@read2" cd .. echo diff --git a/src/cutadapt/test_data/pe/a.1.fastq b/src/cutadapt/test_data/pe/a.1.fastq deleted file mode 100644 index 42735560..00000000 --- a/src/cutadapt/test_data/pe/a.1.fastq +++ /dev/null @@ -1,4 +0,0 @@ -@1 -ACGGCAT -+ -!!!!!!! diff --git a/src/cutadapt/test_data/pe/a.2.fastq b/src/cutadapt/test_data/pe/a.2.fastq deleted file mode 100644 index 42735560..00000000 --- a/src/cutadapt/test_data/pe/a.2.fastq +++ /dev/null @@ -1,4 +0,0 @@ -@1 -ACGGCAT -+ -!!!!!!! diff --git a/src/cutadapt/test_data/script.sh b/src/cutadapt/test_data/script.sh deleted file mode 100755 index 3251b59c..00000000 --- a/src/cutadapt/test_data/script.sh +++ /dev/null @@ -1,16 +0,0 @@ -# cutadapt test data - -# Test data was obtained from https://github.com/snakemake/snakemake-wrappers/tree/master/bio/cutadapt/test - -if [ ! -d /tmp/snakemake-wrappers ]; then - git clone --depth 1 --single-branch --branch master https://github.com/snakemake/snakemake-wrappers /tmp/snakemake-wrappers -fi - -mkdir -p src/cutadapt/test_data/pe -mkdir src/cutadapt/test_data/se - -cp -r /tmp/snakemake-wrappers/bio/cutadapt/se/test/reads/* src/cutadapt/test_data/se -cp -r /tmp/snakemake-wrappers/bio/cutadapt/pe/test/reads/* src/cutadapt/test_data/pe - -rm -rf /tmp/snakemake-wrappers - diff --git a/src/cutadapt/test_data/se/a.fastq b/src/cutadapt/test_data/se/a.fastq deleted file mode 100644 index 42735560..00000000 --- a/src/cutadapt/test_data/se/a.fastq +++ /dev/null @@ -1,4 +0,0 @@ -@1 -ACGGCAT -+ -!!!!!!! From 45d1989c80a106cf9494651ec3f8f8c1be182cfa Mon Sep 17 00:00:00 2001 From: Robrecht Cannoodt Date: Thu, 22 Feb 2024 10:21:44 +0100 Subject: [PATCH 24/32] fix _r2 arguments --- src/cutadapt/script.sh | 13 ++++++------- 1 file changed, 6 insertions(+), 7 deletions(-) diff --git a/src/cutadapt/script.sh b/src/cutadapt/script.sh index 0808ddeb..2d15843f 100644 --- a/src/cutadapt/script.sh +++ b/src/cutadapt/script.sh @@ -68,13 +68,12 @@ adapter_args=$(echo \ ${par_front_fasta:+$(add_flags "$par_front_fasta" "--front" "file:")} \ ${par_anywhere:+$(add_flags "$par_anywhere" "--anywhere")} \ ${par_anywhere_fasta:+$(add_flags "$par_anywhere_fasta" "--anywhere" "file:")} \ - - ${par_adapter_r2:+$(add_flags "$par_adapter_r2" "--adapter_r2")} \ - ${par_adapter_fasta_r2:+$(add_flags "$par_adapter_fasta_r2" "--adapter_r2" "file:")} \ - ${par_front_r2:+$(add_flags "$par_front_r2" "--front_r2")} \ - ${par_front_fasta_r2:+$(add_flags "$par_front_fasta_r2" "--front_r2" "file:")} \ - ${par_anywhere_r2:+$(add_flags "$par_anywhere_r2" "--anywhere_r2")} \ - ${par_anywhere_fasta_r2:+$(add_flags "$par_anywhere_fasta_r2" "--anywhere_r2" "file:")} \ + ${par_adapter_r2:+$(add_flags "$par_adapter_r2" "-A")} \ + ${par_adapter_fasta_r2:+$(add_flags "$par_adapter_fasta_r2" "-A" "file:")} \ + ${par_front_r2:+$(add_flags "$par_front_r2" "-G")} \ + ${par_front_fasta_r2:+$(add_flags "$par_front_fasta_r2" "-G" "file:")} \ + ${par_anywhere_r2:+$(add_flags "$par_anywhere_r2" "-B")} \ + ${par_anywhere_fasta_r2:+$(add_flags "$par_anywhere_fasta_r2" "-B" "file:")} \ ) echo "Arguments to cutadapt:" From 22f99c6e95283be10319baeee31bc19fa2f4c843 Mon Sep 17 00:00:00 2001 From: Robrecht Cannoodt Date: Thu, 22 Feb 2024 10:22:04 +0100 Subject: [PATCH 25/32] add debug flag as not to always print the cli command --- src/cutadapt/config.vsh.yaml | 5 ++++ src/cutadapt/script.sh | 58 +++++++++++++++++++----------------- 2 files changed, 36 insertions(+), 27 deletions(-) diff --git a/src/cutadapt/config.vsh.yaml b/src/cutadapt/config.vsh.yaml index b9918b88..ebd56e3b 100644 --- a/src/cutadapt/config.vsh.yaml +++ b/src/cutadapt/config.vsh.yaml @@ -437,6 +437,11 @@ functionality: # - name: --untrimmed_paired_output # - name: too_short_paired_output # - name: too_long_paired_output + - name: Debug + arguments: + - type: boolean_true + name: --debug + description: Print debug information resources: - type: bash_script path: script.sh diff --git a/src/cutadapt/script.sh b/src/cutadapt/script.sh index 2d15843f..1edfb090 100644 --- a/src/cutadapt/script.sh +++ b/src/cutadapt/script.sh @@ -8,6 +8,7 @@ par_json='false' par_output='output' par_fasta='false' par_info_file='false' +par_debug='true' ## VIASH END # TODO: change this? @@ -17,11 +18,13 @@ else mkdir -p "$par_output" fi +function debug { + [[ "$par_debug" == "true" ]] && echo "DEBUG: $@" +} # Init ########################################################### -echo "Running cutadapt" -echo + echo ">> Paired-end data or not?" mode="" @@ -60,7 +63,7 @@ function add_flags { echo $output } -echo ">> Parsing arguments dealing with adapters" +debug ">> Parsing arguments dealing with adapters" adapter_args=$(echo \ ${par_adapter:+$(add_flags "$par_adapter" "--adapter")} \ ${par_adapter_fasta:+$(add_flags "$par_adapter_fasta" "--adapter" "file:")} \ @@ -76,9 +79,9 @@ adapter_args=$(echo \ ${par_anywhere_fasta_r2:+$(add_flags "$par_anywhere_fasta_r2" "-B" "file:")} \ ) -echo "Arguments to cutadapt:" -echo "$adapter_args" -echo +debug "Arguments to cutadapt:" +debug "$adapter_args" +debug # Paired-end options ########################################################### @@ -91,9 +94,9 @@ paired_args=$(echo \ ${par_pair_filter:+--pair-filter "${par_pair_filter}"} \ ${par_interleaved:+--interleaved} ) -echo "Arguments to cutadapt:" -echo $paired_args -echo +debug "Arguments to cutadapt:" +debug $paired_args +debug # Input arguments ########################################################### @@ -113,9 +116,9 @@ input_args=$(echo \ ${par_action:+--action "${par_action}"} \ ${par_revcomp:+--revcomp} \ ) -echo "Arguments to cutadapt:" -echo $input_args -echo +debug "Arguments to cutadapt:" +debug $input_args +debug # Read modifications ########################################################### @@ -141,9 +144,9 @@ mod_args=$(echo \ ${par_rename:+--rename "${par_rename}"} \ ${par_zero_cap:+--zero-cap} \ ) -echo "Arguments to cutadapt:" -echo $mod_args -echo +debug "Arguments to cutadapt:" +debug $mod_args +debug # Filtering of processed reads arguments ########################################################### @@ -165,9 +168,9 @@ filter_args=$(echo \ ${par_discard_untrimmed:+--discard-untrimmed} \ ${par_discard_casava:+--discard-casava} \ ) -echo "Arguments to cutadapt:" -echo $filter_args -echo +debug "Arguments to cutadapt:" +debug $filter_args +debug # Optional output arguments ########################################################### @@ -183,9 +186,9 @@ optional_output_args=$(echo \ ${par_info_file:+--info-file "$par_output/info.txt"} \ ) -echo "Arguments to cutadapt:" -echo $optional_output_args -echo +debug "Arguments to cutadapt:" +debug $optional_output_args +debug # Output arguments # We write the output to a directory rather than @@ -209,15 +212,14 @@ else ) fi -echo "Arguments to cutadapt:" -echo $output_args -echo +debug "Arguments to cutadapt:" +debug $output_args +debug # Full CLI # Set the --cores argument to 0 unless meta_cpus is set ########################################################### -echo ">> Full CLI to be run:" - +echo ">> Running cutadapt" par_cpus=0 [[ ! -z $meta_cpus ]] && par_cpus=$meta_cpus @@ -233,6 +235,8 @@ cli=$(echo \ --cores $par_cpus ) -echo cutadapt $cli | sed -e 's/--/\r\n --/g' +debug ">> Full CLI to be run:" +debug cutadapt $cli | sed -e 's/--/\r\n --/g' +debug cutadapt $cli | tee $par_output/report.txt From 8370251ab0f46ce846decc9d8d101f8f00165a07 Mon Sep 17 00:00:00 2001 From: Robrecht Cannoodt Date: Sat, 24 Feb 2024 22:24:42 +0100 Subject: [PATCH 26/32] remove comment --- _viash.yaml | 1 - 1 file changed, 1 deletion(-) diff --git a/_viash.yaml b/_viash.yaml index f13febba..65344505 100644 --- a/_viash.yaml +++ b/_viash.yaml @@ -1,6 +1,5 @@ viash_version: 0.8.5 -# these config mods will be added by PR #25 config_mods: | .functionality.arguments[.multiple == true].multiple_sep := ";" .functionality.argument_groups[true].arguments[.multiple == true].multiple_sep := ";" \ No newline at end of file From e3204f9dd69fbd9e93d72c048163c8cce83bf00d Mon Sep 17 00:00:00 2001 From: Toni Verbeiren Date: Wed, 5 Jun 2024 15:40:40 +0200 Subject: [PATCH 27/32] Update to Viash 0.9.0-RC4 --- _viash.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/_viash.yaml b/_viash.yaml index 397a8d69..f0521ff1 100644 --- a/_viash.yaml +++ b/_viash.yaml @@ -7,7 +7,7 @@ links: issue_tracker: https://github.com/viash-hub/biobase/issues repository: https://github.com/viash-hub/biobase -viash_version: 0.9.0-RC3 +viash_version: 0.9.0-RC4 config_mods: | .requirements.commands := ['ps'] From f399e38f189bda3f1408d6f94ac558e263534a2a Mon Sep 17 00:00:00 2001 From: Toni Verbeiren Date: Wed, 5 Jun 2024 15:40:52 +0200 Subject: [PATCH 28/32] Ability to specify output globbing patterns --- src/cutadapt/config.vsh.yaml | 894 ++++++++++++++++++----------------- src/cutadapt/script.sh | 19 +- src/cutadapt/test.sh | 73 ++- 3 files changed, 519 insertions(+), 467 deletions(-) diff --git a/src/cutadapt/config.vsh.yaml b/src/cutadapt/config.vsh.yaml index ebd56e3b..b7ca2997 100644 --- a/src/cutadapt/config.vsh.yaml +++ b/src/cutadapt/config.vsh.yaml @@ -1,454 +1,462 @@ -functionality: - name: cutadapt - description: | - Cutadapt removes adapter sequences from high-throughput sequencing reads. - info: - keywords: [RNA-seq, scRNA-seq, high-throughput] - homepage: https://cutadapt.readthedocs.io/ - documentation: https://cutadapt.readthedocs.io/ - repository: https://github.com/marcelm/cutadapt - reference: http://dx.doi.org/10.14806/ej.17.1.200 - license: MIT - argument_groups: - #################################################################### - - name: Specify Adapters for R1 - arguments: - - name: --adapter - alternatives: [-a] - type: string - multiple: true - description: | - Sequence of an adapter ligated to the 3' end (paired data: - of the first read). The adapter and subsequent bases are - trimmed. If a '$' character is appended ('anchoring'), the - adapter is only found if it is a suffix of the read. - required: false - - name: --front - alternatives: [-g] - type: string - multiple: true - description: | - Sequence of an adapter ligated to the 5' end (paired data: - of the first read). The adapter and any preceding bases - are trimmed. Partial matches at the 5' end are allowed. If - a '^' character is prepended ('anchoring'), the adapter is - only found if it is a prefix of the read. - required: false - - name: --anywhere - alternatives: [-b] - type: string - multiple: true - description: | - Sequence of an adapter that may be ligated to the 5' or 3' - end (paired data: of the first read). Both types of - matches as described under -a and -g are allowed. If the - first base of the read is part of the match, the behavior - is as with -g, otherwise as with -a. This option is mostly - for rescuing failed library preparations - do not use if - you know which end your adapter was ligated to! - required: false +name: cutadapt +description: | + Cutadapt removes adapter sequences from high-throughput sequencing reads. +info: + keywords: [RNA-seq, scRNA-seq, high-throughput] + homepage: https://cutadapt.readthedocs.io/ + documentation: https://cutadapt.readthedocs.io/ + repository: https://github.com/marcelm/cutadapt + reference: http://dx.doi.org/10.14806/ej.17.1.200 + license: MIT +argument_groups: + #################################################################### + - name: Specify Adapters for R1 + arguments: + - name: --adapter + alternatives: [-a] + type: string + multiple: true + description: | + Sequence of an adapter ligated to the 3' end (paired data: + of the first read). The adapter and subsequent bases are + trimmed. If a '$' character is appended ('anchoring'), the + adapter is only found if it is a suffix of the read. + required: false + - name: --front + alternatives: [-g] + type: string + multiple: true + description: | + Sequence of an adapter ligated to the 5' end (paired data: + of the first read). The adapter and any preceding bases + are trimmed. Partial matches at the 5' end are allowed. If + a '^' character is prepended ('anchoring'), the adapter is + only found if it is a prefix of the read. + required: false + - name: --anywhere + alternatives: [-b] + type: string + multiple: true + description: | + Sequence of an adapter that may be ligated to the 5' or 3' + end (paired data: of the first read). Both types of + matches as described under -a and -g are allowed. If the + first base of the read is part of the match, the behavior + is as with -g, otherwise as with -a. This option is mostly + for rescuing failed library preparations - do not use if + you know which end your adapter was ligated to! + required: false - #################################################################### - - name: Specify Adapters using Fasta files for R1 - arguments: - - name: --adapter_fasta - type: file - multiple: true - description: | - Fasta file containing sequences of an adapter ligated to the 3' end (paired data: - of the first read). The adapter and subsequent bases are - trimmed. If a '$' character is appended ('anchoring'), the - adapter is only found if it is a suffix of the read. - required: false - - name: --front_fasta - type: file - description: | - Fasta file containing sequences of an adapter ligated to the 5' end (paired data: - of the first read). The adapter and any preceding bases - are trimmed. Partial matches at the 5' end are allowed. If - a '^' character is prepended ('anchoring'), the adapter is - only found if it is a prefix of the read. - required: false - - name: --anywhere_fasta - type: file - description: | - Fasta file containing sequences of an adapter that may be ligated to the 5' or 3' - end (paired data: of the first read). Both types of - matches as described under -a and -g are allowed. If the - first base of the read is part of the match, the behavior - is as with -g, otherwise as with -a. This option is mostly - for rescuing failed library preparations - do not use if - you know which end your adapter was ligated to! - required: false + #################################################################### + - name: Specify Adapters using Fasta files for R1 + arguments: + - name: --adapter_fasta + type: file + multiple: true + description: | + Fasta file containing sequences of an adapter ligated to the 3' end (paired data: + of the first read). The adapter and subsequent bases are + trimmed. If a '$' character is appended ('anchoring'), the + adapter is only found if it is a suffix of the read. + required: false + - name: --front_fasta + type: file + description: | + Fasta file containing sequences of an adapter ligated to the 5' end (paired data: + of the first read). The adapter and any preceding bases + are trimmed. Partial matches at the 5' end are allowed. If + a '^' character is prepended ('anchoring'), the adapter is + only found if it is a prefix of the read. + required: false + - name: --anywhere_fasta + type: file + description: | + Fasta file containing sequences of an adapter that may be ligated to the 5' or 3' + end (paired data: of the first read). Both types of + matches as described under -a and -g are allowed. If the + first base of the read is part of the match, the behavior + is as with -g, otherwise as with -a. This option is mostly + for rescuing failed library preparations - do not use if + you know which end your adapter was ligated to! + required: false - #################################################################### - - name: Specify Adapters for R2 - arguments: - - name: --adapter_r2 - alternatives: [-A] - type: string - multiple: true - description: | - Sequence of an adapter ligated to the 3' end (paired data: - of the first read). The adapter and subsequent bases are - trimmed. If a '$' character is appended ('anchoring'), the - adapter is only found if it is a suffix of the read. - required: false - - name: --front_r2 - alternatives: [-G] - type: string - multiple: true - description: | - Sequence of an adapter ligated to the 5' end (paired data: - of the first read). The adapter and any preceding bases - are trimmed. Partial matches at the 5' end are allowed. If - a '^' character is prepended ('anchoring'), the adapter is - only found if it is a prefix of the read. - required: false - - name: --anywhere_r2 - alternatives: [-B] - type: string - multiple: true - description: | - Sequence of an adapter that may be ligated to the 5' or 3' - end (paired data: of the first read). Both types of - matches as described under -a and -g are allowed. If the - first base of the read is part of the match, the behavior - is as with -g, otherwise as with -a. This option is mostly - for rescuing failed library preparations - do not use if - you know which end your adapter was ligated to! - required: false + #################################################################### + - name: Specify Adapters for R2 + arguments: + - name: --adapter_r2 + alternatives: [-A] + type: string + multiple: true + description: | + Sequence of an adapter ligated to the 3' end (paired data: + of the first read). The adapter and subsequent bases are + trimmed. If a '$' character is appended ('anchoring'), the + adapter is only found if it is a suffix of the read. + required: false + - name: --front_r2 + alternatives: [-G] + type: string + multiple: true + description: | + Sequence of an adapter ligated to the 5' end (paired data: + of the first read). The adapter and any preceding bases + are trimmed. Partial matches at the 5' end are allowed. If + a '^' character is prepended ('anchoring'), the adapter is + only found if it is a prefix of the read. + required: false + - name: --anywhere_r2 + alternatives: [-B] + type: string + multiple: true + description: | + Sequence of an adapter that may be ligated to the 5' or 3' + end (paired data: of the first read). Both types of + matches as described under -a and -g are allowed. If the + first base of the read is part of the match, the behavior + is as with -g, otherwise as with -a. This option is mostly + for rescuing failed library preparations - do not use if + you know which end your adapter was ligated to! + required: false - #################################################################### - - name: Specify Adapters using Fasta files for R2 - arguments: - - name: --adapter_r2_fasta - type: file - description: | - Fasta file containing sequences of an adapter ligated to the 3' end (paired data: - of the first read). The adapter and subsequent bases are - trimmed. If a '$' character is appended ('anchoring'), the - adapter is only found if it is a suffix of the read. - required: false - - name: --front_r2_fasta - type: file - description: | - Fasta file containing sequences of an adapter ligated to the 5' end (paired data: - of the first read). The adapter and any preceding bases - are trimmed. Partial matches at the 5' end are allowed. If - a '^' character is prepended ('anchoring'), the adapter is - only found if it is a prefix of the read. - required: false - - name: --anywhere_r2_fasta - type: file - description: | - Fasta file containing sequences of an adapter that may be ligated to the 5' or 3' - end (paired data: of the first read). Both types of - matches as described under -a and -g are allowed. If the - first base of the read is part of the match, the behavior - is as with -g, otherwise as with -a. This option is mostly - for rescuing failed library preparations - do not use if - you know which end your adapter was ligated to! - required: false + #################################################################### + - name: Specify Adapters using Fasta files for R2 + arguments: + - name: --adapter_r2_fasta + type: file + description: | + Fasta file containing sequences of an adapter ligated to the 3' end (paired data: + of the first read). The adapter and subsequent bases are + trimmed. If a '$' character is appended ('anchoring'), the + adapter is only found if it is a suffix of the read. + required: false + - name: --front_r2_fasta + type: file + description: | + Fasta file containing sequences of an adapter ligated to the 5' end (paired data: + of the first read). The adapter and any preceding bases + are trimmed. Partial matches at the 5' end are allowed. If + a '^' character is prepended ('anchoring'), the adapter is + only found if it is a prefix of the read. + required: false + - name: --anywhere_r2_fasta + type: file + description: | + Fasta file containing sequences of an adapter that may be ligated to the 5' or 3' + end (paired data: of the first read). Both types of + matches as described under -a and -g are allowed. If the + first base of the read is part of the match, the behavior + is as with -g, otherwise as with -a. This option is mostly + for rescuing failed library preparations - do not use if + you know which end your adapter was ligated to! + required: false - #################################################################### - - name: Paired-end options - arguments: - - name: --pair_adapters - type: boolean_true - description: | - Treat adapters given with -a/-A etc. as pairs. Either both - or none are removed from each read pair. - - name: --pair_filter - type: string - choices: [any, both, first] - description: | - Which of the reads in a paired-end read have to match the - filtering criterion in order for the pair to be filtered. - - name: --interleaved - type: boolean_true - description: | - Read and/or write interleaved paired-end reads. + #################################################################### + - name: Paired-end options + arguments: + - name: --pair_adapters + type: boolean_true + description: | + Treat adapters given with -a/-A etc. as pairs. Either both + or none are removed from each read pair. + - name: --pair_filter + type: string + choices: [any, both, first] + description: | + Which of the reads in a paired-end read have to match the + filtering criterion in order for the pair to be filtered. + - name: --interleaved + type: boolean_true + description: | + Read and/or write interleaved paired-end reads. - #################################################################### - - name: Input parameters - arguments: - - name: --input - type: file - required: true - description: | - Input fastq file for single-end reads or R1 for paired-end reads. - - name: --input_r2 - type: file - required: false - description: | - Input fastq file for R2 in the case of paired-end reads. - - name: --error_rate - alternatives: [-E, --errors] - type: double - description: | - Maximum allowed error rate (if 0 <= E < 1), or absolute - number of errors for full-length adapter match (if E is an - integer >= 1). Error rate = no. of errors divided by - length of matching region. Default: 0.1 (10%). - example: 0.1 - - name: --no_indels - type: boolean_false - description: | - Allow only mismatches in alignments. + #################################################################### + - name: Input parameters + arguments: + - name: --input + type: file + required: true + description: | + Input fastq file for single-end reads or R1 for paired-end reads. + - name: --input_r2 + type: file + required: false + description: | + Input fastq file for R2 in the case of paired-end reads. + - name: --error_rate + alternatives: [-E, --errors] + type: double + description: | + Maximum allowed error rate (if 0 <= E < 1), or absolute + number of errors for full-length adapter match (if E is an + integer >= 1). Error rate = no. of errors divided by + length of matching region. Default: 0.1 (10%). + example: 0.1 + - name: --no_indels + type: boolean_false + description: | + Allow only mismatches in alignments. - - name: --times - type: integer - alternatives: [-n] - description: | - Remove up to COUNT adapters from each read. Default: 1. - example: 1 - - name: --overlap - alternatives: [-O] - type: integer - description: | - Require MINLENGTH overlap between read and adapter for an - adapter to be found. The default is 3. - example: 3 - - name: --match_read_wildcards - type: boolean_true - description: | - Interpret IUPAC wildcards in reads. - - name: --no_match_adapter_wildcards - type: boolean_false - description: | - Do not interpret IUPAC wildcards in adapters. - - name: --action - type: string - choices: - - trim - - retain - - mask - - lowercase - - none - description: | - What to do if a match was found. trim: trim adapter and - up- or downstream sequence; retain: trim, but retain - adapter; mask: replace with 'N' characters; lowercase: - convert to lowercase; none: leave unchanged. - The default is trim. - example: trim - - name: --revcomp - alternatives: [--rc] - type: boolean_true - description: | - Check both the read and its reverse complement for adapter - matches. If match is on reverse-complemented version, - output that one. + - name: --times + type: integer + alternatives: [-n] + description: | + Remove up to COUNT adapters from each read. Default: 1. + example: 1 + - name: --overlap + alternatives: [-O] + type: integer + description: | + Require MINLENGTH overlap between read and adapter for an + adapter to be found. The default is 3. + example: 3 + - name: --match_read_wildcards + type: boolean_true + description: | + Interpret IUPAC wildcards in reads. + - name: --no_match_adapter_wildcards + type: boolean_false + description: | + Do not interpret IUPAC wildcards in adapters. + - name: --action + type: string + choices: + - trim + - retain + - mask + - lowercase + - none + description: | + What to do if a match was found. trim: trim adapter and + up- or downstream sequence; retain: trim, but retain + adapter; mask: replace with 'N' characters; lowercase: + convert to lowercase; none: leave unchanged. + The default is trim. + example: trim + - name: --revcomp + alternatives: [--rc] + type: boolean_true + description: | + Check both the read and its reverse complement for adapter + matches. If match is on reverse-complemented version, + output that one. - #################################################################### - - name: Read modifications - arguments: - - name: --cut - alternatives: [-u] - type: integer - multiple: true - description: | - Remove LEN bases from each read (or R1 if paired; use --cut_r2 - option for R2). If LEN is positive, remove bases from the - beginning. If LEN is negative, remove bases from the end. - Can be used twice if LENs have different signs. Applied - *before* adapter trimming. - - name: --cut_r2 - type: integer - multiple: true - description: | - Remove LEN bases from each read (for R2). If LEN is positive, remove bases from the - beginning. If LEN is negative, remove bases from the end. - Can be used twice if LENs have different signs. Applied - *before* adapter trimming. - - name: --nextseq_trim - type: string - description: | - NextSeq-specific quality trimming (each read). Trims also - dark cycles appearing as high-quality G bases. - - name: --quality_cutoff - alternatives: [-q] - type: string - description: | - Trim low-quality bases from 5' and/or 3' ends of each read - before adapter removal. Applied to both reads if data is - paired. If one value is given, only the 3' end is trimmed. - If two comma-separated cutoffs are given, the 5' end is - trimmed with the first cutoff, the 3' end with the second. - - name: --quality_cutoff_r2 - alternatives: [-Q] - type: string - description: | - Quality-trimming cutoff for R2. Default: same as for R1 - - name: --quality_base - type: integer - description: | - Assume that quality values in FASTQ are encoded as - ascii(quality + N). This needs to be set to 64 for some - old Illumina FASTQ files. The default is 33. - example: 33 - - name: --poly_a - type: boolean_true - description: Trim poly-A tails - - name: --length - alternatives: [-l] - type: integer - description: | - Shorten reads to LENGTH. Positive values remove bases at - the end while negative ones remove bases at the beginning. - This and the following modifications are applied after - adapter trimming. - - name: --trim_n - type: boolean_true - description: Trim N's on ends of reads. - - name: --length_tag - type: string - description: | - Search for TAG followed by a decimal number in the - description field of the read. Replace the decimal number - with the correct length of the trimmed read. For example, - use --length-tag 'length=' to correct fields like - 'length=123'. - example: "length=" - - name: --strip_suffix - type: string - description: | - Remove this suffix from read names if present. Can be - given multiple times. - - name: --prefix - alternatives: [-x] - type: string - description: | - Add this prefix to read names. Use {name} to insert the - name of the matching adapter. - - name: --suffix - alternatives: [-y] - type: string - description: | - Add this suffix to read names; can also include {name} - - name: --rename - type: string - description: | - Rename reads using TEMPLATE containing variables such as - {id}, {adapter_name} etc. (see documentation) - - name: --zero_cap - alternatives: [-z] - type: boolean_true - description: Change negative quality values to zero. + #################################################################### + - name: Read modifications + arguments: + - name: --cut + alternatives: [-u] + type: integer + multiple: true + description: | + Remove LEN bases from each read (or R1 if paired; use --cut_r2 + option for R2). If LEN is positive, remove bases from the + beginning. If LEN is negative, remove bases from the end. + Can be used twice if LENs have different signs. Applied + *before* adapter trimming. + - name: --cut_r2 + type: integer + multiple: true + description: | + Remove LEN bases from each read (for R2). If LEN is positive, remove bases from the + beginning. If LEN is negative, remove bases from the end. + Can be used twice if LENs have different signs. Applied + *before* adapter trimming. + - name: --nextseq_trim + type: string + description: | + NextSeq-specific quality trimming (each read). Trims also + dark cycles appearing as high-quality G bases. + - name: --quality_cutoff + alternatives: [-q] + type: string + description: | + Trim low-quality bases from 5' and/or 3' ends of each read + before adapter removal. Applied to both reads if data is + paired. If one value is given, only the 3' end is trimmed. + If two comma-separated cutoffs are given, the 5' end is + trimmed with the first cutoff, the 3' end with the second. + - name: --quality_cutoff_r2 + alternatives: [-Q] + type: string + description: | + Quality-trimming cutoff for R2. Default: same as for R1 + - name: --quality_base + type: integer + description: | + Assume that quality values in FASTQ are encoded as + ascii(quality + N). This needs to be set to 64 for some + old Illumina FASTQ files. The default is 33. + example: 33 + - name: --poly_a + type: boolean_true + description: Trim poly-A tails + - name: --length + alternatives: [-l] + type: integer + description: | + Shorten reads to LENGTH. Positive values remove bases at + the end while negative ones remove bases at the beginning. + This and the following modifications are applied after + adapter trimming. + - name: --trim_n + type: boolean_true + description: Trim N's on ends of reads. + - name: --length_tag + type: string + description: | + Search for TAG followed by a decimal number in the + description field of the read. Replace the decimal number + with the correct length of the trimmed read. For example, + use --length-tag 'length=' to correct fields like + 'length=123'. + example: "length=" + - name: --strip_suffix + type: string + description: | + Remove this suffix from read names if present. Can be + given multiple times. + - name: --prefix + alternatives: [-x] + type: string + description: | + Add this prefix to read names. Use {name} to insert the + name of the matching adapter. + - name: --suffix + alternatives: [-y] + type: string + description: | + Add this suffix to read names; can also include {name} + - name: --rename + type: string + description: | + Rename reads using TEMPLATE containing variables such as + {id}, {adapter_name} etc. (see documentation) + - name: --zero_cap + alternatives: [-z] + type: boolean_true + description: Change negative quality values to zero. - #################################################################### - - name: Filtering of processed reads - description: | - Filters are applied after above read modifications. Paired-end reads are - always discarded pairwise (see also --pair_filter). - arguments: - - name: --minimum_length - alternatives: [-m] - type: string - description: | - Discard reads shorter than LEN. Default is 0. - When trimming paired-end reads, the minimum lengths for R1 and R2 can be specified separately by separating them with a colon (:). - If the colon syntax is not used, the same minimum length applies to both reads, as discussed above. - Also, one of the values can be omitted to impose no restrictions. - For example, with -m 17:, the length of R1 must be at least 17, but the length of R2 is ignored. - example: "0" - - name: --maximum_length - alternatives: [-M] - type: string - description: | - Discard reads longer than LEN. Default: no limit. - For paired reads, see the remark for --minimum_length - - name: --max_n - type: string - description: | - Discard reads with more than COUNT 'N' bases. If COUNT is - a number between 0 and 1, it is interpreted as a fraction - of the read length. - - name: --max_expected_errors - alternatives: [--max_ee] - type: long - description: | - Discard reads whose expected number of errors (computed - from quality values) exceeds ERRORS. - - name: --max_average_error_rate - alternatives: [--max_aer] - type: long - description: | - as --max_expected_errors (see above), but divided by - length to account for reads of varying length. - - name: --discard_trimmed - alternatives: [--discard] - type: boolean_true - description: | - Discard reads that contain an adapter. Use also -O to - avoid discarding too many randomly matching reads. - - name: --discard_untrimmed - alternatives: [--trimmed_only] - type: boolean_true - description: | - Discard reads that do not contain an adapter. - - name: --discard_casava - type: boolean_true - description: | - Discard reads that did not pass CASAVA filtering (header - has :Y:). + #################################################################### + - name: Filtering of processed reads + description: | + Filters are applied after above read modifications. Paired-end reads are + always discarded pairwise (see also --pair_filter). + arguments: + - name: --minimum_length + alternatives: [-m] + type: string + description: | + Discard reads shorter than LEN. Default is 0. + When trimming paired-end reads, the minimum lengths for R1 and R2 can be specified separately by separating them with a colon (:). + If the colon syntax is not used, the same minimum length applies to both reads, as discussed above. + Also, one of the values can be omitted to impose no restrictions. + For example, with -m 17:, the length of R1 must be at least 17, but the length of R2 is ignored. + example: "0" + - name: --maximum_length + alternatives: [-M] + type: string + description: | + Discard reads longer than LEN. Default: no limit. + For paired reads, see the remark for --minimum_length + - name: --max_n + type: string + description: | + Discard reads with more than COUNT 'N' bases. If COUNT is + a number between 0 and 1, it is interpreted as a fraction + of the read length. + - name: --max_expected_errors + alternatives: [--max_ee] + type: long + description: | + Discard reads whose expected number of errors (computed + from quality values) exceeds ERRORS. + - name: --max_average_error_rate + alternatives: [--max_aer] + type: long + description: | + as --max_expected_errors (see above), but divided by + length to account for reads of varying length. + - name: --discard_trimmed + alternatives: [--discard] + type: boolean_true + description: | + Discard reads that contain an adapter. Use also -O to + avoid discarding too many randomly matching reads. + - name: --discard_untrimmed + alternatives: [--trimmed_only] + type: boolean_true + description: | + Discard reads that do not contain an adapter. + - name: --discard_casava + type: boolean_true + description: | + Discard reads that did not pass CASAVA filtering (header + has :Y:). - #################################################################### - - name: Output parameters - arguments: - - name: --report - type: string - choices: [full, minimal] - description: | - Which type of report to print: 'full' (default) or 'minimal'. - example: full - - name: --json - type: boolean_true - description: | - Write report in JSON format to report.json in - the output directory. - - name: --output - type: file - description: | - Write trimmed reads to this directory and name the files using {name}. - FASTQ or FASTA format is chosen depending on input. - Summary report is sent to standard output. - default: output - direction: output - required: true - must_exist: true - - name: --fasta - type: boolean_true - description: | - Output FASTA to standard output even on FASTQ input. - - name: --info_file - type: boolean_true - description: | - Write information about each read and its adapter matches - into info.txt in the output directory. - See the documentation for the file format. - # - name: -Z - # - name: --rest_file - # - name: --wildcard-file - # - name: --too_short_output - # - name: --too_long_output - # - name: --untrimmed_output - # - name: --untrimmed_paired_output - # - name: too_short_paired_output - # - name: too_long_paired_output - - name: Debug - arguments: - - type: boolean_true - name: --debug - description: Print debug information - resources: - - type: bash_script - path: script.sh - test_resources: - - type: bash_script - path: test.sh -platforms: + #################################################################### + - name: Output parameters + arguments: + - name: --report + type: string + choices: [full, minimal] + description: | + Which type of report to print: 'full' (default) or 'minimal'. + example: full + - name: --json + type: boolean_true + description: | + Write report in JSON format to this file. + - name: --output_dir + type: file + description: | + Write trimmed reads to this directory and name the files using {name}. + FASTQ or FASTA format is chosen depending on input. + Summary report is sent to standard output. + default: ./ + direction: output + must_exist: true + - name: --output + type: file + description: | + Glob pattern for matching the expected output files. + Should include `$output_dir`. + example: "*.fast[a,q]" + direction: output + required: true + must_exist: true + multiple: true + - name: --fasta + type: boolean_true + description: | + Output FASTA to standard output even on FASTQ input. + - name: --info_file + type: boolean_true + description: | + Write information about each read and its adapter matches + into info.txt in the output directory. + See the documentation for the file format. + # - name: -Z + # - name: --rest_file + # - name: --wildcard-file + # - name: --too_short_output + # - name: --too_long_output + # - name: --untrimmed_output + # - name: --untrimmed_paired_output + # - name: too_short_paired_output + # - name: too_long_paired_output + - name: Debug + arguments: + - type: boolean_true + name: --debug + description: Print debug information +resources: + - type: bash_script + path: script.sh +test_resources: + - type: bash_script + path: test.sh + +engines: - type: docker image: python:3.12 setup: @@ -458,4 +466,6 @@ platforms: - type: docker run: | cutadapt --version | sed 's/\(.*\)/cutadapt: "\1"/' > /var/software_versions.txt +runners: + - type: executable - type: nextflow diff --git a/src/cutadapt/script.sh b/src/cutadapt/script.sh index 1edfb090..5bfede70 100644 --- a/src/cutadapt/script.sh +++ b/src/cutadapt/script.sh @@ -11,11 +11,10 @@ par_info_file='false' par_debug='true' ## VIASH END -# TODO: change this? -if [ -z $par_output ]; then - par_output=. +if [ -z $par_output_dir ]; then + par_output_dir=. else - mkdir -p "$par_output" + mkdir -p "$par_output_dir" fi function debug { @@ -181,9 +180,9 @@ echo ">> Optional arguments" optional_output_args=$(echo \ ${par_report:+--report "${par_report}"} \ - ${par_json:+--json "${par_output}/report.json"} \ + ${par_json:+--json "report.json"} \ ${par_fasta:+--fasta} \ - ${par_info_file:+--info-file "$par_output/info.txt"} \ + ${par_info_file:+--info-file "info.txt"} \ ) debug "Arguments to cutadapt:" @@ -203,12 +202,12 @@ fi if [ $mode = "se" ]; then output_args=$(echo \ - --output "$par_output/{name}_001.$ext" \ + --output "$par_output_dir/{name}_001.$ext" \ ) else output_args=$(echo \ - --output "$par_output/{name}_R1_001.$ext" \ - --paired-output "$par_output/{name}_R2_001.$ext" \ + --output "$par_output_dir/{name}_R1_001.$ext" \ + --paired-output "$par_output_dir/{name}_R2_001.$ext" \ ) fi @@ -239,4 +238,4 @@ debug ">> Full CLI to be run:" debug cutadapt $cli | sed -e 's/--/\r\n --/g' debug -cutadapt $cli | tee $par_output/report.txt +cutadapt $cli diff --git a/src/cutadapt/test.sh b/src/cutadapt/test.sh index d36e6798..7109660f 100644 --- a/src/cutadapt/test.sh +++ b/src/cutadapt/test.sh @@ -23,6 +23,52 @@ assert_file_not_contains() { grep -q "$2" "$1" && (echo "File '$1' contains '$2' but shouldn't" && exit 1) } +############################################# +mkdir test_multiple_output +cd test_multiple_output + +echo "#############################################" +echo "> Run cutadapt with multiple outputs" + +cat > example.fa <<'EOF' +>read1 +MYSEQUENCEADAPTER +>read2 +MYSEQUENCEADAP +>read3 +MYSEQUENCEADAPTERSOMETHINGELSE +>read4 +MYSEQUENCEADABTER +>read5 +MYSEQUENCEADAPTR +>read6 +MYSEQUENCEADAPPTER +>read7 +ADAPTERMYSEQUENCE +>read8 +PTERMYSEQUENCE +>read9 +SOMETHINGADAPTERMYSEQUENCE +EOF + +"$meta_executable" \ + --report minimal \ + --output_dir out_test \ + --output "out_test/*.fasta" \ + --adapter ADAPTER \ + --input example.fa \ + --fasta \ + --no_match_adapter_wildcards \ + --json + +echo ">> Checking output" +assert_file_exists "report.json" +assert_file_exists "out_test/1_001.fasta" +assert_file_exists "out_test/unknown_001.fasta" + +cd .. +echo + ############################################# mkdir test_simple_single_end cd test_simple_single_end @@ -53,7 +99,8 @@ EOF "$meta_executable" \ --report minimal \ - --output out_test1 \ + --output_dir out_test1 \ + --output "out_test1/*.fasta" \ --adapter ADAPTER \ --input example.fa \ --fasta \ @@ -61,14 +108,12 @@ EOF --json echo ">> Checking output" -assert_file_exists "out_test1/report.txt" -assert_file_exists "out_test1/report.json" +assert_file_exists "report.json" assert_file_exists "out_test1/1_001.fasta" assert_file_exists "out_test1/unknown_001.fasta" echo ">> Check if output is empty" -assert_file_not_empty "out_test1/report.txt" -assert_file_not_empty "out_test1/report.json" +assert_file_not_empty "report.json" assert_file_not_empty "out_test1/1_001.fasta" assert_file_not_empty "out_test1/unknown_001.fasta" @@ -113,7 +158,8 @@ EOF "$meta_executable" \ --report minimal \ - --output out_test2 \ + --output_dir out_test2 \ + --output "out_test2/*.fasta" \ --adapter AAAAA \ --adapter_fasta adapters1.fasta \ --adapter_fasta adapters2.fasta \ @@ -122,16 +168,14 @@ EOF --json echo ">> Checking output" -assert_file_exists "out_test2/report.txt" -assert_file_exists "out_test2/report.json" +assert_file_exists "report.json" assert_file_exists "out_test2/1_001.fasta" assert_file_exists "out_test2/adapter1_001.fasta" assert_file_exists "out_test2/adapter2_001.fasta" assert_file_exists "out_test2/unknown_001.fasta" echo ">> Check if output is empty" -assert_file_not_empty "out_test2/report.txt" -assert_file_not_empty "out_test2/report.json" +assert_file_not_empty "report.json" assert_file_not_empty "out_test2/1_001.fasta" assert_file_not_empty "out_test2/adapter1_001.fasta" assert_file_not_empty "out_test2/adapter2_001.fasta" @@ -177,7 +221,8 @@ EOF "$meta_executable" \ --report minimal \ - --output out_test3 \ + --output_dir out_test3 \ + --output "out_test3/*.fastq" \ --adapter AAAAA \ --adapter_r2 GGGGG \ --input example_R1.fastq \ @@ -187,16 +232,14 @@ EOF ---cpus 1 echo ">> Checking output" -assert_file_exists "out_test3/report.txt" -assert_file_exists "out_test3/report.json" +assert_file_exists "report.json" assert_file_exists "out_test3/1_R1_001.fastq" assert_file_exists "out_test3/1_R2_001.fastq" assert_file_exists "out_test3/unknown_R1_001.fastq" assert_file_exists "out_test3/unknown_R2_001.fastq" echo ">> Check if output is empty" -assert_file_not_empty "out_test3/report.txt" -assert_file_not_empty "out_test3/report.json" +assert_file_not_empty "report.json" assert_file_not_empty "out_test3/1_R1_001.fastq" assert_file_not_empty "out_test3/1_R2_001.fastq" assert_file_not_empty "out_test3/unknown_R1_001.fastq" From 58a9ec7bf458f5f62b1581fa789166b2cd26ec67 Mon Sep 17 00:00:00 2001 From: Toni Verbeiren Date: Thu, 6 Jun 2024 17:17:28 +0200 Subject: [PATCH 29/32] Avoid the need for both output_dir and output --- src/cutadapt/config.vsh.yaml | 11 +---------- src/cutadapt/script.sh | 16 ++++++---------- src/cutadapt/test.sh | 4 ---- 3 files changed, 7 insertions(+), 24 deletions(-) diff --git a/src/cutadapt/config.vsh.yaml b/src/cutadapt/config.vsh.yaml index b7ca2997..b1a48950 100644 --- a/src/cutadapt/config.vsh.yaml +++ b/src/cutadapt/config.vsh.yaml @@ -406,21 +406,12 @@ argument_groups: type: boolean_true description: | Write report in JSON format to this file. - - name: --output_dir - type: file - description: | - Write trimmed reads to this directory and name the files using {name}. - FASTQ or FASTA format is chosen depending on input. - Summary report is sent to standard output. - default: ./ - direction: output - must_exist: true - name: --output type: file description: | Glob pattern for matching the expected output files. Should include `$output_dir`. - example: "*.fast[a,q]" + example: "fastq/*_001.fast[a,q]" direction: output required: true must_exist: true diff --git a/src/cutadapt/script.sh b/src/cutadapt/script.sh index 5bfede70..5e1f9e30 100644 --- a/src/cutadapt/script.sh +++ b/src/cutadapt/script.sh @@ -5,22 +5,18 @@ par_adapter='AGATCGGAAGAGCACACGTCTGAACTCCAGTCAC;GGATCGGAAGAGCACACGTCTGAACTCCAGTC par_input='src/cutadapt/test_data/se/a.fastq' par_report='full' par_json='false' -par_output='output' par_fasta='false' par_info_file='false' par_debug='true' ## VIASH END -if [ -z $par_output_dir ]; then - par_output_dir=. -else - mkdir -p "$par_output_dir" -fi - function debug { [[ "$par_debug" == "true" ]] && echo "DEBUG: $@" } +output_dir=$(dirname $par_output) +[[ ! -d $output_dir ]] && mkdir -p $output_dir + # Init ########################################################### @@ -202,12 +198,12 @@ fi if [ $mode = "se" ]; then output_args=$(echo \ - --output "$par_output_dir/{name}_001.$ext" \ + --output "$output_dir/{name}_001.$ext" \ ) else output_args=$(echo \ - --output "$par_output_dir/{name}_R1_001.$ext" \ - --paired-output "$par_output_dir/{name}_R2_001.$ext" \ + --output "$output_dir/{name}_R1_001.$ext" \ + --paired-output "$output_dir/{name}_R2_001.$ext" \ ) fi diff --git a/src/cutadapt/test.sh b/src/cutadapt/test.sh index 7109660f..eff997d7 100644 --- a/src/cutadapt/test.sh +++ b/src/cutadapt/test.sh @@ -53,7 +53,6 @@ EOF "$meta_executable" \ --report minimal \ - --output_dir out_test \ --output "out_test/*.fasta" \ --adapter ADAPTER \ --input example.fa \ @@ -99,7 +98,6 @@ EOF "$meta_executable" \ --report minimal \ - --output_dir out_test1 \ --output "out_test1/*.fasta" \ --adapter ADAPTER \ --input example.fa \ @@ -158,7 +156,6 @@ EOF "$meta_executable" \ --report minimal \ - --output_dir out_test2 \ --output "out_test2/*.fasta" \ --adapter AAAAA \ --adapter_fasta adapters1.fasta \ @@ -221,7 +218,6 @@ EOF "$meta_executable" \ --report minimal \ - --output_dir out_test3 \ --output "out_test3/*.fastq" \ --adapter AAAAA \ --adapter_r2 GGGGG \ From d519a141f7ede17a9381c4a69f5dc751316bd63d Mon Sep 17 00:00:00 2001 From: Toni Verbeiren Date: Fri, 7 Jun 2024 15:49:07 +0200 Subject: [PATCH 30/32] Move fields from `info` to `links` Co-authored-by: Robrecht Cannoodt --- src/cutadapt/config.vsh.yaml | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/src/cutadapt/config.vsh.yaml b/src/cutadapt/config.vsh.yaml index b1a48950..88c902a5 100644 --- a/src/cutadapt/config.vsh.yaml +++ b/src/cutadapt/config.vsh.yaml @@ -1,13 +1,13 @@ name: cutadapt description: | Cutadapt removes adapter sequences from high-throughput sequencing reads. -info: - keywords: [RNA-seq, scRNA-seq, high-throughput] - homepage: https://cutadapt.readthedocs.io/ - documentation: https://cutadapt.readthedocs.io/ +keywords: [RNA-seq, scRNA-seq, high-throughput] +links: + homepage: https://cutadapt.readthedocs.io + documentation: https://cutadapt.readthedocs.io repository: https://github.com/marcelm/cutadapt reference: http://dx.doi.org/10.14806/ej.17.1.200 - license: MIT +license: MIT argument_groups: #################################################################### - name: Specify Adapters for R1 From 200aa6596fdaf00d08249ad9077eee9a28a2c0ed Mon Sep 17 00:00:00 2001 From: Toni Verbeiren Date: Fri, 7 Jun 2024 16:03:29 +0200 Subject: [PATCH 31/32] Move references back to the info field --- src/cutadapt/config.vsh.yaml | 1 + 1 file changed, 1 insertion(+) diff --git a/src/cutadapt/config.vsh.yaml b/src/cutadapt/config.vsh.yaml index 88c902a5..f50400dd 100644 --- a/src/cutadapt/config.vsh.yaml +++ b/src/cutadapt/config.vsh.yaml @@ -6,6 +6,7 @@ links: homepage: https://cutadapt.readthedocs.io documentation: https://cutadapt.readthedocs.io repository: https://github.com/marcelm/cutadapt +info: reference: http://dx.doi.org/10.14806/ej.17.1.200 license: MIT argument_groups: From 417a559f12cae9758614d962c9abca53d0ee991a Mon Sep 17 00:00:00 2001 From: Robrecht Cannoodt Date: Fri, 21 Jun 2024 16:50:13 +0200 Subject: [PATCH 32/32] apologies, I proposed a wrong syntax --- src/cutadapt/config.vsh.yaml | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/cutadapt/config.vsh.yaml b/src/cutadapt/config.vsh.yaml index f50400dd..a62f0aa9 100644 --- a/src/cutadapt/config.vsh.yaml +++ b/src/cutadapt/config.vsh.yaml @@ -6,8 +6,8 @@ links: homepage: https://cutadapt.readthedocs.io documentation: https://cutadapt.readthedocs.io repository: https://github.com/marcelm/cutadapt -info: - reference: http://dx.doi.org/10.14806/ej.17.1.200 +references: + doi: 10.14806/ej.17.1.200 license: MIT argument_groups: ####################################################################