diff --git a/src/cutadapt/config.vsh.yaml b/src/cutadapt/config.vsh.yaml new file mode 100644 index 00000000..a62f0aa9 --- /dev/null +++ b/src/cutadapt/config.vsh.yaml @@ -0,0 +1,463 @@ +name: cutadapt +description: | + Cutadapt removes adapter sequences from high-throughput sequencing reads. +keywords: [RNA-seq, scRNA-seq, high-throughput] +links: + homepage: https://cutadapt.readthedocs.io + documentation: https://cutadapt.readthedocs.io + repository: https://github.com/marcelm/cutadapt +references: + doi: 10.14806/ej.17.1.200 +license: MIT +argument_groups: + #################################################################### + - name: Specify Adapters for R1 + arguments: + - name: --adapter + alternatives: [-a] + type: string + multiple: true + description: | + Sequence of an adapter ligated to the 3' end (paired data: + of the first read). The adapter and subsequent bases are + trimmed. If a '$' character is appended ('anchoring'), the + adapter is only found if it is a suffix of the read. + required: false + - name: --front + alternatives: [-g] + type: string + multiple: true + description: | + Sequence of an adapter ligated to the 5' end (paired data: + of the first read). The adapter and any preceding bases + are trimmed. Partial matches at the 5' end are allowed. If + a '^' character is prepended ('anchoring'), the adapter is + only found if it is a prefix of the read. + required: false + - name: --anywhere + alternatives: [-b] + type: string + multiple: true + description: | + Sequence of an adapter that may be ligated to the 5' or 3' + end (paired data: of the first read). Both types of + matches as described under -a and -g are allowed. If the + first base of the read is part of the match, the behavior + is as with -g, otherwise as with -a. This option is mostly + for rescuing failed library preparations - do not use if + you know which end your adapter was ligated to! + required: false + + #################################################################### + - name: Specify Adapters using Fasta files for R1 + arguments: + - name: --adapter_fasta + type: file + multiple: true + description: | + Fasta file containing sequences of an adapter ligated to the 3' end (paired data: + of the first read). The adapter and subsequent bases are + trimmed. If a '$' character is appended ('anchoring'), the + adapter is only found if it is a suffix of the read. + required: false + - name: --front_fasta + type: file + description: | + Fasta file containing sequences of an adapter ligated to the 5' end (paired data: + of the first read). The adapter and any preceding bases + are trimmed. Partial matches at the 5' end are allowed. If + a '^' character is prepended ('anchoring'), the adapter is + only found if it is a prefix of the read. + required: false + - name: --anywhere_fasta + type: file + description: | + Fasta file containing sequences of an adapter that may be ligated to the 5' or 3' + end (paired data: of the first read). Both types of + matches as described under -a and -g are allowed. If the + first base of the read is part of the match, the behavior + is as with -g, otherwise as with -a. This option is mostly + for rescuing failed library preparations - do not use if + you know which end your adapter was ligated to! + required: false + + #################################################################### + - name: Specify Adapters for R2 + arguments: + - name: --adapter_r2 + alternatives: [-A] + type: string + multiple: true + description: | + Sequence of an adapter ligated to the 3' end (paired data: + of the first read). The adapter and subsequent bases are + trimmed. If a '$' character is appended ('anchoring'), the + adapter is only found if it is a suffix of the read. + required: false + - name: --front_r2 + alternatives: [-G] + type: string + multiple: true + description: | + Sequence of an adapter ligated to the 5' end (paired data: + of the first read). The adapter and any preceding bases + are trimmed. Partial matches at the 5' end are allowed. If + a '^' character is prepended ('anchoring'), the adapter is + only found if it is a prefix of the read. + required: false + - name: --anywhere_r2 + alternatives: [-B] + type: string + multiple: true + description: | + Sequence of an adapter that may be ligated to the 5' or 3' + end (paired data: of the first read). Both types of + matches as described under -a and -g are allowed. If the + first base of the read is part of the match, the behavior + is as with -g, otherwise as with -a. This option is mostly + for rescuing failed library preparations - do not use if + you know which end your adapter was ligated to! + required: false + + #################################################################### + - name: Specify Adapters using Fasta files for R2 + arguments: + - name: --adapter_r2_fasta + type: file + description: | + Fasta file containing sequences of an adapter ligated to the 3' end (paired data: + of the first read). The adapter and subsequent bases are + trimmed. If a '$' character is appended ('anchoring'), the + adapter is only found if it is a suffix of the read. + required: false + - name: --front_r2_fasta + type: file + description: | + Fasta file containing sequences of an adapter ligated to the 5' end (paired data: + of the first read). The adapter and any preceding bases + are trimmed. Partial matches at the 5' end are allowed. If + a '^' character is prepended ('anchoring'), the adapter is + only found if it is a prefix of the read. + required: false + - name: --anywhere_r2_fasta + type: file + description: | + Fasta file containing sequences of an adapter that may be ligated to the 5' or 3' + end (paired data: of the first read). Both types of + matches as described under -a and -g are allowed. If the + first base of the read is part of the match, the behavior + is as with -g, otherwise as with -a. This option is mostly + for rescuing failed library preparations - do not use if + you know which end your adapter was ligated to! + required: false + + #################################################################### + - name: Paired-end options + arguments: + - name: --pair_adapters + type: boolean_true + description: | + Treat adapters given with -a/-A etc. as pairs. Either both + or none are removed from each read pair. + - name: --pair_filter + type: string + choices: [any, both, first] + description: | + Which of the reads in a paired-end read have to match the + filtering criterion in order for the pair to be filtered. + - name: --interleaved + type: boolean_true + description: | + Read and/or write interleaved paired-end reads. + + #################################################################### + - name: Input parameters + arguments: + - name: --input + type: file + required: true + description: | + Input fastq file for single-end reads or R1 for paired-end reads. + - name: --input_r2 + type: file + required: false + description: | + Input fastq file for R2 in the case of paired-end reads. + - name: --error_rate + alternatives: [-E, --errors] + type: double + description: | + Maximum allowed error rate (if 0 <= E < 1), or absolute + number of errors for full-length adapter match (if E is an + integer >= 1). Error rate = no. of errors divided by + length of matching region. Default: 0.1 (10%). + example: 0.1 + - name: --no_indels + type: boolean_false + description: | + Allow only mismatches in alignments. + + - name: --times + type: integer + alternatives: [-n] + description: | + Remove up to COUNT adapters from each read. Default: 1. + example: 1 + - name: --overlap + alternatives: [-O] + type: integer + description: | + Require MINLENGTH overlap between read and adapter for an + adapter to be found. The default is 3. + example: 3 + - name: --match_read_wildcards + type: boolean_true + description: | + Interpret IUPAC wildcards in reads. + - name: --no_match_adapter_wildcards + type: boolean_false + description: | + Do not interpret IUPAC wildcards in adapters. + - name: --action + type: string + choices: + - trim + - retain + - mask + - lowercase + - none + description: | + What to do if a match was found. trim: trim adapter and + up- or downstream sequence; retain: trim, but retain + adapter; mask: replace with 'N' characters; lowercase: + convert to lowercase; none: leave unchanged. + The default is trim. + example: trim + - name: --revcomp + alternatives: [--rc] + type: boolean_true + description: | + Check both the read and its reverse complement for adapter + matches. If match is on reverse-complemented version, + output that one. + + #################################################################### + - name: Read modifications + arguments: + - name: --cut + alternatives: [-u] + type: integer + multiple: true + description: | + Remove LEN bases from each read (or R1 if paired; use --cut_r2 + option for R2). If LEN is positive, remove bases from the + beginning. If LEN is negative, remove bases from the end. + Can be used twice if LENs have different signs. Applied + *before* adapter trimming. + - name: --cut_r2 + type: integer + multiple: true + description: | + Remove LEN bases from each read (for R2). If LEN is positive, remove bases from the + beginning. If LEN is negative, remove bases from the end. + Can be used twice if LENs have different signs. Applied + *before* adapter trimming. + - name: --nextseq_trim + type: string + description: | + NextSeq-specific quality trimming (each read). Trims also + dark cycles appearing as high-quality G bases. + - name: --quality_cutoff + alternatives: [-q] + type: string + description: | + Trim low-quality bases from 5' and/or 3' ends of each read + before adapter removal. Applied to both reads if data is + paired. If one value is given, only the 3' end is trimmed. + If two comma-separated cutoffs are given, the 5' end is + trimmed with the first cutoff, the 3' end with the second. + - name: --quality_cutoff_r2 + alternatives: [-Q] + type: string + description: | + Quality-trimming cutoff for R2. Default: same as for R1 + - name: --quality_base + type: integer + description: | + Assume that quality values in FASTQ are encoded as + ascii(quality + N). This needs to be set to 64 for some + old Illumina FASTQ files. The default is 33. + example: 33 + - name: --poly_a + type: boolean_true + description: Trim poly-A tails + - name: --length + alternatives: [-l] + type: integer + description: | + Shorten reads to LENGTH. Positive values remove bases at + the end while negative ones remove bases at the beginning. + This and the following modifications are applied after + adapter trimming. + - name: --trim_n + type: boolean_true + description: Trim N's on ends of reads. + - name: --length_tag + type: string + description: | + Search for TAG followed by a decimal number in the + description field of the read. Replace the decimal number + with the correct length of the trimmed read. For example, + use --length-tag 'length=' to correct fields like + 'length=123'. + example: "length=" + - name: --strip_suffix + type: string + description: | + Remove this suffix from read names if present. Can be + given multiple times. + - name: --prefix + alternatives: [-x] + type: string + description: | + Add this prefix to read names. Use {name} to insert the + name of the matching adapter. + - name: --suffix + alternatives: [-y] + type: string + description: | + Add this suffix to read names; can also include {name} + - name: --rename + type: string + description: | + Rename reads using TEMPLATE containing variables such as + {id}, {adapter_name} etc. (see documentation) + - name: --zero_cap + alternatives: [-z] + type: boolean_true + description: Change negative quality values to zero. + + #################################################################### + - name: Filtering of processed reads + description: | + Filters are applied after above read modifications. Paired-end reads are + always discarded pairwise (see also --pair_filter). + arguments: + - name: --minimum_length + alternatives: [-m] + type: string + description: | + Discard reads shorter than LEN. Default is 0. + When trimming paired-end reads, the minimum lengths for R1 and R2 can be specified separately by separating them with a colon (:). + If the colon syntax is not used, the same minimum length applies to both reads, as discussed above. + Also, one of the values can be omitted to impose no restrictions. + For example, with -m 17:, the length of R1 must be at least 17, but the length of R2 is ignored. + example: "0" + - name: --maximum_length + alternatives: [-M] + type: string + description: | + Discard reads longer than LEN. Default: no limit. + For paired reads, see the remark for --minimum_length + - name: --max_n + type: string + description: | + Discard reads with more than COUNT 'N' bases. If COUNT is + a number between 0 and 1, it is interpreted as a fraction + of the read length. + - name: --max_expected_errors + alternatives: [--max_ee] + type: long + description: | + Discard reads whose expected number of errors (computed + from quality values) exceeds ERRORS. + - name: --max_average_error_rate + alternatives: [--max_aer] + type: long + description: | + as --max_expected_errors (see above), but divided by + length to account for reads of varying length. + - name: --discard_trimmed + alternatives: [--discard] + type: boolean_true + description: | + Discard reads that contain an adapter. Use also -O to + avoid discarding too many randomly matching reads. + - name: --discard_untrimmed + alternatives: [--trimmed_only] + type: boolean_true + description: | + Discard reads that do not contain an adapter. + - name: --discard_casava + type: boolean_true + description: | + Discard reads that did not pass CASAVA filtering (header + has :Y:). + + #################################################################### + - name: Output parameters + arguments: + - name: --report + type: string + choices: [full, minimal] + description: | + Which type of report to print: 'full' (default) or 'minimal'. + example: full + - name: --json + type: boolean_true + description: | + Write report in JSON format to this file. + - name: --output + type: file + description: | + Glob pattern for matching the expected output files. + Should include `$output_dir`. + example: "fastq/*_001.fast[a,q]" + direction: output + required: true + must_exist: true + multiple: true + - name: --fasta + type: boolean_true + description: | + Output FASTA to standard output even on FASTQ input. + - name: --info_file + type: boolean_true + description: | + Write information about each read and its adapter matches + into info.txt in the output directory. + See the documentation for the file format. + # - name: -Z + # - name: --rest_file + # - name: --wildcard-file + # - name: --too_short_output + # - name: --too_long_output + # - name: --untrimmed_output + # - name: --untrimmed_paired_output + # - name: too_short_paired_output + # - name: too_long_paired_output + - name: Debug + arguments: + - type: boolean_true + name: --debug + description: Print debug information +resources: + - type: bash_script + path: script.sh +test_resources: + - type: bash_script + path: test.sh + +engines: + - type: docker + image: python:3.12 + setup: + - type: python + pip: + - cutadapt + - type: docker + run: | + cutadapt --version | sed 's/\(.*\)/cutadapt: "\1"/' > /var/software_versions.txt +runners: + - type: executable + - type: nextflow diff --git a/src/cutadapt/help.txt b/src/cutadapt/help.txt new file mode 100644 index 00000000..2280c3e2 --- /dev/null +++ b/src/cutadapt/help.txt @@ -0,0 +1,218 @@ +cutadapt version 4.6 + +Copyright (C) 2010 Marcel Martin and contributors + +Cutadapt removes adapter sequences from high-throughput sequencing reads. + +Usage: + cutadapt -a ADAPTER [options] [-o output.fastq] input.fastq + +For paired-end reads: + cutadapt -a ADAPT1 -A ADAPT2 [options] -o out1.fastq -p out2.fastq in1.fastq in2.fastq + +Replace "ADAPTER" with the actual sequence of your 3' adapter. IUPAC wildcard +characters are supported. All reads from input.fastq will be written to +output.fastq with the adapter sequence removed. Adapter matching is +error-tolerant. Multiple adapter sequences can be given (use further -a +options), but only the best-matching adapter will be removed. + +Input may also be in FASTA format. Compressed input and output is supported and +auto-detected from the file name (.gz, .xz, .bz2). Use the file name '-' for +standard input/output. Without the -o option, output is sent to standard output. + +Citation: + +Marcel Martin. Cutadapt removes adapter sequences from high-throughput +sequencing reads. EMBnet.Journal, 17(1):10-12, May 2011. +http://dx.doi.org/10.14806/ej.17.1.200 + +Run "cutadapt --help" to see all command-line options. +See https://cutadapt.readthedocs.io/ for full documentation. + +Options: + -h, --help Show this help message and exit + --version Show version number and exit + --debug Print debug log. Use twice to also print DP matrices + -j CORES, --cores CORES + Number of CPU cores to use. Use 0 to auto-detect. Default: + 1 + +Finding adapters: + Parameters -a, -g, -b specify adapters to be removed from each read (or from + R1 if data is paired-end. If specified multiple times, only the best matching + adapter is trimmed (but see the --times option). Use notation 'file:FILE' to + read adapter sequences from a FASTA file. + + -a ADAPTER, --adapter ADAPTER + Sequence of an adapter ligated to the 3' end (paired data: + of the first read). The adapter and subsequent bases are + trimmed. If a '$' character is appended ('anchoring'), the + adapter is only found if it is a suffix of the read. + -g ADAPTER, --front ADAPTER + Sequence of an adapter ligated to the 5' end (paired data: + of the first read). The adapter and any preceding bases + are trimmed. Partial matches at the 5' end are allowed. If + a '^' character is prepended ('anchoring'), the adapter is + only found if it is a prefix of the read. + -b ADAPTER, --anywhere ADAPTER + Sequence of an adapter that may be ligated to the 5' or 3' + end (paired data: of the first read). Both types of + matches as described under -a and -g are allowed. If the + first base of the read is part of the match, the behavior + is as with -g, otherwise as with -a. This option is mostly + for rescuing failed library preparations - do not use if + you know which end your adapter was ligated to! + -e E, --error-rate E, --errors E + Maximum allowed error rate (if 0 <= E < 1), or absolute + number of errors for full-length adapter match (if E is an + integer >= 1). Error rate = no. of errors divided by + length of matching region. Default: 0.1 (10%) + --no-indels Allow only mismatches in alignments. Default: allow both + mismatches and indels + -n COUNT, --times COUNT + Remove up to COUNT adapters from each read. Default: 1 + -O MINLENGTH, --overlap MINLENGTH + Require MINLENGTH overlap between read and adapter for an + adapter to be found. Default: 3 + --match-read-wildcards + Interpret IUPAC wildcards in reads. Default: False + -N, --no-match-adapter-wildcards + Do not interpret IUPAC wildcards in adapters. + --action {trim,retain,mask,lowercase,none} + What to do if a match was found. trim: trim adapter and + up- or downstream sequence; retain: trim, but retain + adapter; mask: replace with 'N' characters; lowercase: + convert to lowercase; none: leave unchanged. Default: trim + --rc, --revcomp Check both the read and its reverse complement for adapter + matches. If match is on reverse-complemented version, + output that one. Default: check only read + +Additional read modifications: + -u LEN, --cut LEN Remove LEN bases from each read (or R1 if paired; use -U + option for R2). If LEN is positive, remove bases from the + beginning. If LEN is negative, remove bases from the end. + Can be used twice if LENs have different signs. Applied + *before* adapter trimming. + --nextseq-trim 3'CUTOFF + NextSeq-specific quality trimming (each read). Trims also + dark cycles appearing as high-quality G bases. + -q [5'CUTOFF,]3'CUTOFF, --quality-cutoff [5'CUTOFF,]3'CUTOFF + Trim low-quality bases from 5' and/or 3' ends of each read + before adapter removal. Applied to both reads if data is + paired. If one value is given, only the 3' end is trimmed. + If two comma-separated cutoffs are given, the 5' end is + trimmed with the first cutoff, the 3' end with the second. + --quality-base N Assume that quality values in FASTQ are encoded as + ascii(quality + N). This needs to be set to 64 for some + old Illumina FASTQ files. Default: 33 + --poly-a Trim poly-A tails + --length LENGTH, -l LENGTH + Shorten reads to LENGTH. Positive values remove bases at + the end while negative ones remove bases at the beginning. + This and the following modifications are applied after + adapter trimming. + --trim-n Trim N's on ends of reads. + --length-tag TAG Search for TAG followed by a decimal number in the + description field of the read. Replace the decimal number + with the correct length of the trimmed read. For example, + use --length-tag 'length=' to correct fields like + 'length=123'. + --strip-suffix STRIP_SUFFIX + Remove this suffix from read names if present. Can be + given multiple times. + -x PREFIX, --prefix PREFIX + Add this prefix to read names. Use {name} to insert the + name of the matching adapter. + -y SUFFIX, --suffix SUFFIX + Add this suffix to read names; can also include {name} + --rename TEMPLATE Rename reads using TEMPLATE containing variables such as + {id}, {adapter_name} etc. (see documentation) + --zero-cap, -z Change negative quality values to zero. + +Filtering of processed reads: + Filters are applied after above read modifications. Paired-end reads are + always discarded pairwise (see also --pair-filter). + + -m LEN[:LEN2], --minimum-length LEN[:LEN2] + Discard reads shorter than LEN. Default: 0 + -M LEN[:LEN2], --maximum-length LEN[:LEN2] + Discard reads longer than LEN. Default: no limit + --max-n COUNT Discard reads with more than COUNT 'N' bases. If COUNT is + a number between 0 and 1, it is interpreted as a fraction + of the read length. + --max-expected-errors ERRORS, --max-ee ERRORS + Discard reads whose expected number of errors (computed + from quality values) exceeds ERRORS. + --max-average-error-rate ERROR_RATE, --max-aer ERROR_RATE + as --max-expected-errors (see above), but divided by + length to account for reads of varying length. + --discard-trimmed, --discard + Discard reads that contain an adapter. Use also -O to + avoid discarding too many randomly matching reads. + --discard-untrimmed, --trimmed-only + Discard reads that do not contain an adapter. + --discard-casava Discard reads that did not pass CASAVA filtering (header + has :Y:). + +Output: + --quiet Print only error messages. + --report {full,minimal} + Which type of report to print: 'full' or 'minimal'. + Default: full + --json FILE Dump report in JSON format to FILE + -o FILE, --output FILE + Write trimmed reads to FILE. FASTQ or FASTA format is + chosen depending on input. Summary report is sent to + standard output. Use '{name}' for demultiplexing (see + docs). Default: write to standard output + --fasta Output FASTA to standard output even on FASTQ input. + -Z Use compression level 1 for gzipped output files (faster, + but uses more space) + --info-file FILE Write information about each read and its adapter matches + into FILE. See the documentation for the file format. + -r FILE, --rest-file FILE + When the adapter matches in the middle of a read, write + the rest (after the adapter) to FILE. + --wildcard-file FILE When the adapter has N wildcard bases, write adapter bases + matching wildcard positions to FILE. (Inaccurate with + indels.) + --too-short-output FILE + Write reads that are too short (according to length + specified by -m) to FILE. Default: discard reads + --too-long-output FILE + Write reads that are too long (according to length + specified by -M) to FILE. Default: discard reads + --untrimmed-output FILE + Write reads that do not contain any adapter to FILE. + Default: output to same file as trimmed reads + +Paired-end options: + The -A/-G/-B/-U/-Q options work like their lowercase counterparts, but are + applied to R2 (second read in pair) + + -A ADAPTER 3' adapter to be removed from R2 + -G ADAPTER 5' adapter to be removed from R2 + -B ADAPTER 5'/3 adapter to be removed from R2 + -U LENGTH Remove LENGTH bases from R2 + -Q [5'CUTOFF,]3'CUTOFF + Quality-trimming cutoff for R2. Default: same as for R1 + -p FILE, --paired-output FILE + Write R2 to FILE. + --pair-adapters Treat adapters given with -a/-A etc. as pairs. Either both + or none are removed from each read pair. + --pair-filter {any,both,first} + Which of the reads in a paired-end read have to match the + filtering criterion in order for the pair to be filtered. + Default: any + --interleaved Read and/or write interleaved paired-end reads. + --untrimmed-paired-output FILE + Write second read in a pair to this FILE when no adapter + was found. Use with --untrimmed-output. Default: output to + same file as trimmed reads + --too-short-paired-output FILE + Write second read in a pair to this file if pair is too + short. + --too-long-paired-output FILE + Write second read in a pair to this file if pair is too + long. + diff --git a/src/cutadapt/script.sh b/src/cutadapt/script.sh new file mode 100644 index 00000000..5e1f9e30 --- /dev/null +++ b/src/cutadapt/script.sh @@ -0,0 +1,237 @@ +#!/bin/bash + +## VIASH START +par_adapter='AGATCGGAAGAGCACACGTCTGAACTCCAGTCAC;GGATCGGAAGAGCACACGTCTGAACTCCAGTCAC' +par_input='src/cutadapt/test_data/se/a.fastq' +par_report='full' +par_json='false' +par_fasta='false' +par_info_file='false' +par_debug='true' +## VIASH END + +function debug { + [[ "$par_debug" == "true" ]] && echo "DEBUG: $@" +} + +output_dir=$(dirname $par_output) +[[ ! -d $output_dir ]] && mkdir -p $output_dir + +# Init +########################################################### + +echo ">> Paired-end data or not?" + +mode="" +if [[ -z $par_input_r2 ]]; then + mode="se" + echo " Single end" + input="$par_input" +else + echo " Paired end" + mode="pe" + input="$par_input $par_input_r2" +fi + +# Adapter arguments +# - paired and single-end +# - string and fasta +########################################################### + +function add_flags { + local arg=$1 + local flag=$2 + local prefix=$3 + [[ -z $prefix ]] && prefix="" + + # This function should not be called if the input is empty + # but check for it just in case + if [[ -z $arg ]]; then + return + fi + + local output="" + IFS=';' read -r -a array <<< "$arg" + for a in "${array[@]}"; do + output="$output $flag $prefix$a" + done + echo $output +} + +debug ">> Parsing arguments dealing with adapters" +adapter_args=$(echo \ + ${par_adapter:+$(add_flags "$par_adapter" "--adapter")} \ + ${par_adapter_fasta:+$(add_flags "$par_adapter_fasta" "--adapter" "file:")} \ + ${par_front:+$(add_flags "$par_front" "--front")} \ + ${par_front_fasta:+$(add_flags "$par_front_fasta" "--front" "file:")} \ + ${par_anywhere:+$(add_flags "$par_anywhere" "--anywhere")} \ + ${par_anywhere_fasta:+$(add_flags "$par_anywhere_fasta" "--anywhere" "file:")} \ + ${par_adapter_r2:+$(add_flags "$par_adapter_r2" "-A")} \ + ${par_adapter_fasta_r2:+$(add_flags "$par_adapter_fasta_r2" "-A" "file:")} \ + ${par_front_r2:+$(add_flags "$par_front_r2" "-G")} \ + ${par_front_fasta_r2:+$(add_flags "$par_front_fasta_r2" "-G" "file:")} \ + ${par_anywhere_r2:+$(add_flags "$par_anywhere_r2" "-B")} \ + ${par_anywhere_fasta_r2:+$(add_flags "$par_anywhere_fasta_r2" "-B" "file:")} \ +) + +debug "Arguments to cutadapt:" +debug "$adapter_args" +debug + +# Paired-end options +########################################################### +echo ">> Parsing arguments for paired-end reads" +[[ "$par_pair_adapters" == "false" ]] && unset par_pair_adapters +[[ "$par_interleaved" == "false" ]] && unset par_interleaved + +paired_args=$(echo \ + ${par_pair_adapters:+--pair-adapters} \ + ${par_pair_filter:+--pair-filter "${par_pair_filter}"} \ + ${par_interleaved:+--interleaved} +) +debug "Arguments to cutadapt:" +debug $paired_args +debug + +# Input arguments +########################################################### +echo ">> Parsing input arguments" +[[ "$par_no_indels" == "true" ]] && unset par_no_indels +[[ "$par_match_read_wildcards" == "false" ]] && unset par_match_read_wildcards +[[ "$par_no_match_adapter_wildcards" == "true" ]] && unset par_no_match_adapter_wildcards +[[ "$par_revcomp" == "false" ]] && unset par_revcomp + +input_args=$(echo \ + ${par_error_rate:+--error-rate "${par_error_rate}"} \ + ${par_no_indels:+--no-indels} \ + ${par_times:+--times "${par_times}"} \ + ${par_overlap:+--overlap "${par_overlap}"} \ + ${par_match_read_wildcards:+--match-read-wildcards} \ + ${par_no_match_adapter_wildcards:+--no-match-adapter-wildcards} \ + ${par_action:+--action "${par_action}"} \ + ${par_revcomp:+--revcomp} \ +) +debug "Arguments to cutadapt:" +debug $input_args +debug + +# Read modifications +########################################################### +echo ">> Parsing read modification arguments" +[[ "$par_poly_a" == "false" ]] && unset par_poly_a +[[ "$par_trim_n" == "false" ]] && unset par_trim_n +[[ "$par_zero_cap" == "false" ]] && unset par_zero_cap + +mod_args=$(echo \ + ${par_cut:+--cut "${par_cut}"} \ + ${par_cut_r2:+--cut_r2 "${par_cut_r2}"} \ + ${par_nextseq_trim:+--nextseq-trim "${par_nextseq_trim}"} \ + ${par_quality_cutoff:+--quality-cutoff "${par_quality_cutoff}"} \ + ${par_quality_cutoff_r2:+--quality-cutoff_r2 "${par_quality_cutoff_r2}"} \ + ${par_quality_base:+--quality-base "${par_quality_base}"} \ + ${par_poly_a:+--poly-a} \ + ${par_length:+--length "${par_length}"} \ + ${par_trim_n:+--trim-n} \ + ${par_length_tag:+--length-tag "${par_length_tag}"} \ + ${par_strip_suffix:+--strip-suffix "${par_strip_suffix}"} \ + ${par_prefix:+--prefix "${par_prefix}"} \ + ${par_suffix:+--suffix "${par_suffix}"} \ + ${par_rename:+--rename "${par_rename}"} \ + ${par_zero_cap:+--zero-cap} \ +) +debug "Arguments to cutadapt:" +debug $mod_args +debug + +# Filtering of processed reads arguments +########################################################### +echo ">> Filtering of processed reads arguments" +[[ "$par_discard_trimmed" == "false" ]] && unset par_discard_trimmed +[[ "$par_discard_untrimmed" == "false" ]] && unset par_discard_untrimmed +[[ "$par_discard_casava" == "false" ]] && unset par_discard_casava + +# Parse and transform the minimum and maximum length arguments +[[ -z $par_minimum_length ]] + +filter_args=$(echo \ + ${par_minimum_length:+--minimum-length "${par_minimum_length}"} \ + ${par_maximum_length:+--maximum-length "${par_maximum_length}"} \ + ${par_max_n:+--max-n "${par_max_n}"} \ + ${par_max_expected_errors:+--max-expected-errors "${par_max_expected_errors}"} \ + ${par_max_average_error_rate:+--max-average-error-rate "${par_max_average_error_rate}"} \ + ${par_discard_trimmed:+--discard-trimmed} \ + ${par_discard_untrimmed:+--discard-untrimmed} \ + ${par_discard_casava:+--discard-casava} \ +) +debug "Arguments to cutadapt:" +debug $filter_args +debug + +# Optional output arguments +########################################################### +echo ">> Optional arguments" +[[ "$par_json" == "false" ]] && unset par_json +[[ "$par_fasta" == "false" ]] && unset par_fasta +[[ "$par_info_file" == "false" ]] && unset par_info_file + +optional_output_args=$(echo \ + ${par_report:+--report "${par_report}"} \ + ${par_json:+--json "report.json"} \ + ${par_fasta:+--fasta} \ + ${par_info_file:+--info-file "info.txt"} \ +) + +debug "Arguments to cutadapt:" +debug $optional_output_args +debug + +# Output arguments +# We write the output to a directory rather than +# individual files. +########################################################### + +if [[ -z $par_fasta ]]; then + ext="fastq" +else + ext="fasta" +fi + +if [ $mode = "se" ]; then + output_args=$(echo \ + --output "$output_dir/{name}_001.$ext" \ + ) +else + output_args=$(echo \ + --output "$output_dir/{name}_R1_001.$ext" \ + --paired-output "$output_dir/{name}_R2_001.$ext" \ + ) +fi + +debug "Arguments to cutadapt:" +debug $output_args +debug + +# Full CLI +# Set the --cores argument to 0 unless meta_cpus is set +########################################################### +echo ">> Running cutadapt" +par_cpus=0 +[[ ! -z $meta_cpus ]] && par_cpus=$meta_cpus + +cli=$(echo \ + $input \ + $adapter_args \ + $paired_args \ + $input_args \ + $mod_args \ + $filter_args \ + $optional_output_args \ + $output_args \ + --cores $par_cpus +) + +debug ">> Full CLI to be run:" +debug cutadapt $cli | sed -e 's/--/\r\n --/g' +debug + +cutadapt $cli diff --git a/src/cutadapt/test.sh b/src/cutadapt/test.sh new file mode 100644 index 00000000..eff997d7 --- /dev/null +++ b/src/cutadapt/test.sh @@ -0,0 +1,256 @@ +#!/bin/bash + +set -e + +############################################# +# helper functions +assert_file_exists() { + [ -f "$1" ] || (echo "File '$1' does not exist" && exit 1) +} +assert_file_doesnt_exist() { + [ ! -f "$1" ] || (echo "File '$1' exists but shouldn't" && exit 1) +} +assert_file_empty() { + [ ! -s "$1" ] || (echo "File '$1' is not empty but should be" && exit 1) +} +assert_file_not_empty() { + [ -s "$1" ] || (echo "File '$1' is empty but shouldn't be" && exit 1) +} +assert_file_contains() { + grep -q "$2" "$1" || (echo "File '$1' does not contain '$2'" && exit 1) +} +assert_file_not_contains() { + grep -q "$2" "$1" && (echo "File '$1' contains '$2' but shouldn't" && exit 1) +} + +############################################# +mkdir test_multiple_output +cd test_multiple_output + +echo "#############################################" +echo "> Run cutadapt with multiple outputs" + +cat > example.fa <<'EOF' +>read1 +MYSEQUENCEADAPTER +>read2 +MYSEQUENCEADAP +>read3 +MYSEQUENCEADAPTERSOMETHINGELSE +>read4 +MYSEQUENCEADABTER +>read5 +MYSEQUENCEADAPTR +>read6 +MYSEQUENCEADAPPTER +>read7 +ADAPTERMYSEQUENCE +>read8 +PTERMYSEQUENCE +>read9 +SOMETHINGADAPTERMYSEQUENCE +EOF + +"$meta_executable" \ + --report minimal \ + --output "out_test/*.fasta" \ + --adapter ADAPTER \ + --input example.fa \ + --fasta \ + --no_match_adapter_wildcards \ + --json + +echo ">> Checking output" +assert_file_exists "report.json" +assert_file_exists "out_test/1_001.fasta" +assert_file_exists "out_test/unknown_001.fasta" + +cd .. +echo + +############################################# +mkdir test_simple_single_end +cd test_simple_single_end + +echo "#############################################" +echo "> Run cutadapt on single-end data" + +cat > example.fa <<'EOF' +>read1 +MYSEQUENCEADAPTER +>read2 +MYSEQUENCEADAP +>read3 +MYSEQUENCEADAPTERSOMETHINGELSE +>read4 +MYSEQUENCEADABTER +>read5 +MYSEQUENCEADAPTR +>read6 +MYSEQUENCEADAPPTER +>read7 +ADAPTERMYSEQUENCE +>read8 +PTERMYSEQUENCE +>read9 +SOMETHINGADAPTERMYSEQUENCE +EOF + +"$meta_executable" \ + --report minimal \ + --output "out_test1/*.fasta" \ + --adapter ADAPTER \ + --input example.fa \ + --fasta \ + --no_match_adapter_wildcards \ + --json + +echo ">> Checking output" +assert_file_exists "report.json" +assert_file_exists "out_test1/1_001.fasta" +assert_file_exists "out_test1/unknown_001.fasta" + +echo ">> Check if output is empty" +assert_file_not_empty "report.json" +assert_file_not_empty "out_test1/1_001.fasta" +assert_file_not_empty "out_test1/unknown_001.fasta" + +echo ">> Check contents" +for i in 1 2 3 7 9; do + assert_file_contains "out_test1/1_001.fasta" ">read$i" +done +for i in 4 5 6 8; do + assert_file_contains "out_test1/unknown_001.fasta" ">read$i" +done + +cd .. +echo + +############################################# +mkdir test_multiple_single_end +cd test_multiple_single_end + +echo "#############################################" +echo "> Run with a combination of inputs" + +cat > example.fa <<'EOF' +>read1 +ACGTACGTACGTAAAAA +>read2 +ACGTACGTACGTCCCCC +>read3 +ACGTACGTACGTGGGGG +>read4 +ACGTACGTACGTTTTTT +EOF + +cat > adapters1.fasta <<'EOF' +>adapter1 +CCCCC +EOF + +cat > adapters2.fasta <<'EOF' +>adapter2 +GGGGG +EOF + +"$meta_executable" \ + --report minimal \ + --output "out_test2/*.fasta" \ + --adapter AAAAA \ + --adapter_fasta adapters1.fasta \ + --adapter_fasta adapters2.fasta \ + --input example.fa \ + --fasta \ + --json + +echo ">> Checking output" +assert_file_exists "report.json" +assert_file_exists "out_test2/1_001.fasta" +assert_file_exists "out_test2/adapter1_001.fasta" +assert_file_exists "out_test2/adapter2_001.fasta" +assert_file_exists "out_test2/unknown_001.fasta" + +echo ">> Check if output is empty" +assert_file_not_empty "report.json" +assert_file_not_empty "out_test2/1_001.fasta" +assert_file_not_empty "out_test2/adapter1_001.fasta" +assert_file_not_empty "out_test2/adapter2_001.fasta" +assert_file_not_empty "out_test2/unknown_001.fasta" + +echo ">> Check contents" +assert_file_contains "out_test2/1_001.fasta" ">read1" +assert_file_contains "out_test2/adapter1_001.fasta" ">read2" +assert_file_contains "out_test2/adapter2_001.fasta" ">read3" +assert_file_contains "out_test2/unknown_001.fasta" ">read4" + +cd .. +echo + +############################################# +mkdir test_simple_paired_end +cd test_simple_paired_end + +echo "#############################################" +echo "> Run cutadapt on paired-end data" + +cat > example_R1.fastq <<'EOF' +@read1 +ACGTACGTACGTAAAAA ++ +IIIIIIIIIIIIIIIII +@read2 +ACGTACGTACGTCCCCC ++ +IIIIIIIIIIIIIIIII +EOF + +cat > example_R2.fastq <<'EOF' +@read1 +ACGTACGTACGTGGGGG ++ +IIIIIIIIIIIIIIIII +@read2 +ACGTACGTACGTTTTTT ++ +IIIIIIIIIIIIIIIII +EOF + +"$meta_executable" \ + --report minimal \ + --output "out_test3/*.fastq" \ + --adapter AAAAA \ + --adapter_r2 GGGGG \ + --input example_R1.fastq \ + --input_r2 example_R2.fastq \ + --quality_cutoff 20 \ + --json \ + ---cpus 1 + +echo ">> Checking output" +assert_file_exists "report.json" +assert_file_exists "out_test3/1_R1_001.fastq" +assert_file_exists "out_test3/1_R2_001.fastq" +assert_file_exists "out_test3/unknown_R1_001.fastq" +assert_file_exists "out_test3/unknown_R2_001.fastq" + +echo ">> Check if output is empty" +assert_file_not_empty "report.json" +assert_file_not_empty "out_test3/1_R1_001.fastq" +assert_file_not_empty "out_test3/1_R2_001.fastq" +assert_file_not_empty "out_test3/unknown_R1_001.fastq" + +echo ">> Check contents" +assert_file_contains "out_test3/1_R1_001.fastq" "@read1" +assert_file_contains "out_test3/1_R2_001.fastq" "@read1" +assert_file_contains "out_test3/unknown_R1_001.fastq" "@read2" +assert_file_contains "out_test3/unknown_R2_001.fastq" "@read2" + +cd .. +echo + +############################################# + +echo "#############################################" +echo "> Test successful" +