diff --git a/src/cutadapt/config.vsh.yaml b/src/cutadapt/config.vsh.yaml index 88636952..ebd56e3b 100644 --- a/src/cutadapt/config.vsh.yaml +++ b/src/cutadapt/config.vsh.yaml @@ -437,14 +437,17 @@ functionality: # - name: --untrimmed_paired_output # - name: too_short_paired_output # - name: too_long_paired_output + - name: Debug + arguments: + - type: boolean_true + name: --debug + description: Print debug information resources: - type: bash_script path: script.sh test_resources: - type: bash_script path: test.sh - - type: file - path: test_data platforms: - type: docker image: python:3.12 diff --git a/src/cutadapt/script.sh b/src/cutadapt/script.sh index 2ae29a3c..1edfb090 100644 --- a/src/cutadapt/script.sh +++ b/src/cutadapt/script.sh @@ -1,16 +1,30 @@ #!/bin/bash +## VIASH START +par_adapter='AGATCGGAAGAGCACACGTCTGAACTCCAGTCAC;GGATCGGAAGAGCACACGTCTGAACTCCAGTCAC' +par_input='src/cutadapt/test_data/se/a.fastq' +par_report='full' +par_json='false' +par_output='output' +par_fasta='false' +par_info_file='false' +par_debug='true' +## VIASH END + +# TODO: change this? if [ -z $par_output ]; then par_output=. else mkdir -p "$par_output" fi +function debug { + [[ "$par_debug" == "true" ]] && echo "DEBUG: $@" +} # Init ########################################################### -echo "Running cutadapt" -echo + echo ">> Paired-end data or not?" mode="" @@ -29,85 +43,45 @@ fi # - string and fasta ########################################################### -multi_adapter="" -for adapter in `echo $par_adapter | tr ':' ' '`; do - multi_adapter="$multi_adapter --adapter $adapter" -done - -multi_adapter_fasta="" -for adapter_fasta in `echo $par_adapter_fasta | tr ':' ' '`; do - multi_adapter_fasta="$multi_adapter_fasta --adapter file:$adapter_fasta" -done - -multi_adapter_r2="" -for adapter_r2 in `echo $par_adapter_r2 | tr ':' ' '`; do - multi_adapter_r2="$multi_adapter_r2 --adapter_r2 $adapter_r2" -done - -multi_adapter_fasta_r2="" -for adapter_fasta_r2 in `echo $par_adapter_fasta_r2 | tr ':' ' '`; do - multi_adapter_fasta_r2="$multi_adapter_fasta_r2 --adapter file:$adapter_fasta_r2" -done - -multi_front="" -for front in `echo $par_front | tr ':' ' '`; do - multi_front="$multi_front --front $front" -done - -multi_front_fasta="" -for front_fasta in `echo $par_front_fasta | tr ':' ' '`; do - multi_front_fasta="$multi_front_fasta --front file:$front_fasta" -done - -multi_front_r2="" -for front_r2 in `echo $par_front_r2 | tr ':' ' '`; do - multi_front_r2="$multi_front_r2 --front_r2 $front_r2" -done - -multi_front_fasta_r2="" -for front_fasta_r2 in `echo $par_front_fasta_r2 | tr ':' ' '`; do - multi_front_fasta_r2="$multi_front_fasta_r2 --front file:$front_fasta_r2" -done - -multi_anywhere="" -for anywhere in `echo $par_anywhere | tr ':' ' '`; do - multi_anywhere="$multi_anywhere --anywhere $anywhere" -done - -multi_anywhere_fasta="" -for anywhere_fasta in `echo $par_anywhere_fasta | tr ':' ' '`; do - multi_anywhere_fasta="$multi_anywhere_fasta --anywhere file:$anywhere_fasta" -done - -multi_anywhere_r2="" -for anywhere_r2 in `echo $par_anywhere_r2 | tr ':' ' '`; do - multi_anywhere_r2="$multi_anywhere_r2 --anywhere_r2 $anywhere_r2" -done - -multi_anywhere_fasta_r2="" -for anywhere_fasta_r2 in `echo $par_anywhere_fasta_r2 | tr ':' ' '`; do - multi_anywhere_fasta_r2="$multi_anywhere_fasta_r2 --anywhere file:$anywhere_fasta_r2" -done - -echo ">> Parsing arguments dealing with adapters" +function add_flags { + local arg=$1 + local flag=$2 + local prefix=$3 + [[ -z $prefix ]] && prefix="" + + # This function should not be called if the input is empty + # but check for it just in case + if [[ -z $arg ]]; then + return + fi + + local output="" + IFS=';' read -r -a array <<< "$arg" + for a in "${array[@]}"; do + output="$output $flag $prefix$a" + done + echo $output +} + +debug ">> Parsing arguments dealing with adapters" adapter_args=$(echo \ - ${par_adapter:+${multi_adapter}} \ - ${par_adapter_fasta:+${multi_adapter_fasta}} \ - ${par_front:+${multi_front}} \ - ${par_front_fasta:+${multi_front_fasta}} \ - ${par_anywhere:+${multi_anywhere}} \ - ${par_anywhere_fasta:+${multi_anywhere_fasta}} \ - - ${par_adapter_r2:+${multi_adapter_r2}} \ - ${par_adapter_fasta_r2:+${multi_adapter_fasta_r2}} \ - ${par_front_r2:+${multi_front_r2}} \ - ${par_front_fasta_r2:+${multi_front_fasta_r2}} \ - ${par_anywhere_r2:+${multi_anywhere_r2}} \ - ${par_anywhere_fasta_r2:+${multi_anywhere_fasta_r2}} \ + ${par_adapter:+$(add_flags "$par_adapter" "--adapter")} \ + ${par_adapter_fasta:+$(add_flags "$par_adapter_fasta" "--adapter" "file:")} \ + ${par_front:+$(add_flags "$par_front" "--front")} \ + ${par_front_fasta:+$(add_flags "$par_front_fasta" "--front" "file:")} \ + ${par_anywhere:+$(add_flags "$par_anywhere" "--anywhere")} \ + ${par_anywhere_fasta:+$(add_flags "$par_anywhere_fasta" "--anywhere" "file:")} \ + ${par_adapter_r2:+$(add_flags "$par_adapter_r2" "-A")} \ + ${par_adapter_fasta_r2:+$(add_flags "$par_adapter_fasta_r2" "-A" "file:")} \ + ${par_front_r2:+$(add_flags "$par_front_r2" "-G")} \ + ${par_front_fasta_r2:+$(add_flags "$par_front_fasta_r2" "-G" "file:")} \ + ${par_anywhere_r2:+$(add_flags "$par_anywhere_r2" "-B")} \ + ${par_anywhere_fasta_r2:+$(add_flags "$par_anywhere_fasta_r2" "-B" "file:")} \ ) -echo "Arguments to cutadapt:" -echo "$adapter_args" -echo + +debug "Arguments to cutadapt:" +debug "$adapter_args" +debug # Paired-end options ########################################################### @@ -120,9 +94,9 @@ paired_args=$(echo \ ${par_pair_filter:+--pair-filter "${par_pair_filter}"} \ ${par_interleaved:+--interleaved} ) -echo "Arguments to cutadapt:" -echo $paired_args -echo +debug "Arguments to cutadapt:" +debug $paired_args +debug # Input arguments ########################################################### @@ -142,9 +116,9 @@ input_args=$(echo \ ${par_action:+--action "${par_action}"} \ ${par_revcomp:+--revcomp} \ ) -echo "Arguments to cutadapt:" -echo $input_args -echo +debug "Arguments to cutadapt:" +debug $input_args +debug # Read modifications ########################################################### @@ -170,9 +144,9 @@ mod_args=$(echo \ ${par_rename:+--rename "${par_rename}"} \ ${par_zero_cap:+--zero-cap} \ ) -echo "Arguments to cutadapt:" -echo $mod_args -echo +debug "Arguments to cutadapt:" +debug $mod_args +debug # Filtering of processed reads arguments ########################################################### @@ -194,46 +168,58 @@ filter_args=$(echo \ ${par_discard_untrimmed:+--discard-untrimmed} \ ${par_discard_casava:+--discard-casava} \ ) -echo "Arguments to cutadapt:" -echo $filter_args -echo +debug "Arguments to cutadapt:" +debug $filter_args +debug -# Output arguments -# We write the output to a directory rather than -# individual files. +# Optional output arguments ########################################################### -echo ">> Output arguments" +echo ">> Optional arguments" [[ "$par_json" == "false" ]] && unset par_json [[ "$par_fasta" == "false" ]] && unset par_fasta [[ "$par_info_file" == "false" ]] && unset par_info_file +optional_output_args=$(echo \ + ${par_report:+--report "${par_report}"} \ + ${par_json:+--json "${par_output}/report.json"} \ + ${par_fasta:+--fasta} \ + ${par_info_file:+--info-file "$par_output/info.txt"} \ +) + +debug "Arguments to cutadapt:" +debug $optional_output_args +debug + +# Output arguments +# We write the output to a directory rather than +# individual files. +########################################################### + +if [[ -z $par_fasta ]]; then + ext="fastq" +else + ext="fasta" +fi + if [ $mode = "se" ]; then output_args=$(echo \ - ${par_report:+--report "${par_report}"} \ - ${par_json:+--json "${par_output}/report.json"} \ - --output "$par_output/{name}_R1_001.fastq" \ - ${par_fasta:+--fasta} \ - ${par_info_file:+--info-file} \ + --output "$par_output/{name}_001.$ext" \ ) else output_args=$(echo \ - ${par_report:+--report "${par_report}"} \ - ${par_json:+--json "${par_output}/report.json"} \ - --output "$par_output/{name}_R1_001.fastq" \ - --paired-output "$par_output/{name}_R2_001.fastq" \ - ${par_fasta:+--fasta} \ - ${par_info_file:+--info-file} \ + --output "$par_output/{name}_R1_001.$ext" \ + --paired-output "$par_output/{name}_R2_001.$ext" \ ) fi -echo "Arguments to cutadapt:" -echo $output_args -echo + +debug "Arguments to cutadapt:" +debug $output_args +debug # Full CLI # Set the --cores argument to 0 unless meta_cpus is set ########################################################### -echo ">> Full CLI to be run:" - +echo ">> Running cutadapt" par_cpus=0 [[ ! -z $meta_cpus ]] && par_cpus=$meta_cpus @@ -244,10 +230,13 @@ cli=$(echo \ $input_args \ $mod_args \ $filter_args \ + $optional_output_args \ $output_args \ --cores $par_cpus ) -echo cutadapt $cli | sed -e 's/--/\r\n --/g' +debug ">> Full CLI to be run:" +debug cutadapt $cli | sed -e 's/--/\r\n --/g' +debug cutadapt $cli | tee $par_output/report.txt diff --git a/src/cutadapt/test.sh b/src/cutadapt/test.sh index 14e2e6fe..d36e6798 100644 --- a/src/cutadapt/test.sh +++ b/src/cutadapt/test.sh @@ -2,88 +2,216 @@ set -e -dir_in="$meta_resources_dir/test_data" - +############################################# +# helper functions +assert_file_exists() { + [ -f "$1" ] || (echo "File '$1' does not exist" && exit 1) +} +assert_file_doesnt_exist() { + [ ! -f "$1" ] || (echo "File '$1' exists but shouldn't" && exit 1) +} +assert_file_empty() { + [ ! -s "$1" ] || (echo "File '$1' is not empty but should be" && exit 1) +} +assert_file_not_empty() { + [ -s "$1" ] || (echo "File '$1' is empty but shouldn't be" && exit 1) +} +assert_file_contains() { + grep -q "$2" "$1" || (echo "File '$1' does not contain '$2'" && exit 1) +} +assert_file_not_contains() { + grep -q "$2" "$1" && (echo "File '$1' contains '$2' but shouldn't" && exit 1) +} + +############################################# +mkdir test_simple_single_end +cd test_simple_single_end + +echo "#############################################" echo "> Run cutadapt on single-end data" + +cat > example.fa <<'EOF' +>read1 +MYSEQUENCEADAPTER +>read2 +MYSEQUENCEADAP +>read3 +MYSEQUENCEADAPTERSOMETHINGELSE +>read4 +MYSEQUENCEADABTER +>read5 +MYSEQUENCEADAPTR +>read6 +MYSEQUENCEADAPPTER +>read7 +ADAPTERMYSEQUENCE +>read8 +PTERMYSEQUENCE +>read9 +SOMETHINGADAPTERMYSEQUENCE +EOF + "$meta_executable" \ --report minimal \ - --output output-dir \ - --adapter AGATCGGAAGAGCACACGTCTGAACTCCAGTCAC \ - --input $dir_in/se/a.fastq \ - --quality_cutoff 20 \ + --output out_test1 \ + --adapter ADAPTER \ + --input example.fa \ + --fasta \ + --no_match_adapter_wildcards \ --json echo ">> Checking output" -[ ! -f "output-dir/report.txt" ] && echo "report.txt does not exist" && exit 1 -[ ! -f "output-dir/report.json" ] && echo "report.json does not exist" && exit 1 -[ ! -f "output-dir/1_R1_001.fastq" ] && echo "1_R1_001.fastq does not exist" && exit 1 -[ ! -f "output-dir/unknown_R1_001.fastq" ] && echo "1_R1_001.fastq does not exist" && exit 1 +assert_file_exists "out_test1/report.txt" +assert_file_exists "out_test1/report.json" +assert_file_exists "out_test1/1_001.fasta" +assert_file_exists "out_test1/unknown_001.fasta" echo ">> Check if output is empty" -[ -s "output-dir/1_R1_001.fastq" ] && echo "1_R1_001.fastq should be empty" && exit 1 -[ ! -s "output-dir/unknown_R1_001.fastq" ] && echo "unkown_R1_001.fastq is empty" && exit 1 - -rm -r output-dir - +assert_file_not_empty "out_test1/report.txt" +assert_file_not_empty "out_test1/report.json" +assert_file_not_empty "out_test1/1_001.fasta" +assert_file_not_empty "out_test1/unknown_001.fasta" + +echo ">> Check contents" +for i in 1 2 3 7 9; do + assert_file_contains "out_test1/1_001.fasta" ">read$i" +done +for i in 4 5 6 8; do + assert_file_contains "out_test1/unknown_001.fasta" ">read$i" +done + +cd .. +echo + +############################################# +mkdir test_multiple_single_end +cd test_multiple_single_end + +echo "#############################################" echo "> Run with a combination of inputs" -echo ">adapter1" > adapters1.fasta -echo "AGATCGGAAGAGCACACGTCTGAACTCCAGTCAC" >> adapters1.fasta - -echo ">adapter1" > adapters2.fasta -echo "TGATCGGAAGAGCACACGTCTGAACTCCAGTCAC" >> adapters2.fasta +cat > example.fa <<'EOF' +>read1 +ACGTACGTACGTAAAAA +>read2 +ACGTACGTACGTCCCCC +>read3 +ACGTACGTACGTGGGGG +>read4 +ACGTACGTACGTTTTTT +EOF + +cat > adapters1.fasta <<'EOF' +>adapter1 +CCCCC +EOF + +cat > adapters2.fasta <<'EOF' +>adapter2 +GGGGG +EOF "$meta_executable" \ --report minimal \ - --output output-dir \ - --adapter AGATCGGAAGAGCACACGTCTGAACTCCAGTCAC \ - --adapter GGATCGGAAGAGCACACGTCTGAACTCCAGTCAC \ + --output out_test2 \ + --adapter AAAAA \ --adapter_fasta adapters1.fasta \ --adapter_fasta adapters2.fasta \ - --input $dir_in/se/a.fastq \ - --quality_cutoff 20 \ + --input example.fa \ + --fasta \ --json echo ">> Checking output" -[ ! -f "output-dir/report.txt" ] && echo "report.txt does not exist" && exit 1 -[ ! -f "output-dir/report.json" ] && echo "report.json does not exist" && exit 1 -[ ! -f "output-dir/1_R1_001.fastq" ] && echo "1_R1_001.fastq does not exist" && exit 1 -[ ! -f "output-dir/unknown_R1_001.fastq" ] && echo "1_R1_001.fastq does not exist" && exit 1 +assert_file_exists "out_test2/report.txt" +assert_file_exists "out_test2/report.json" +assert_file_exists "out_test2/1_001.fasta" +assert_file_exists "out_test2/adapter1_001.fasta" +assert_file_exists "out_test2/adapter2_001.fasta" +assert_file_exists "out_test2/unknown_001.fasta" echo ">> Check if output is empty" -[ -s "output-dir/1_R1_001.fastq" ] && echo "1_R1_001.fastq should be empty" && exit 1 -[ ! -s "output-dir/unknown_R1_001.fastq" ] && echo "unkown_R1_001.fastq is empty" && exit 1 +assert_file_not_empty "out_test2/report.txt" +assert_file_not_empty "out_test2/report.json" +assert_file_not_empty "out_test2/1_001.fasta" +assert_file_not_empty "out_test2/adapter1_001.fasta" +assert_file_not_empty "out_test2/adapter2_001.fasta" +assert_file_not_empty "out_test2/unknown_001.fasta" + +echo ">> Check contents" +assert_file_contains "out_test2/1_001.fasta" ">read1" +assert_file_contains "out_test2/adapter1_001.fasta" ">read2" +assert_file_contains "out_test2/adapter2_001.fasta" ">read3" +assert_file_contains "out_test2/unknown_001.fasta" ">read4" + +cd .. +echo + +############################################# +mkdir test_simple_paired_end +cd test_simple_paired_end + +echo "#############################################" +echo "> Run cutadapt on paired-end data" -rm -r output-dir -rm adapters?.fasta +cat > example_R1.fastq <<'EOF' +@read1 +ACGTACGTACGTAAAAA ++ +IIIIIIIIIIIIIIIII +@read2 +ACGTACGTACGTCCCCC ++ +IIIIIIIIIIIIIIIII +EOF + +cat > example_R2.fastq <<'EOF' +@read1 +ACGTACGTACGTGGGGG ++ +IIIIIIIIIIIIIIIII +@read2 +ACGTACGTACGTTTTTT ++ +IIIIIIIIIIIIIIIII +EOF -echo "> Run cutadapt on paired-end data" "$meta_executable" \ --report minimal \ - --output output-dir \ - --adapter AGATCGGAAGAGCACACGTCTGAACTCCAGTCAC \ - --adapter AGATCGGAAGAGCACACGTCTGAACTCCAGTCAB \ - --input $dir_in/pe/a.1.fastq \ - --input_r2 $dir_in/pe/a.2.fastq \ + --output out_test3 \ + --adapter AAAAA \ + --adapter_r2 GGGGG \ + --input example_R1.fastq \ + --input_r2 example_R2.fastq \ --quality_cutoff 20 \ --json \ ---cpus 1 echo ">> Checking output" -[ ! -f "output-dir/report.txt" ] && echo "report.txt does not exist" && exit 1 -[ ! -f "output-dir/report.json" ] && echo "report.json does not exist" && exit 1 -[ ! -f "output-dir/1_R1_001.fastq" ] && echo "1_R1_001.fastq does not exist" && exit 1 -[ ! -f "output-dir/1_R2_001.fastq" ] && echo "1_R1_001.fastq does not exist" && exit 1 -[ ! -f "output-dir/unknown_R1_001.fastq" ] && echo "1_R1_001.fastq does not exist" && exit 1 -[ ! -f "output-dir/unknown_R2_001.fastq" ] && echo "1_R1_001.fastq does not exist" && exit 1 +assert_file_exists "out_test3/report.txt" +assert_file_exists "out_test3/report.json" +assert_file_exists "out_test3/1_R1_001.fastq" +assert_file_exists "out_test3/1_R2_001.fastq" +assert_file_exists "out_test3/unknown_R1_001.fastq" +assert_file_exists "out_test3/unknown_R2_001.fastq" echo ">> Check if output is empty" -[ -s "output-dir/1_R1_001.fastq" ] && echo "1_R1_001.fastq should be empty" && exit 1 -[ -s "output-dir/1_R2_001.fastq" ] && echo "1_R2_001.fastq should be empty" && exit 1 -[ ! -s "output-dir/unknown_R1_001.fastq" ] && echo "unkown_R1_001.fastq is empty" && exit 1 -[ ! -s "output-dir/unknown_R2_001.fastq" ] && echo "unkown_R2_001.fastq is empty" && exit 1 +assert_file_not_empty "out_test3/report.txt" +assert_file_not_empty "out_test3/report.json" +assert_file_not_empty "out_test3/1_R1_001.fastq" +assert_file_not_empty "out_test3/1_R2_001.fastq" +assert_file_not_empty "out_test3/unknown_R1_001.fastq" + +echo ">> Check contents" +assert_file_contains "out_test3/1_R1_001.fastq" "@read1" +assert_file_contains "out_test3/1_R2_001.fastq" "@read1" +assert_file_contains "out_test3/unknown_R1_001.fastq" "@read2" +assert_file_contains "out_test3/unknown_R2_001.fastq" "@read2" + +cd .. +echo -rm -r output-dir +############################################# +echo "#############################################" echo "> Test successful" diff --git a/src/cutadapt/test_data/pe/a.1.fastq b/src/cutadapt/test_data/pe/a.1.fastq deleted file mode 100644 index 42735560..00000000 --- a/src/cutadapt/test_data/pe/a.1.fastq +++ /dev/null @@ -1,4 +0,0 @@ -@1 -ACGGCAT -+ -!!!!!!! diff --git a/src/cutadapt/test_data/pe/a.2.fastq b/src/cutadapt/test_data/pe/a.2.fastq deleted file mode 100644 index 42735560..00000000 --- a/src/cutadapt/test_data/pe/a.2.fastq +++ /dev/null @@ -1,4 +0,0 @@ -@1 -ACGGCAT -+ -!!!!!!! diff --git a/src/cutadapt/test_data/script.sh b/src/cutadapt/test_data/script.sh deleted file mode 100755 index 3251b59c..00000000 --- a/src/cutadapt/test_data/script.sh +++ /dev/null @@ -1,16 +0,0 @@ -# cutadapt test data - -# Test data was obtained from https://github.com/snakemake/snakemake-wrappers/tree/master/bio/cutadapt/test - -if [ ! -d /tmp/snakemake-wrappers ]; then - git clone --depth 1 --single-branch --branch master https://github.com/snakemake/snakemake-wrappers /tmp/snakemake-wrappers -fi - -mkdir -p src/cutadapt/test_data/pe -mkdir src/cutadapt/test_data/se - -cp -r /tmp/snakemake-wrappers/bio/cutadapt/se/test/reads/* src/cutadapt/test_data/se -cp -r /tmp/snakemake-wrappers/bio/cutadapt/pe/test/reads/* src/cutadapt/test_data/pe - -rm -rf /tmp/snakemake-wrappers - diff --git a/src/cutadapt/test_data/se/a.fastq b/src/cutadapt/test_data/se/a.fastq deleted file mode 100644 index 42735560..00000000 --- a/src/cutadapt/test_data/se/a.fastq +++ /dev/null @@ -1,4 +0,0 @@ -@1 -ACGGCAT -+ -!!!!!!!