viash-hub · rcannood · Feb 2, 2024 · Jan 19, 2024 · Jan 22, 2024 · Jan 24, 2024
diff --git a/CHANGELOG.md b/CHANGELOG.md
@@ -8,6 +8,8 @@
 
 * `fastp`: An ultra-fast all-in-one FASTQ preprocessor (PR #3).
 
+* `pear`: Paired-end read merger (PR #10).
+
 ## MAJOR CHANGES
 
 ## MINOR CHANGES

diff --git a/src/pear/config.vsh.yaml b/src/pear/config.vsh.yaml
@@ -0,0 +1,145 @@
+functionality:  # Viash component wrapping the PEAR paired-end read merger
+  name: pear
+  description: "PEAR is an ultrafast, memory-efficient and highly accurate pair-end read merger"
+  info:
+    keywords: [ "pair-end", "read", "merge" ]
+    homepage: https://cme.h-its.org/exelixis/web/software/pear
+    documentation: https://cme.h-its.org/exelixis/web/software/pear/doc.html
+    reference: "doi:10.1093/bioinformatics/btt593"
+    licence: "CC-BY-NC-SA-3.0"  # SPDX id of Creative Commons Attribution-NonCommercial-ShareAlike 3.0 Unported
+  requirements:
+    cpus: 1
+    memory: 4GB
+    commands: [ pear, gzip ]  # binaries that script.sh invokes
+  argument_groups:
+    - name: Inputs
+      arguments:
+        - name: --forward_fastq
+          alternatives: -f
+          type: file
+          description: Forward paired-end FASTQ file
+          required: true
+          example: "forward.fastq"
+        - name: --reverse_fastq
+          alternatives: -r
+          type: file
+          description: Reverse paired-end FASTQ file
+          required: true
+          example: "reverse.fastq"
+    - name: Outputs
+      arguments:
+        - name: --output  # a filename prefix, not a single file; script.sh appends the suffixes below
+          alternatives: -o
+          type: string
+          description: "The prefix of the output file(s). The results are gzip-compressed, so the output files will be named <output>.assembled.fastq.gz, <output>.unassembled.forward.fastq.gz, <output>.unassembled.reverse.fastq.gz and <output>.discarded.fastq.gz."
+          required: true
+          example: "output"
+    - name: Arguments
+      arguments:
+        - name: --p_value
+          alternatives: -p
+          type: double
+          description: |
+            Specify a p-value for the statistical test. If the computed p-value of a possible assembly exceeds the specified p-value then  paired-end  read  will not be assembled. Valid options are: 0.0001, 0.001, 0.01, 0.05 and 1.0. Setting 1.0 disables the test.
+          default: 0.01  # same value as PEAR's built-in default for -p
+          required: false
+        - name: "--min_overlap"
+          alternatives: "-v"
+          type: integer
+          required: false
+          default: 10  # same value as PEAR's built-in default for -v
+          description: |
+            Specify the minimum overlap size. The minimum overlap may be set to 1 when the statistical test is used. However, further restricting the minimum overlap size to a proper value may reduce false-positive assembles.
+        - name: "--max_assembly_length"
+          alternatives: "-m"
+          type: integer
+          required: false
+          default: 0  # 0 means no upper limit, as the description states
+          description: |
+            Specify the maximum possible length of the assembled sequences. Setting this value to 0 disables the restriction and assembled sequences may be arbitrary long.
+        - name: --min_assembly_length
+          alternatives: -n
+          type: integer
+          description: |
+            Specify the minimum possible length of the assembled sequences. Setting this value to 0 disables the restriction and assembled sequences may be arbitrary short.
+          required: false
+          default: 50  # PEAR's own default for -n per `pear -h`; a default of 0 silently turned the filter off
+        - name: "--min_trim_length"
+          alternatives: "-t"
+          type: integer
+          required: false
+          default: 1  # same value as PEAR's built-in default for -t
+          description: |
+            Specify  the  minimum length of reads after trimming the low quality part (see option -q)
+        - name: "--quality_threshold"
+          alternatives: "-q"
+          type: integer
+          required: false
+          default: 0  # same value as PEAR's built-in default for -q
+          description: |
+            Specify the quality threshold for trimming the low quality part of a read. If  the  quality  scores of two consecutive bases are strictly less than the specified threshold, the rest of the read will be trimmed.
+        - name: "--max_uncalled_base"
+          alternatives: "-u"
+          type: double
+          required: false
+          default: 1.0  # 1 keeps every read regardless of uncalled bases, as the description states
+          description: |
+            Specify the maximal proportion of uncalled bases in a read. Setting this value to 0 will cause PEAR to discard all reads containing  uncalled  bases. The other extreme setting is 1 which  causes PEAR to process all reads independent on the number of uncalled bases.
+        - name: "--test_method"
+          alternatives: "-g"
+          type: integer
+          required: false
+          default: 1  # same value as PEAR's built-in default for -g
+          description: |
+            Specify the type of statistical test. Two options are available. 1: Given the minimum allowed overlap, test using the highest OES. Note that due to its discrete nature, this test usually yields a lower p-value for the assembled read than the cut- off (specified by -p). For example, setting the cut-off to 0.05 using this test, the assembled reads might have an actual p-value of 0.02.
+            2. Use the acceptance probability (m.a.p). This test methods computes the same probability as test method 1. However, it assumes that the minimal overlap is the observed overlap with the  highest OES, instead of the one specified by -v. Therefore, this is not a valid statistical test and the 'p-value' is  in fact the maximal probability for accepting the assembly. Nevertheless, we observed in practice that for the case the actual overlap sizes are relatively small, test 2 can correctly assemble more reads with only slightly higher false-positive rate.
+        - name: --emperical_freqs  # NOTE(review): misspelling of "empirical" (PEAR's own flag is --empirical-freqs); script.sh reads par_emperical_freqs, so any rename must change both files together
+          alternatives: -e
+          type: boolean_true
+          description: |
+            Disable  empirical base frequencies.
+        - name: "--score_method"
+          alternatives: "-s"
+          type: integer
+          required: false
+          default: 2  # same value as PEAR's built-in default for -s
+          description: |
+            Specify the scoring method. 1. OES with +1 for match and -1 for mismatch. 2: Assembly score (AS). Use +1 for match and -1 for mismatch multiplied by base quality scores. 3: Ignore quality scores and use +1 for a match and -1 for a mismatch.
+        - name: "--phred_base"
+          alternatives: "-b"
+          type: integer
+          required: false
+          default: 33  # same value as PEAR's built-in default for -b
+          description: |
+            Base PHRED quality score.
+        - name: "--memory"
+          alternatives: "-y"
+          type: string
+          required: false
+          example: "4G"  # no default: script.sh only passes -y when a value is given
+          description: |
+            Specify the amount of memory to be used. The number may be followed by one of the letters K, M, or G denoting Kilobytes, Megabytes and Gigabytes, respectively. Bytes are assumed in case no letter is specified.
+        - name: "--cap"
+          alternatives: "-c"
+          type: integer
+          required: false
+          default: 40  # same value as PEAR's built-in default for -c
+          description: |
+            Specify  the upper bound for the resulting quality score. If set to zero, capping is disabled.
+        - name: "--nbase"
+          alternatives: "-z"
+          type: boolean_true  # script.sh passes -z only when this flag is set
+          description: |
+            When merging a base-pair that consists of two non-equal bases out of which none is degenerate, set the merged base to N and use the highest quality score of the two bases
+  resources:
+    - path: script.sh  # the wrapper that calls pear
+      type: bash_script
+  test_resources:
+    - path: test.sh
+      type: bash_script
+    - path: test_data  # holds the gzipped read pair used by test.sh
+      type: file
+platforms:
+  - image: "quay.io/biocontainers/pear:0.9.6--h9d449c0_10"
+    type: docker
+  - type: nextflow
diff --git a/src/pear/help.txt b/src/pear/help.txt
@@ -0,0 +1,91 @@
+```bash
+pear -h
+```
+
+ ____  _____    _    ____ 
+|  _ \| ____|  / \  |  _ \
+| |_) |  _|   / _ \ | |_) |
+|  __/| |___ / ___ \|  _ <
+|_|   |_____/_/   \_\_| \_\
+PEAR v0.9.6 [January 15, 2015]  - [+bzlib +zlib]
+
+Citation - PEAR: a fast and accurate Illumina Paired-End reAd mergeR
+Zhang et al (2014) Bioinformatics 30(5): 614-620 | doi:10.1093/bioinformatics/btt593
+
+License: Creative Commons Licence
+Bug-reports and requests to: [email protected] and [email protected]
+
+
+Usage: pear <options>
+Standard (mandatory):
+  -f, --forward-fastq         <str>     Forward paired-end FASTQ file.
+  -r, --reverse-fastq         <str>     Reverse paired-end FASTQ file.
+  -o, --output                <str>     Output filename.
+Optional:
+  -p, --p-value               <float>   Specify  a p-value for the statistical test. If the computed
+                                        p-value of a possible assembly exceeds the specified p-value
+                                        then  paired-end  read  will not be assembled. Valid options
+                                        are: 0.0001, 0.001, 0.01, 0.05 and 1.0. Setting 1.0 disables
+                                        the test. (default: 0.01)
+  -v, --min-overlap           <int>     Specify the minimum overlap size. The minimum overlap may be
+                                        set to 1 when the statistical test is used. However, further
+                                        restricting  the  minimum overlap size to a proper value may
+                                        reduce false-positive assembles. (default: 10)
+  -m, --max-assembly-length   <int>     Specify   the  maximum  possible  length  of  the  assembled
+                                        sequences.  Setting this value to 0 disables the restriction
+                                        and assembled sequences may be arbitrary long. (default: 0)
+  -n, --min-assembly-length   <int>     Specify   the  minimum  possible  length  of  the  assembled
+                                        sequences.  Setting this value to 0 disables the restriction
+                                        and  assembled  sequences  may be arbitrary short. (default:
+                                        50)
+  -t, --min-trim-length       <int>     Specify  the  minimum length of reads after trimming the low
+                                        quality part (see option -q). (default: 1)
+  -q, --quality-threshold     <int>     Specify  the  quality  score  threshold for trimming the low
+                                        quality  part  of  a  read.  If  the  quality  scores of two
+                                        consecutive  bases  are  strictly  less  than  the specified
+                                        threshold,  the  rest of the read will be trimmed. (default:
+                                        0)
+  -u, --max-uncalled-base     <float>   Specify  the maximal proportion of uncalled bases in a read.
+                                        Setting this value to 0 will cause PEAR to discard all reads
+                                        containing  uncalled  bases.  The other extreme setting is 1
+                                        which  causes  PEAR  to process all reads independent on the
+                                        number of uncalled bases. (default: 1)
+  -g, --test-method           <int>     Specify  the  type  of  statistical  test.  Two  options are
+                                        available. (default: 1)
+                                        1: Given the minimum allowed overlap, test using the highest
+                                        OES. Note that due to its discrete nature, this test usually
+                                        yields  a lower p-value for the assembled read than the cut-
+                                        off  (specified  by -p). For example, setting the cut-off to
+                                        0.05  using  this  test,  the  assembled reads might have an
+                                        actual p-value of 0.02.
+
+                                        2. Use the acceptance probability (m.a.p). This test methods
+                                        computes  the same probability as test method 1. However, it
+                                        assumes  that  the  minimal  overlap is the observed overlap
+                                        with  the  highest  OES, instead of the one specified by -v.
+                                        Therefore,  this  is  not  a  valid statistical test and the
+                                        'p-value'  is  in fact the maximal probability for accepting
+                                        the assembly. Nevertheless, we observed in practice that for
+                                        the case the actual overlap sizes are relatively small, test
+                                        2  can  correctly  assemble  more  reads  with only slightly
+                                        higher false-positive rate.
+  -e, --empirical-freqs                 Disable  empirical base frequencies. (default: use empirical
+                                        base frequencies)
+  -s, --score-method          <int>     Specify the scoring method. (default: 2)
+                                        1. OES with +1 for match and -1 for mismatch.
+                                        2: Assembly score (AS). Use +1 for match and -1 for mismatch
+                                        multiplied by base quality scores.
+                                        3: Ignore quality scores and use +1 for a match and -1 for a
+                                        mismatch.
+  -b, --phred-base            <int>     Base PHRED quality score. (default: 33)
+  -y, --memory                <str>     Specify  the  amount of memory to be used. The number may be
+                                        followed  by  one  of  the  letters  K,  M,  or  G  denoting
+                                        Kilobytes,  Megabytes and Gigabytes, respectively. Bytes are
+                                        assumed in case no letter is specified.
+  -c, --cap                   <int>     Specify  the upper bound for the resulting quality score. If
+                                        set to zero, capping is disabled. (default: 40)
+  -j, --threads               <int>     Number of threads to use
+  -z, --nbase                           When  merging  a  base-pair  that  consists of two non-equal
+                                        bases  out  of which none is degenerate, set the merged base
+                                        to N and use the highest quality score of the two bases
+  -h, --help                            This help screen.
diff --git a/src/pear/script.sh b/src/pear/script.sh
@@ -0,0 +1,62 @@
+#!/bin/bash
+
+## VIASH START
+## VIASH END
+
+# Stop at the first failing command so a failed decompression or PEAR run is
+# reported instead of being masked by the gzip calls at the end.
+set -eo pipefail
+
+# Both flags are compared against the string "false"; unsetting them in that
+# case lets the ${var:+...} expansions in the pear call drop the option.
+if [[ "$par_emperical_freqs" == "false" ]]; then unset par_emperical_freqs; fi
+if [[ "$par_nbase" == "false" ]]; then unset par_nbase; fi
+
+# Scratch directory for decompressed inputs; removed again when the script exits.
+# NOTE(review): assumes Viash exports meta_temp_dir -- /tmp is used otherwise.
+scratch_dir=$(mktemp -d "${meta_temp_dir:-/tmp}/pear-XXXXXX")
+trap 'rm -rf "$scratch_dir"' EXIT
+
+# Print the path PEAR should read for input file $1; $2 is a label that keeps
+# the two mates apart inside the scratch directory. A *.gz input is decompressed
+# to a scratch copy (gunzip in place would delete the caller's file and fails on
+# read-only inputs); any other input is passed through unchanged.
+function plain_fastq {
+  local src="$1" label="$2" dest
+  if [[ "$src" == *.gz ]]; then
+    dest="$scratch_dir/${label}_$(basename "${src%.gz}")"
+    gunzip -c "$src" > "$dest"
+    echo "$dest"
+  else
+    echo "$src"
+  fi
+}
+
+par_forward_fastq=$(plain_fastq "$par_forward_fastq" forward)
+par_reverse_fastq=$(plain_fastq "$par_reverse_fastq" reverse)
+
+# Optional parameters are only put on the command line when they are set.
+pear \
+  -f "$par_forward_fastq" \
+  -r "$par_reverse_fastq" \
+  -o "$par_output" \
+  ${par_p_value:+-p "${par_p_value}"} \
+  ${par_min_overlap:+-v "${par_min_overlap}"} \
+  ${par_max_assembly_length:+-m "${par_max_assembly_length}"} \
+  ${par_min_assembly_length:+-n "${par_min_assembly_length}"} \
+  ${par_min_trim_length:+-t "${par_min_trim_length}"} \
+  ${par_quality_threshold:+-q "${par_quality_threshold}"} \
+  ${par_max_uncalled_base:+-u "${par_max_uncalled_base}"} \
+  ${par_test_method:+-g "${par_test_method}"} \
+  ${par_score_method:+-s "${par_score_method}"} \
+  ${par_phred_base:+-b "${par_phred_base}"} \
+  ${par_memory:+-y "${par_memory}"} \
+  ${par_cap:+-c "${par_cap}"} \
+  ${meta_cpus:+-j "${meta_cpus}"} \
+  ${par_emperical_freqs:+-e} \
+  ${par_nbase:+-z}
+
+# Compress the four result files; test.sh checks for the *.fastq.gz names.
+for suffix in assembled unassembled.forward unassembled.reverse discarded; do
+  gzip -f "${par_output}.${suffix}.fastq"
+done
diff --git a/src/pear/test.sh b/src/pear/test.sh
@@ -0,0 +1,22 @@
+#!/bin/bash
+
+set -e
+
+dir_in="${meta_resources_dir%/}/test_data"
+
+echo "> Run PEAR"
+"$meta_executable" \
+  --forward_fastq "$dir_in/reads.left.fq.gz" \
+  --reverse_fastq "$dir_in/reads.right.fq.gz" \
+  --output "test" \
+  --p_value 0.01 \
+  --memory "4G"
+
+echo ">> Checking output"
+echo $(ls)
+[ ! -f "test.assembled.fastq.gz" ] && echo "Output file test.assembled.fastq.gz does not exist" && exit 1
+[ ! -f "test.unassembled.forward.fastq.gz" ] && echo "Output file test.unassembled.forward.fastq.gz does not exist" && exit 1
+[ ! -f "test.unassembled.reverse.fastq.gz" ] && echo "Output file test.unassembled.reverse.fastq.gz does not exist" && exit 1
+[ ! -f "test.discarded.fastq.gz" ] && echo "Output file ftest.discarded.fastq.gz does not exist" && exit 1
+
+echo "> Test successful"
diff --git a/src/pear/test_data/reads.left.fq.gz b/src/pear/test_data/reads.left.fq.gz
diff --git a/src/pear/test_data/reads.right.fq.gz b/src/pear/test_data/reads.right.fq.gz