From 9936b9a5e9157d4f5816b05c15842f3635b3458a Mon Sep 17 00:00:00 2001
From: Robrecht Cannoodt <rcannood@gmail.com>
Date: Fri, 19 Jan 2024 16:13:57 +0100
Subject: [PATCH 1/5] add arriba component (#1)

* add arriba component

* changes based on #2

* update changelog

* quote arguments

* add command to requirements
---
 CHANGELOG.md                        |   2 +
 src/arriba/config.vsh.yaml          | 386 ++++++++++++++++++++++++++++
 src/arriba/help.txt                 | 198 ++++++++++++++
 src/arriba/script.sh                |  47 ++++
 src/arriba/test.sh                  |  45 ++++
 src/arriba/test_data/A.bam          | Bin 0 -> 296 bytes
 src/arriba/test_data/README.md      |   8 +
 src/arriba/test_data/annotation.gtf |   6 +
 src/arriba/test_data/blacklist.tsv  |   0
 src/arriba/test_data/genome.fasta   |   4 +
 10 files changed, 696 insertions(+)
 create mode 100644 src/arriba/config.vsh.yaml
 create mode 100644 src/arriba/help.txt
 create mode 100644 src/arriba/script.sh
 create mode 100644 src/arriba/test.sh
 create mode 100644 src/arriba/test_data/A.bam
 create mode 100644 src/arriba/test_data/README.md
 create mode 100644 src/arriba/test_data/annotation.gtf
 create mode 100644 src/arriba/test_data/blacklist.tsv
 create mode 100644 src/arriba/test_data/genome.fasta

diff --git a/CHANGELOG.md b/CHANGELOG.md
index d0b30ae0..d2610ff9 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -4,6 +4,8 @@
 
 ## NEW FEATURES
 
+* `arriba`: Detect gene fusions from RNA-seq data (PR #1).
+
 ## MAJOR CHANGES
 
 ## MINOR CHANGES
diff --git a/src/arriba/config.vsh.yaml b/src/arriba/config.vsh.yaml
new file mode 100644
index 00000000..601ad6c2
--- /dev/null
+++ b/src/arriba/config.vsh.yaml
@@ -0,0 +1,386 @@
+functionality:
+  name: arriba
+  description: Detect gene fusions from RNA-Seq data
+  info:
+    keywords: [Gene fusion, RNA-Seq]
+    homepage: https://arriba.readthedocs.io/en/latest/
+    documentation: https://arriba.readthedocs.io/en/latest/
+    repository: https://github.com/suhrig/arriba
+    reference: "doi:10.1101/gr.257246.119"
+    licence: MIT
+  requirements:
+    cpus: 1
+    commands: [ arriba ]
+  argument_groups:
+    - name: Inputs
+      arguments:
+      - name: --bam
+        alternatives: -x
+        type: file
+        description: |
+          File in SAM/BAM/CRAM format with main alignments as generated by STAR
+          (Aligned.out.sam). Arriba extracts candidate reads from this file.
+        required: true
+        example: Aligned.out.bam
+      - name: --genome
+        alternatives: -a
+        type: file
+        description: |
+          FastA file with genome sequence (assembly). The file may be gzip-compressed. An 
+          index with the file extension .fai must exist only if CRAM files are processed.
+        required: true
+        example: assembly.fa
+      - name: --gene_annotation
+        alternatives: -g
+        type: file
+        description: |
+          GTF file with gene annotation. The file may be gzip-compressed.
+        required: true
+        example: annotation.gtf
+      - name: --known_fusions
+        alternatives: -k
+        type: file
+        description: |
+          File containing known/recurrent fusions. Some cancer entities are often 
+          characterized by fusions between the same pair of genes. In order to boost 
+          sensitivity, a list of known fusions can be supplied using this parameter. The list 
+          must contain two columns with the names of the fused genes, separated by tabs.
+        required: false
+        example: known_fusions.tsv
+      - name: --blacklist
+        alternatives: -b
+        type: file
+        description: |
+          File containing blacklisted events (recurrent artifacts and transcripts 
+          observed in healthy tissue).
+        required: false
+        example: blacklist.tsv
+      - name: --structural_variants
+        alternatives: -d
+        type: file
+        description: |
+          Tab-separated file with coordinates of structural variants found using 
+          whole-genome sequencing data. These coordinates serve to increase sensitivity 
+          towards weakly expressed fusions and to eliminate fusions with low evidence. 
+        required: false
+        example: structural_variants_from_WGS.tsv
+      - name: --tags
+        alternatives: -t
+        type: file
+        description: |
+          Tab-separated file containing fusions to annotate with tags in the 'tags' column. 
+          The first two columns specify the genes; the third column specifies the tag. The 
+          file may be gzip-compressed. 
+        required: false
+        example: tags.tsv
+      - name: --protein_domains
+        alternatives: -p
+        type: file
+        description: |
+          File in GFF3 format containing coordinates of the protein domains of genes. The
+          protein domains retained in a fusion are listed in the column
+          'retained_protein_domains'. The file may be gzip-compressed.
+        required: false
+        example: protein_domains.gff3
+    - name: Outputs
+      arguments:
+        - name: --fusions
+          alternatives: -o
+          type: file
+          direction: output
+          description: |
+            Output file with fusions that have passed all filters.
+          required: true
+          example: fusions.tsv
+        - name: --fusions_discarded
+          alternatives: -O
+          type: file
+          direction: output
+          description: |
+            Output file with fusions that were discarded due to filtering. 
+          required: false
+          example: fusions.discarded.tsv
+    - name: Arguments
+      arguments:
+        - name: --max_genomic_breakpoint_distance
+          alternatives: -D
+          type: long
+          description: |
+            When a file with genomic breakpoints obtained via 
+            whole-genome sequencing is supplied via the --structural_variants
+            parameter, this parameter determines how far a 
+            genomic breakpoint may be away from a 
+            transcriptomic breakpoint to consider it as a 
+            related event. For events inside genes, the 
+            distance is added to the end of the gene; for 
+            intergenic events, the distance threshold is 
+            applied as is. Default: 100000.
+          required: false
+        - name: --strandedness
+          alternatives: -s
+          type: string
+          description: |
+            Whether a strand-specific protocol was used for library preparation, 
+            and if so, the type of strandedness (auto/yes/no/reverse). When 
+            unstranded data is processed, the strand can sometimes be inferred from 
+            splice-patterns. But in unclear situations, stranded data helps 
+            resolve ambiguities. Default: auto
+          choices: ["auto", "yes", "no", "reverse"]
+          required: false
+        - name: --interesting_contigs
+          alternatives: -i
+          type: string
+          description: |
+            List of interesting contigs. Fusions between genes 
+            on other contigs are ignored. Cfontigs can be specified with or without the 
+            prefix "chr". Asterisks (*) are treated as wild-cards. 
+            Default: 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 X Y AC_* NC_*
+          required: false
+          multiple: true
+          multiple_sep: ","
+          example: ["1", "2", "AC_*", "NC_*"]
+        - name: --viral_contigs
+          alternatives: -v
+          type: string
+          description: |
+            Comma-/space-separated list of viral contigs. Asterisks (*) are treated as 
+            wild-cards.
+            Default: AC_* NC_*
+          required: false
+          multiple: true
+          multiple_sep: ","
+          example: ["AC_*", "NC_*"]
+        - name: --disable_filters
+          alternatives: -f
+          type: string
+          description: |
+            Comma-/space-separated list of filters to disable. By default all filters are 
+            enabled. 
+          choices: [ homologs, low_entropy, isoforms, 
+            top_expressed_viral_contigs, viral_contigs, uninteresting_contigs, 
+            non_coding_neighbors, mismatches, duplicates, no_genomic_support, 
+            genomic_support, intronic, end_to_end, relative_support, 
+            low_coverage_viral_contigs, merge_adjacent, mismappers, multimappers, 
+            same_gene, long_gap, internal_tandem_duplication, small_insert_size, 
+            read_through, inconsistently_clipped, intragenic_exonic, 
+            marginal_read_through, spliced, hairpin, blacklist, min_support, 
+            select_best, in_vitro, short_anchor, known_fusions, no_coverage, 
+            homopolymer, many_spliced ]
+          required: false
+          multiple: true
+          multiple_sep: ","
+        - name: --max_e_value
+          alternatives: -E
+          type: double
+          description: |
+            Arriba estimates the number of fusions with a given number of supporting 
+            reads which one would expect to see by random chance. If the expected number 
+            of fusions (e-value) is higher than this threshold, the fusion is 
+            discarded by the 'relative_support' filter. Note: Increasing this 
+            threshold can dramatically increase the number of false positives and may 
+            increase the runtime of resource-intensive steps. Fractional values are 
+            possible. Default: 0.300000 
+          required: false
+        - name: --min_supporting_reads
+          alternatives: -S
+          type: integer
+          description: |
+            The 'min_support' filter discards all fusions with fewer than 
+            this many supporting reads (split reads and discordant mates 
+            combined). Default: 2 
+          required: false
+          example: 2
+        - name: --max_mismappers
+          alternatives: -m
+          type: double
+          description: |
+            When more than this fraction of supporting reads turns out to be 
+            mismappers, the 'mismappers' filter discards the fusion. Default: 
+            0.800000
+          required: false
+          example: 0.8
+        - name: --max_homolog_identity
+          alternatives: -L
+          type: double
+          description: |
+            Genes with more than the given fraction of sequence identity are 
+            considered homologs and removed by the 'homologs' filter. 
+            Default: 0.300000 
+          required: false
+          example: 0.3
+        - name: --homopolymer_length
+          alternatives: -H
+          type: integer
+          description: |
+            The 'homopolymer' filter removes breakpoints adjacent to 
+            homopolymers of the given length or more. Default: 6
+          required: false
+          example: 6
+        - name: --read_through_distance
+          alternatives: -R
+          type: integer
+          description: |
+            The 'read_through' filter removes read-through fusions 
+            where the breakpoints are less than the given distance away 
+            from each other. Default: 10000 
+          required: false
+          example: 10000
+        - name : --min_anchor_length
+          alternatives: -A
+          type: integer
+          description: |
+            Alignment artifacts are often characterized by split reads coming 
+            from only one gene and no discordant mates. Moreover, the split 
+            reads only align to a short stretch in one of the genes. The 
+            'short_anchor' filter removes these fusions. This parameter sets 
+            the threshold in bp for what the filter considers short. Default: 23 
+          required: false
+          example: 23
+        - name: --many_spliced_events
+          alternatives: -M
+          type: integer
+          description: |
+            The 'many_spliced' filter recovers fusions between genes that 
+            have at least this many spliced breakpoints. Default: 4
+          required: false
+          example: 4
+        - name: --max_kmer_content
+          alternatives: -K
+          type: double
+          description: |
+            The 'low_entropy' filter removes reads with repetitive 3-mers. If 
+            the 3-mers make up more than the given fraction of the sequence, then 
+            the read is discarded. Default: 0.600000 
+          required: false
+          example: 0.6
+        - name: --max_mismatch_pvalue
+          alternatives: -V
+          type: double
+          description: |
+            The 'mismatches' filter uses a binomial model to calculate a 
+            p-value for observing a given number of mismatches in a read. If 
+            the number of mismatches is too high, the read is discarded. 
+            Default: 0.010000 
+          required: false
+          example: 0.05
+        - name: --fragment_length
+          alternatives: -F
+          type: integer
+          description: |
+            When paired-end data is given, the fragment length is estimated 
+            automatically and this parameter has no effect. But when single-end 
+            data is given, the mean fragment length should be specified to 
+            effectively filter fusions that arise from hairpin structures. 
+            Default: 200 
+          required: false
+          example: 200
+        - name: --max_reads
+          alternatives: -U
+          type: integer
+          description: |
+            Subsample fusions with more than the given number of supporting reads. This 
+            improves performance without compromising sensitivity, as long as the 
+            threshold is high. Counting of supporting reads beyond the threshold is 
+            inaccurate, obviously. Default: 300 
+          required: false
+          example: 300
+        - name: --quantile
+          alternatives: -Q
+          type: double
+          description: |
+            Highly expressed genes are prone to produce artifacts during library 
+            preparation. Genes with an expression above the given quantile are eligible 
+            for filtering by the 'in_vitro' filter. Default: 0.998000
+          required: false
+          example: 0.998
+        - name: --exonic_fraction
+          alternatives: -e
+          type: double
+          description: |
+            The breakpoints of false-positive predictions of intragenic events 
+            are often both in exons. True predictions are more likely to have at 
+            least one breakpoint in an intron, because introns are larger. If the 
+            fraction of exonic sequence between two breakpoints is smaller than 
+            the given fraction, the 'intragenic_exonic' filter discards the 
+            event. Default: 0.330000 
+          required: false
+          example: 0.33
+        - name: --top_n
+          alternatives: -T
+          type: integer
+          description: |
+            Only report viral integration sites of the top N most highly expressed viral 
+            contigs. Default: 5
+          required: false
+          example: 5
+        - name: --covered_fraction
+          alternatives: -C
+          type: double
+          description: |
+            Ignore virally associated events if the virus is not fully 
+            expressed, i.e., less than the given fraction of the viral contig is 
+            transcribed. Default: 0.050000 
+          required: false
+          example: 0.05
+        - name: --max_itd_length
+          alternatives: -l
+          type: integer
+          description: |
+            Maximum length of internal tandem duplications. Note: Increasing 
+            this value beyond the default can impair performance and lead to many 
+            false positives. Default: 100 
+          required: false
+          example: 100
+        - name: --min_itd_allele_fraction
+          alternatives: -z
+          type: double
+          description: |
+            Required fraction of supporting reads to report an internal 
+            tandem duplication. Default: 0.070000 
+          required: false
+          example: 0.07
+        - name: --min_itd_supporting_reads
+          alternatives: -Z
+          type: integer
+          description: |
+            Required absolute number of supporting reads to report an 
+            internal tandem duplication. Default: 10 
+          required: false
+          example: 10
+        - name: --skip_duplicate_marking
+          alternatives: -u
+          type: boolean_true
+          description: |
+            Instead of performing duplicate marking itself, Arriba relies on duplicate marking by a 
+            preceding program using the BAM_FDUP flag. This makes sense when unique molecular 
+            identifiers (UMI) are used.
+        - name: --extra_information
+          alternatives: -X
+          type: boolean_true
+          description: |
+            To reduce the runtime and file size, by default, the columns 'fusion_transcript', 
+            'peptide_sequence', and 'read_identifiers' are left empty in the file containing 
+            discarded fusion candidates (see parameter -O). When this flag is set, this extra 
+            information is reported in the discarded fusions file.
+        - name: --fill_gaps
+          alternatives: -I
+          type: boolean_true
+          description: |
+            If assembly of the fusion transcript sequence from the supporting reads is incomplete 
+            (denoted as '...'), fill the gaps using the assembly sequence wherever possible. 
+  resources:
+    - type: bash_script
+      path: script.sh
+  test_resources:
+    - type: bash_script
+      path: test.sh
+    - type: file
+      path: test_data
+platforms:
+  - type: docker
+    image: quay.io/biocontainers/arriba:2.4.0--h0033a41_2
+    setup:
+      - type: docker
+        run: |
+          arriba -h | grep 'Version:' 2>&1 |  sed 's/Version:\s\(.*\)/arriba: "\1"/' > /var/software_versions.txt
+  - type: nextflow
diff --git a/src/arriba/help.txt b/src/arriba/help.txt
new file mode 100644
index 00000000..7d8fc76d
--- /dev/null
+++ b/src/arriba/help.txt
@@ -0,0 +1,198 @@
+```bash
+arriba -h
+```
+
+Arriba gene fusion detector
+---------------------------
+Version: 2.4.0
+
+Arriba is a fast tool to search for aberrant transcripts such as gene fusions. 
+It is based on chimeric alignments found by the STAR RNA-Seq aligner.
+
+Usage: arriba [-c Chimeric.out.sam] -x Aligned.out.bam \
+              -g annotation.gtf -a assembly.fa [-b blacklists.tsv] [-k known_fusions.tsv] \
+              [-t tags.tsv] [-p protein_domains.gff3] [-d structural_variants_from_WGS.tsv] \
+              -o fusions.tsv [-O fusions.discarded.tsv] \
+              [OPTIONS]
+
+ -c FILE  File in SAM/BAM/CRAM format with chimeric alignments as generated by STAR 
+          (Chimeric.out.sam). This parameter is only required, if STAR was run with the 
+          parameter '--chimOutType SeparateSAMold'. When STAR was run with the parameter 
+          '--chimOutType WithinBAM', it suffices to pass the parameter -x to Arriba and -c 
+          can be omitted. 
+
+ -x FILE  File in SAM/BAM/CRAM format with main alignments as generated by STAR 
+          (Aligned.out.sam). Arriba extracts candidate reads from this file. 
+
+ -g FILE  GTF file with gene annotation. The file may be gzip-compressed. 
+
+ -G GTF_FEATURES  Comma-/space-separated list of names of GTF features. 
+                  Default: gene_name=gene_name|gene_id gene_id=gene_id 
+                  transcript_id=transcript_id feature_exon=exon feature_CDS=CDS 
+
+ -a FILE  FastA file with genome sequence (assembly). The file may be gzip-compressed. An 
+          index with the file extension .fai must exist only if CRAM files are processed. 
+
+ -b FILE  File containing blacklisted events (recurrent artifacts and transcripts 
+          observed in healthy tissue). 
+
+ -k FILE  File containing known/recurrent fusions. Some cancer entities are often 
+          characterized by fusions between the same pair of genes. In order to boost 
+          sensitivity, a list of known fusions can be supplied using this parameter. The list 
+          must contain two columns with the names of the fused genes, separated by tabs. 
+
+ -o FILE  Output file with fusions that have passed all filters. 
+
+ -O FILE  Output file with fusions that were discarded due to filtering. 
+
+ -t FILE  Tab-separated file containing fusions to annotate with tags in the 'tags' column. 
+          The first two columns specify the genes; the third column specifies the tag. The 
+          file may be gzip-compressed. 
+
+ -p FILE  File in GFF3 format containing coordinates of the protein domains of genes. The 
+          protein domains retained in a fusion are listed in the column 
+          'retained_protein_domains'. The file may be gzip-compressed. 
+
+ -d FILE  Tab-separated file with coordinates of structural variants found using 
+          whole-genome sequencing data. These coordinates serve to increase sensitivity 
+          towards weakly expressed fusions and to eliminate fusions with low evidence. 
+
+ -D MAX_GENOMIC_BREAKPOINT_DISTANCE  When a file with genomic breakpoints obtained via 
+                                     whole-genome sequencing is supplied via the -d 
+                                     parameter, this parameter determines how far a 
+                                     genomic breakpoint may be away from a 
+                                     transcriptomic breakpoint to consider it as a 
+                                     related event. For events inside genes, the 
+                                     distance is added to the end of the gene; for 
+                                     intergenic events, the distance threshold is 
+                                     applied as is. Default: 100000 
+
+ -s STRANDEDNESS  Whether a strand-specific protocol was used for library preparation, 
+                  and if so, the type of strandedness (auto/yes/no/reverse). When 
+                  unstranded data is processed, the strand can sometimes be inferred from 
+                  splice-patterns. But in unclear situations, stranded data helps 
+                  resolve ambiguities. Default: auto 
+
+ -i CONTIGS  Comma-/space-separated list of interesting contigs. Fusions between genes 
+             on other contigs are ignored. Cfontigs can be specified with or without the 
+             prefix "chr". Asterisks (*) are treated as wild-cards. 
+             Default: 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 X Y AC_* NC_* 
+
+ -v CONTIGS  Comma-/space-separated list of viral contigs. Asterisks (*) are treated as 
+             wild-cards. 
+             Default: AC_* NC_* 
+
+ -f FILTERS  Comma-/space-separated list of filters to disable. By default all filters are 
+             enabled. Valid values: homologs, low_entropy, isoforms, 
+             top_expressed_viral_contigs, viral_contigs, uninteresting_contigs, 
+             non_coding_neighbors, mismatches, duplicates, no_genomic_support, 
+             genomic_support, intronic, end_to_end, relative_support, 
+             low_coverage_viral_contigs, merge_adjacent, mismappers, multimappers, 
+             same_gene, long_gap, internal_tandem_duplication, small_insert_size, 
+             read_through, inconsistently_clipped, intragenic_exonic, 
+             marginal_read_through, spliced, hairpin, blacklist, min_support, 
+             select_best, in_vitro, short_anchor, known_fusions, no_coverage, 
+             homopolymer, many_spliced 
+
+ -E MAX_E-VALUE  Arriba estimates the number of fusions with a given number of supporting 
+                 reads which one would expect to see by random chance. If the expected number 
+                 of fusions (e-value) is higher than this threshold, the fusion is 
+                 discarded by the 'relative_support' filter. Note: Increasing this 
+                 threshold can dramatically increase the number of false positives and may 
+                 increase the runtime of resource-intensive steps. Fractional values are 
+                 possible. Default: 0.300000 
+
+ -S MIN_SUPPORTING_READS  The 'min_support' filter discards all fusions with fewer than 
+                          this many supporting reads (split reads and discordant mates 
+                          combined). Default: 2 
+
+ -m MAX_MISMAPPERS  When more than this fraction of supporting reads turns out to be 
+                    mismappers, the 'mismappers' filter discards the fusion. Default: 
+                    0.800000 
+
+ -L MAX_HOMOLOG_IDENTITY  Genes with more than the given fraction of sequence identity are 
+                          considered homologs and removed by the 'homologs' filter. 
+                          Default: 0.300000 
+
+ -H HOMOPOLYMER_LENGTH  The 'homopolymer' filter removes breakpoints adjacent to 
+                        homopolymers of the given length or more. Default: 6 
+
+ -R READ_THROUGH_DISTANCE  The 'read_through' filter removes read-through fusions 
+                           where the breakpoints are less than the given distance away 
+                           from each other. Default: 10000 
+
+ -A MIN_ANCHOR_LENGTH  Alignment artifacts are often characterized by split reads coming 
+                       from only one gene and no discordant mates. Moreover, the split 
+                       reads only align to a short stretch in one of the genes. The 
+                       'short_anchor' filter removes these fusions. This parameter sets 
+                       the threshold in bp for what the filter considers short. Default: 23 
+
+ -M MANY_SPLICED_EVENTS  The 'many_spliced' filter recovers fusions between genes that 
+                         have at least this many spliced breakpoints. Default: 4 
+
+ -K MAX_KMER_CONTENT  The 'low_entropy' filter removes reads with repetitive 3-mers. If 
+                      the 3-mers make up more than the given fraction of the sequence, then 
+                      the read is discarded. Default: 0.600000 
+
+ -V MAX_MISMATCH_PVALUE  The 'mismatches' filter uses a binomial model to calculate a 
+                         p-value for observing a given number of mismatches in a read. If 
+                         the number of mismatches is too high, the read is discarded. 
+                         Default: 0.010000 
+
+ -F FRAGMENT_LENGTH  When paired-end data is given, the fragment length is estimated 
+                     automatically and this parameter has no effect. But when single-end 
+                     data is given, the mean fragment length should be specified to 
+                     effectively filter fusions that arise from hairpin structures. 
+                     Default: 200 
+
+ -U MAX_READS  Subsample fusions with more than the given number of supporting reads. This 
+               improves performance without compromising sensitivity, as long as the 
+               threshold is high. Counting of supporting reads beyond the threshold is 
+               inaccurate, obviously. Default: 300 
+
+ -Q QUANTILE  Highly expressed genes are prone to produce artifacts during library 
+              preparation. Genes with an expression above the given quantile are eligible 
+              for filtering by the 'in_vitro' filter. Default: 0.998000 
+
+ -e EXONIC_FRACTION  The breakpoints of false-positive predictions of intragenic events 
+                     are often both in exons. True predictions are more likely to have at 
+                     least one breakpoint in an intron, because introns are larger. If the 
+                     fraction of exonic sequence between two breakpoints is smaller than 
+                     the given fraction, the 'intragenic_exonic' filter discards the 
+                     event. Default: 0.330000 
+
+ -T TOP_N  Only report viral integration sites of the top N most highly expressed viral 
+           contigs. Default: 5 
+
+ -C COVERED_FRACTION  Ignore virally associated events if the virus is not fully 
+                      expressed, i.e., less than the given fraction of the viral contig is 
+                      transcribed. Default: 0.050000 
+
+ -l MAX_ITD_LENGTH  Maximum length of internal tandem duplications. Note: Increasing 
+                    this value beyond the default can impair performance and lead to many 
+                    false positives. Default: 100 
+
+ -z MIN_ITD_ALLELE_FRACTION  Required fraction of supporting reads to report an internal 
+                             tandem duplication. Default: 0.070000 
+
+ -Z MIN_ITD_SUPPORTING_READS  Required absolute number of supporting reads to report an 
+                              internal tandem duplication. Default: 10 
+
+ -u  Instead of performing duplicate marking itself, Arriba relies on duplicate marking by a 
+     preceding program using the BAM_FDUP flag. This makes sense when unique molecular 
+     identifiers (UMI) are used. 
+
+ -X  To reduce the runtime and file size, by default, the columns 'fusion_transcript', 
+     'peptide_sequence', and 'read_identifiers' are left empty in the file containing 
+     discarded fusion candidates (see parameter -O). When this flag is set, this extra 
+     information is reported in the discarded fusions file. 
+
+ -I  If assembly of the fusion transcript sequence from the supporting reads is incomplete 
+     (denoted as '...'), fill the gaps using the assembly sequence wherever possible. 
+
+ -h  Print help and exit. 
+
+         Code repository: https://github.com/suhrig/arriba
+    Get help/report bugs: https://github.com/suhrig/arriba/issues
+             User manual: https://arriba.readthedocs.io/
+             Please cite: https://doi.org/10.1101/gr.257246.119
\ No newline at end of file
diff --git a/src/arriba/script.sh b/src/arriba/script.sh
new file mode 100644
index 00000000..b0a9af8e
--- /dev/null
+++ b/src/arriba/script.sh
@@ -0,0 +1,47 @@
+#!/bin/bash
+
+## VIASH START
+## VIASH END
+
+[[ "$par_skip_duplicate_marking" == "false" ]] && unset par_skip_duplicate_marking
+[[ "$par_extra_information" == "false" ]] && unset par_extra_information
+[[ "$par_fill_gaps" == "false" ]] && unset par_fill_gaps
+
+arriba \
+  -x "$par_bam" \
+  -a "$par_genome" \
+  -g "$par_gene_annotation" \
+  -o "$par_fusions" \
+  ${par_known_fusions:+-k "${par_known_fusions}"} \
+  ${par_blacklist:+-b "${par_blacklist}"} \
+  ${par_structural_variants:+-d "${par_structural_variants}"} \
+  ${par_tags:+-t "${par_tags}"} \
+  ${par_protein_domains:+-p "${par_protein_domains}"} \
+  ${par_fusions_discarded:+-O "${par_fusions_discarded}"} \
+  ${par_max_genomic_breakpoint_distance:+-D "${par_max_genomic_breakpoint_distance}"} \
+  ${par_strandedness:+-s "${par_strandedness}"} \
+  ${par_interesting_contigs:+-i "${par_interesting_contigs}"} \
+  ${par_viral_contigs:+-v "${par_viral_contigs}"} \
+  ${par_disable_filters:+-f "${par_disable_filters}"} \
+  ${par_max_e_value:+-E "${par_max_e_value}"} \
+  ${par_min_supporting_reads:+-S "${par_min_supporting_reads}"} \
+  ${par_max_mismappers:+-m "${par_max_mismappers}"} \
+  ${par_max_homolog_identity:+-L "${par_max_homolog_identity}"} \
+  ${par_homopolymer_length:+-H "${par_homopolymer_length}"} \
+  ${par_read_through_distance:+-R "${par_read_through_distance}"} \
+  ${par_min_anchor_length:+-A "${par_min_anchor_length}"} \
+  ${par_many_spliced_events:+-M "${par_many_spliced_events}"} \
+  ${par_max_kmer_content:+-K "${par_max_kmer_content}"} \
+  ${par_max_mismatch_pvalue:+-V "${par_max_mismatch_pvalue}"} \
+  ${par_fragment_length:+-F "${par_fragment_length}"} \
+  ${par_max_reads:+-U "${par_max_reads}"} \
+  ${par_quantile:+-Q "${par_quantile}"} \
+  ${par_exonic_fraction:+-e "${par_exonic_fraction}"} \
+  ${par_top_n:+-T "${par_top_n}"} \
+  ${par_covered_fraction:+-C "${par_covered_fraction}"} \
+  ${par_max_itd_length:+-l "${par_max_itd_length}"} \
+  ${par_min_itd_allele_fraction:+-z "${par_min_itd_allele_fraction}"} \
+  ${par_min_itd_supporting_reads:+-Z "${par_min_itd_supporting_reads}"} \
+  ${par_skip_duplicate_marking:+-u} \
+  ${par_extra_information:+-X} \
+  ${par_fill_gaps:+-I}
diff --git a/src/arriba/test.sh b/src/arriba/test.sh
new file mode 100644
index 00000000..7f1243d2
--- /dev/null
+++ b/src/arriba/test.sh
@@ -0,0 +1,45 @@
+#!/bin/bash
+
+set -e
+
+dir_in="$meta_resources_dir/test_data"
+
+echo "> Run arriba with blacklist"
+"$meta_executable" \
+  --bam "$dir_in/A.bam" \
+  --genome "$dir_in/genome.fasta" \
+  --gene_annotation "$dir_in/annotation.gtf" \
+  --blacklist "$dir_in/blacklist.tsv" \
+  --fusions "fusions.tsv" \
+  --fusions_discarded "fusions_discarded.tsv" \
+  --interesting_contigs "1,2"
+
+echo ">> Checking output"
+[ ! -f "fusions.tsv" ] && echo "Output file fusions.tsv does not exist" && exit 1
+[ ! -f "fusions_discarded.tsv" ] && echo "Output file fusions_discarded.tsv does not exist" && exit 1
+
+echo ">> Check if output is empty"
+[ ! -s "fusions.tsv" ] && echo "Output file fusions.tsv is empty" && exit 1
+[ ! -s "fusions_discarded.tsv" ] && echo "Output file fusions_discarded.tsv is empty" && exit 1
+
+rm fusions.tsv fusions_discarded.tsv
+
+echo "> Run arriba without blacklist"
+"$meta_executable" \
+  --bam "$dir_in/A.bam" \
+  --genome "$dir_in/genome.fasta" \
+  --gene_annotation "$dir_in/annotation.gtf" \
+  --fusions "fusions.tsv" \
+  --fusions_discarded "fusions_discarded.tsv" \
+  --interesting_contigs "1,2" \
+  --disable_filters blacklist
+
+echo ">> Checking output"
+[ ! -f "fusions.tsv" ] && echo "Output file fusions.tsv does not exist" && exit 1
+[ ! -f "fusions_discarded.tsv" ] && echo "Output file fusions_discarded.tsv does not exist" && exit 1
+
+echo ">> Check if output is empty"
+[ ! -s "fusions.tsv" ] && echo "Output file fusions.tsv is empty" && exit 1
+[ ! -s "fusions_discarded.tsv" ] && echo "Output file fusions_discarded.tsv is empty" && exit 1
+
+echo "> Test successful"
\ No newline at end of file
diff --git a/src/arriba/test_data/A.bam b/src/arriba/test_data/A.bam
new file mode 100644
index 0000000000000000000000000000000000000000..57511ab3537e519b0b82cc84d30637c43db5801e
GIT binary patch
literal 296
zcmb2|=3rp}f&Xj_PR>jW4GhIaUsBJcB_tGlD0s;8d9%?K<A+5Yd{21Tj4W&atkK~~
zh*6d>KQQ<5<)c~_M+`fR5AZuNn@YW$`C3w~$m(~4>kgSYe=WVcgS$m0eJ(B*{v0Gy
zWb`>`t;m5QABWctHzp`DoHAnC%-}Jhqp=%kuRNLqx)@fcG%)xnq;3Bwl9ZzHWI{{V
zO6SacW@csY;x^769v(f91CGMtA3hhPefdz5R>D&zcH(ns+6U%G3CA^*WtEv_b7hr-
z<7JJRWtp?vycRrkaF%R&I8EKy!|CXmh9BJaT~C`l92z7}mn%f(tw`?3vSeueGBM~E
N7N<xvFat#)0093nXVU-x

literal 0
HcmV?d00001

diff --git a/src/arriba/test_data/README.md b/src/arriba/test_data/README.md
new file mode 100644
index 00000000..853beeb2
--- /dev/null
+++ b/src/arriba/test_data/README.md
@@ -0,0 +1,8 @@
+# arriba test data
+
+Test data was obtained from https://github.com/snakemake/snakemake-wrappers/tree/master/bio/arriba/test.
+
+__author__ = "Jan Forster"
+__copyright__ = "Copyright 2019, Jan Forster"
+__email__ = "j.forster@dkfz.de"
+__license__ = "MIT"
\ No newline at end of file
diff --git a/src/arriba/test_data/annotation.gtf b/src/arriba/test_data/annotation.gtf
new file mode 100644
index 00000000..22b3a67a
--- /dev/null
+++ b/src/arriba/test_data/annotation.gtf
@@ -0,0 +1,6 @@
+1	havana	gene	1	80	.	+	.	gene_id "ENSG00000000000"; gene_version "5"; gene_name "A"; gene_source "havana"; gene_biotype "gene";
+1	havana	transcript	1	80	.	+	.	gene_id "ENSG00000000000"; gene_version "5"; transcript_id "ENST00000000000"; transcript_version "2"; gene_name "A"; gene_source "havana"; gene_biotype "gene"; transcript_name "A-202"; transcript_source "havana"; transcript_biotype "processed_transcript"; tag "basic"; transcript_support_level "1";
+1	havana	exon	1	80	.	+	.	gene_id "ENSG00000000000"; gene_version "5"; transcript_id "ENST00000000000"; transcript_version "2"; exon_number "1"; gene_name "A"; gene_source "havana"; gene_biotype "gene"; transcript_name "A-202"; transcript_source "havana"; transcript_biotype "processed_transcript"; exon_id "ENSE00000000000"; exon_version "1"; tag "basic"; transcript_support_level "1";
+2	havana	gene	1	80	.	+	.	gene_id "ENSG00000000001"; gene_version "5"; gene_name "B"; gene_source "havana"; gene_biotype "gene";
+2	havana	transcript	1	80	.	+	.	gene_id "ENSG00000000001"; gene_version "5"; transcript_id "ENST00000000001"; transcript_version "2"; gene_name "B"; gene_source "havana"; gene_biotype "gene"; transcript_name "B-202"; transcript_source "havana"; transcript_biotype "processed_transcript"; tag "basic"; transcript_support_level "1";
+2	havana	exon	1	80	.	+	.	gene_id "ENSG00000000001"; gene_version "5"; transcript_id "ENST00000000001"; transcript_version "2"; exon_number "1"; gene_name "B"; gene_source "havana"; gene_biotype "gene"; transcript_name "B-202"; transcript_source "havana"; transcript_biotype "processed_transcript"; exon_id "ENSE00000000001"; exon_version "1"; tag "basic"; transcript_support_level "1";
diff --git a/src/arriba/test_data/blacklist.tsv b/src/arriba/test_data/blacklist.tsv
new file mode 100644
index 00000000..e69de29b
diff --git a/src/arriba/test_data/genome.fasta b/src/arriba/test_data/genome.fasta
new file mode 100644
index 00000000..91ea0d37
--- /dev/null
+++ b/src/arriba/test_data/genome.fasta
@@ -0,0 +1,4 @@
+>1
+GGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGG
+>2
+AAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAA

From fb6cf022e35a320e2372d661648ee1f5167eab17 Mon Sep 17 00:00:00 2001
From: Robrecht Cannoodt <rcannood@gmail.com>
Date: Fri, 19 Jan 2024 17:03:46 +0100
Subject: [PATCH 2/5] document command

---
 src/arriba/test_data/README.md | 7 ++++++-
 1 file changed, 6 insertions(+), 1 deletion(-)

diff --git a/src/arriba/test_data/README.md b/src/arriba/test_data/README.md
index 853beeb2..8cfb28f6 100644
--- a/src/arriba/test_data/README.md
+++ b/src/arriba/test_data/README.md
@@ -5,4 +5,9 @@ Test data was obtained from https://github.com/snakemake/snakemake-wrappers/tree
 __author__ = "Jan Forster"
 __copyright__ = "Copyright 2019, Jan Forster"
 __email__ = "j.forster@dkfz.de"
-__license__ = "MIT"
\ No newline at end of file
+__license__ = "MIT"
+
+```bash
+git clone --depth 1 --single-branch --branch master https://github.com/snakemake/snakemake-wrappers /tmp/snakemake-wrappers
+cp -r /tmp/snakemake-wrappers/bio/arriba/test/* src/arriba/test_data
+```
\ No newline at end of file

From 57e506fa37953850edeec84a059f21d67ff06a4f Mon Sep 17 00:00:00 2001
From: Robrecht Cannoodt <rcannood@gmail.com>
Date: Fri, 26 Jan 2024 13:44:57 +0100
Subject: [PATCH 3/5] add gitignore

---
 .gitignore | 17 +++++++++++++++++
 1 file changed, 17 insertions(+)
 create mode 100644 .gitignore

diff --git a/.gitignore b/.gitignore
new file mode 100644
index 00000000..ca5262bc
--- /dev/null
+++ b/.gitignore
@@ -0,0 +1,17 @@
+*.DS_Store
+*__pycache__
+
+# IDE ignores
+.idea/
+
+# R specific ignores
+.Rhistory
+.Rproj.user
+*.Rproj
+
+# viash specific ignores
+target/
+
+# nextflow specific ignores
+.nextflow*
+work

From d0461dbe98b89d871bfb332aa3f592118372ba2e Mon Sep 17 00:00:00 2001
From: Robrecht Cannoodt <rcannood@gmail.com>
Date: Fri, 26 Jan 2024 14:07:47 +0100
Subject: [PATCH 4/5] add arriba test data script

---
 src/arriba/test_data/README.md | 13 -------------
 src/arriba/test_data/script.sh | 10 ++++++++++
 2 files changed, 10 insertions(+), 13 deletions(-)
 delete mode 100644 src/arriba/test_data/README.md
 create mode 100755 src/arriba/test_data/script.sh

diff --git a/src/arriba/test_data/README.md b/src/arriba/test_data/README.md
deleted file mode 100644
index 8cfb28f6..00000000
--- a/src/arriba/test_data/README.md
+++ /dev/null
@@ -1,13 +0,0 @@
-# arriba test data
-
-Test data was obtained from https://github.com/snakemake/snakemake-wrappers/tree/master/bio/arriba/test.
-
-__author__ = "Jan Forster"
-__copyright__ = "Copyright 2019, Jan Forster"
-__email__ = "j.forster@dkfz.de"
-__license__ = "MIT"
-
-```bash
-git clone --depth 1 --single-branch --branch master https://github.com/snakemake/snakemake-wrappers /tmp/snakemake-wrappers
-cp -r /tmp/snakemake-wrappers/bio/arriba/test/* src/arriba/test_data
-```
\ No newline at end of file
diff --git a/src/arriba/test_data/script.sh b/src/arriba/test_data/script.sh
new file mode 100755
index 00000000..e5bbcf2c
--- /dev/null
+++ b/src/arriba/test_data/script.sh
@@ -0,0 +1,10 @@
+# arriba test data
+
+# Test data was obtained from https://github.com/snakemake/snakemake-wrappers/tree/master/bio/arriba/test
+
+if [ ! -d /tmp/snakemake-wrappers ]; then
+  git clone --depth 1 --single-branch --branch master https://github.com/snakemake/snakemake-wrappers /tmp/snakemake-wrappers
+fi
+
+cp -r /tmp/snakemake-wrappers/bio/arriba/test/* src/arriba/test_data
+

From b3573a2926bf0a2a5ec78933cf47639909f124dc Mon Sep 17 00:00:00 2001
From: Robrecht Cannoodt <rcannood@gmail.com>
Date: Fri, 26 Jan 2024 16:31:20 +0100
Subject: [PATCH 5/5] Add fastp component (#3)

* initialise fastp component

* add arguments to config

* add test and script

* update help

* update changelog

* fix test data script

* address comments

* Apply suggestions from code review

Co-authored-by: Dries Schaumont <5946712+DriesSchaumont@users.noreply.github.com>

* extend test

---------

Co-authored-by: Dries Schaumont <5946712+DriesSchaumont@users.noreply.github.com>
---
 CHANGELOG.md                     |   2 +
 src/fastp/config.vsh.yaml        | 574 +++++++++++++++++++++++++++++++
 src/fastp/help.txt               |  93 +++++
 src/fastp/script.sh              | 105 ++++++
 src/fastp/test.sh                |  74 ++++
 src/fastp/test_data/pe/a.1.fastq |   4 +
 src/fastp/test_data/pe/a.2.fastq |   4 +
 src/fastp/test_data/script.sh    |  10 +
 src/fastp/test_data/se/a.fastq   |   4 +
 9 files changed, 870 insertions(+)
 create mode 100644 src/fastp/config.vsh.yaml
 create mode 100644 src/fastp/help.txt
 create mode 100644 src/fastp/script.sh
 create mode 100644 src/fastp/test.sh
 create mode 100644 src/fastp/test_data/pe/a.1.fastq
 create mode 100644 src/fastp/test_data/pe/a.2.fastq
 create mode 100755 src/fastp/test_data/script.sh
 create mode 100644 src/fastp/test_data/se/a.fastq

diff --git a/CHANGELOG.md b/CHANGELOG.md
index d2610ff9..63f69365 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -6,6 +6,8 @@
 
 * `arriba`: Detect gene fusions from RNA-seq data (PR #1).
 
+* `fastp`: An ultra-fast all-in-one FASTQ preprocessor (PR #3).
+
 ## MAJOR CHANGES
 
 ## MINOR CHANGES
diff --git a/src/fastp/config.vsh.yaml b/src/fastp/config.vsh.yaml
new file mode 100644
index 00000000..c513ec92
--- /dev/null
+++ b/src/fastp/config.vsh.yaml
@@ -0,0 +1,574 @@
+functionality:
+  name: fastp
+  description: |
+    An ultra-fast all-in-one FASTQ preprocessor (QC/adapters/trimming/filtering/splitting/merging...).
+
+    Features:
+
+      - comprehensive quality profiling for both before and after filtering data (quality curves, base contents, KMER, Q20/Q30, GC Ratio, duplication, adapter contents...)
+      - filter out bad reads (too low quality, too short, or too many N...)
+      - cut low quality bases for per read in its 5' and 3' by evaluating the mean quality from a sliding window (like Trimmomatic but faster).
+      - trim all reads in front and tail
+      - cut adapters. Adapter sequences can be automatically detected, which means you don't have to input the adapter sequences to trim them.
+      - correct mismatched base pairs in overlapped regions of paired end reads, if one base is with high quality while the other is with ultra low quality
+      - trim polyG in 3' ends, which is commonly seen in NovaSeq/NextSeq data. Trim polyX in 3' ends to remove unwanted polyX tailing (i.e. polyA tailing for mRNA-Seq data)
+      - preprocess unique molecular identifier (UMI) enabled data, shift UMI to sequence name.
+      - report JSON format result for further interpreting.
+      - visualize quality control and filtering results on a single HTML page (like FASTQC but faster and more informative).
+      - split the output to multiple files (0001.R1.gz, 0002.R1.gz...) to support parallel processing. Two modes can be used, limiting the total split file number, or limitting the lines of each split file.
+      - support long reads (data from PacBio / Nanopore devices).
+      - support reading from STDIN and writing to STDOUT
+      - support interleaved input
+      - support ultra-fast FASTQ-level deduplication
+  info:
+    keywords: [RNA-Seq, Trimming, Quality control]
+    repository: https://github.com/OpenGene/fastp
+    documentation: https://github.com/OpenGene/fastp/blob/master/README.md
+    reference: "doi:10.1093/bioinformatics/bty560"
+    licence: MIT
+  argument_groups:
+    - name: Inputs
+      description: |
+        `fastp` supports both single-end (SE) and paired-end (PE) input.
+
+        - for SE data, you only have to specify read1 input by `-i` or `--in1`.
+        - for PE data, you should also specify read2 input by `-I` or `--in2`.
+      arguments:
+        - name: --in1
+          alternatives: [-i]
+          type: file
+          description: Input FastQ file. Must be single-end or paired-end R1. Can be gzipped.
+          required: true
+          example: in.R1.fq.gz
+        - name: --in2
+          alternatives: [-I]
+          type: file
+          description: Input FastQ file. Must be paired-end R2. Can be gzipped.
+          required: false
+          example: in.R2.fq.gz
+    - name: Outputs
+      description: |
+
+        - for SE data, you only have to specify read1 output by `-o` or `--out1`.
+        - for PE data, you should also specify read2 output by `-O` or `--out2`.
+        - if you don't specify the output file names, no output files will be written, but the QC will still be done for both data before and after filtering.
+        - the output will be gzip-compressed if its file name ends with `.gz`
+      arguments:
+        - name: --out1
+          alternatives: [-o]
+          type: file
+          description: The single-end or paired-end R1 reads that pass QC. Will be gzipped if its file name ends with `.gz`.
+          required: true
+          example: out.R1.fq.gz
+          direction: output
+        - name: --out2
+          alternatives: [-O]
+          type: file
+          description: The paired-end R2 reads that pass QC. Will be gzipped if its file name ends with `.gz`.
+          required: false
+          example: out.R2.fq.gz
+          direction: output
+        - name: --unpaired1
+          type: file
+          description: Store the reads that `read1` passes filters but its paired `read2` doesn't.
+          required: false
+          example: unpaired.R1.fq.gz
+          direction: output
+        - name: --unpaired2
+          type: file
+          description: Store the reads that `read2` passes filters but its paired `read1` doesn't.
+          required: false
+          example: unpaired.R2.fq.gz
+          direction: output
+        - name: --failed_out
+          type: file
+          description: |
+            Store the reads that fail filters.
+
+            If one read failed and is written to --failed_out, its failure reason will be appended to its read name. For example, failed_quality_filter, failed_too_short etc.
+            For PE data, if unpaired reads are not stored (by giving --unpaired1 or --unpaired2), the failed pair of reads will be put together. If one read passes the filters but its pair doesn't, the failure reason will be paired_read_is_failing.
+          required: false
+          example: failed.fq.gz
+          direction: output
+        - name: --overlapped_out
+          type: file
+          description: |
+            For each read pair, output the overlapped region if it has no any mismatched base.
+          direction: output
+    - name: Report output arguments
+      arguments:
+        - name: --json
+          alternatives: [-j]
+          type: file
+          description: |
+            The json format report file name
+          example: out.json
+          direction: output
+        - name: --html
+          type: file
+          description: |
+            The html format report file name
+          example: out.html
+          direction: output
+        - name: --report_title
+          type: string
+          description: |
+            The title of the html report, default is "fastp report".
+          example: fastp report
+    - name: Adapter trimming
+      description: |
+        Adapter trimming is enabled by default, but you can disable it by `-A` or `--disable_adapter_trimming`. Adapter sequences can be automatically detected for both PE/SE data.
+
+        - For SE data, the adapters are evaluated by analyzing the tails of first ~1M reads. This evaluation may be inacurrate, and you can specify the adapter sequence by `-a` or `--adapter_sequence` option. If adapter sequence is specified, the auto detection for SE data will be disabled.
+        - For PE data, the adapters can be detected by per-read overlap analysis, which seeks for the overlap of each pair of reads. This method is robust and fast, so normally you don't have to input the adapter sequence even you know it. But you can still specify the adapter sequences for read1 by `--adapter_sequence`, and for read2 by `--adapter_sequence_r2`. If `fastp` fails to find an overlap (i.e. due to low quality bases), it will use these sequences to trim adapters for read1 and read2 respectively.
+        - For PE data, the adapter sequence auto-detection is disabled by default since the adapters can be trimmed by overlap analysis. However, you can specify `--detect_adapter_for_pe` to enable it.
+        - For PE data, `fastp` will run a little slower if you specify the sequence adapters or enable adapter auto-detection, but usually result in a slightly cleaner output, since the overlap analysis may fail due to sequencing errors or adapter dimers.
+        - The most widely used adapter is the Illumina TruSeq adapters. If your data is from the TruSeq library, you can add `--adapter_sequence=AGATCGGAAGAGCACACGTCTGAACTCCAGTCA --adapter_sequence_r2=AGATCGGAAGAGCGTCGTGTAGGGAAAGAGTGT` to your command lines, or enable auto detection for PE data by specifing `detect_adapter_for_pe`.
+        - `fastp` contains some built-in known adapter sequences for better auto-detection. If you want to make some adapters to be a part of the built-in adapters, please file an issue.
+
+        You can also specify --adapter_fasta to give a FASTA file to tell fastp to trim multiple adapters in this FASTA file. Here is a sample of such adapter FASTA file:
+
+        ```
+        >Illumina TruSeq Adapter Read 1
+        AGATCGGAAGAGCACACGTCTGAACTCCAGTCA
+        >Illumina TruSeq Adapter Read 2
+        AGATCGGAAGAGCGTCGTGTAGGGAAAGAGTGT
+        >polyA
+        AAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAA
+        ```
+
+        The adapter sequence in this file should be at least 6bp long, otherwise it will be skipped. And you can give whatever you want to trim, rather than regular sequencing adapters (i.e. polyA).
+
+        `fastp` first trims the auto-detected adapter or the adapter sequences given by `--adapter_sequence | --adapter_sequence_r2`, then trims the adapters given by `--adapter_fasta` one by one.
+
+        The sequence distribution of trimmed adapters can be found at the HTML/JSON reports.
+      arguments:
+        - name: --disable_adapter_trimming
+          alternatives: [-A]
+          type: boolean_true
+          description: |
+            Disable adapter trimming.
+        - name: --detect_adapter_for_pe
+          type: boolean_true
+          description: |
+            By default, the auto-detection for adapter is for SE data input only, turn on this option to enable it for PE data.
+        - name: --adapter_sequence
+          alternatives: [-a]
+          type: string
+          description: |
+            The adapter sequences to be trimmed. For SE data, if not specified, the adapters will be auto-detected. For PE data, this is used if R1/R2 are found not overlapped
+        - name: --adapter_sequence_r2
+          type: string
+          description: |
+            The adapter sequences to be trimmed for R2. This is used for PE data if R1/R2 are found overlapped.
+        - name: --adapter_fasta
+          type: file
+          description: |
+            A FASTA file containing all the adapter sequences to be trimmed. For SE data, if not specified, the adapters will be auto-detected. For PE data, this is used if R1/R2 are found not overlapped.
+    - name: Base trimming
+      arguments:
+        - name: --trim_front1
+          alternatives: [-f]
+          type: integer
+          description: |
+            Trimming how many bases in front for read1, default is 0.
+          example: 0
+        - name: --trim_tail1
+          alternatives: [-t]
+          type: integer
+          description: |
+            Trimming how many bases in tail for read1, default is 0.
+          example: 0
+        - name: --max_len1
+          alternatives: [-b]
+          type: integer
+          min: 0
+          description: |
+            If read1 is longer than max_len1, then trim read1 at its tail to make it as long as max_len1. Default 0 means no limitation.
+        - name: --trim_front2
+          alternatives: [-F]
+          type: integer
+          description: |
+            Trimming how many bases in front for read2, default is 0.
+          example: 0
+        - name: --trim_tail2
+          alternatives: [-T]
+          type: integer
+          description: |
+            Trimming how many bases in tail for read2, default is 0.
+          example: 0
+        - name: --max_len2
+          alternatives: [-B]
+          type: integer
+          min: 0
+          description: |
+            If read2 is longer than max_len2, then trim read2 at its tail to make it as long as max_len2. Default 0 means no limitation.
+    - name: Merging mode
+      description: Allows merging paired-end reads into a single longer read if they are overlapping.
+      arguments:
+        - name: --merge
+          alternatives: [-m]
+          type: boolean_true
+          description: |
+            For paired-end input, merge each pair of reads into a single read if they are overlapped. The merged reads will be written to the file given by --merged_out, the unmerged reads will be written to the files specified by --out1 and --out2. The merging mode is disabled by default.
+        - name: --merged_out
+          type: file
+          description: |
+            In the merging mode, specify the file name to store merged output, or specify --stdout to stream the merged output.
+          direction: output
+          example: merged.fq.gz
+        - name: --include_unmerged
+          type: boolean_true
+          description: |
+            In the merging mode, write the unmerged or unpaired reads to the file specified by --merge. Disabled by default.
+    - name: Additional input arguments
+      description: Affects how the input is read.
+      arguments:
+        - name: --interleaved_in
+          type: boolean_true
+          description: |
+            Indicate that <in1> is an interleaved FASTQ which contains both read1 and read2. Disabled by default.
+        - name: --fix_mgi_id
+          type: boolean_true
+          description: |
+            The MGI FASTQ ID format is not compatible with many BAM operation tools, enable this option to fix it.
+        - name: --phred64
+          alternatives: ["-6"]
+          type: boolean_true
+          description: |
+            Indicate the input is using phred64 scoring (it'll be converted to phred33, so the output will still be phred33)
+    - name: Additional output arguments
+      description: Affects how the output is written.
+      arguments:
+        - name: --compression
+          alternatives: ["-z"]
+          type: integer
+          description: |
+            Compression level for gzip output (1 ~ 9). 1 is fastest, 9 is smallest, default is 4.
+          example: 4
+          min: 1
+          max: 9
+        - name: --dont_overwrite
+          type: boolean_true
+          description: |
+            Don't overwrite existing files. Overwritting is allowed by default.
+    - name: Logging arguments
+      arguments:
+        - name: --verbose
+          alternatives: [-V]
+          type: boolean_true
+          description: Output verbose log information (i.e. when every 1M reads are processed).
+    - name: Processing arguments
+      arguments:
+        - name: --reads_to_process
+          type: long
+          description: |
+            Specify how many reads/pairs to be processed. Default 0 means process all reads.
+          example: 1000000
+          min: 0
+    - name: Deduplication arguments
+      arguments:
+        - name: --dedup
+          type: boolean_true
+          description: |
+            Enable deduplication to drop the duplicated reads/pairs
+        - name: --dup_calc_accuracy
+          type: integer
+          description: |
+            Accuracy level to calculate duplication (1~6). Higher level uses more memory (1G, 2G, 4G, 8G, 16G, 24G). Default 1 for no-dedup mode, and 3 for dedup mode.
+          example: 3
+          min: 1
+          max: 6
+        - name: --dont_eval_duplication
+          type: boolean_true
+          description: |
+            Don't evaluate duplication rate to save time and use less memory.
+    - name: PolyG tail trimming arguments
+      arguments:
+        - name: --trim_poly_g
+          alternatives: [-g]
+          type: boolean_true
+          description: |
+            Force polyG tail trimming, by default trimming is automatically enabled for Illumina NextSeq/NovaSeq data
+        - name: --poly_g_min_len
+          type: integer
+          description: |
+            The minimum length to detect polyG in the read tail. 10 by default.
+          example: 10
+          min: 1
+        - name: --disable_trim_poly_g
+          alternatives: [-G]
+          type: boolean_true
+          description: |
+            Disable polyG tail trimming, by default trimming is automatically enabled for Illumina NextSeq/NovaSeq data
+    - name: PolyX tail trimming arguments
+      arguments:
+        - name: --trim_poly_x
+          alternatives: [-x]
+          type: boolean_true
+          description: |
+            Enable polyX trimming in 3' ends.
+        - name: --poly_x_min_len
+          type: integer
+          description: |
+            The minimum length to detect polyX in the read tail. 10 by default.
+          example: 10
+          min: 1
+    - name: Cut arguments
+      arguments:
+        - name: --cut_front
+          alternatives: ["-5"]
+          type: integer
+          description: |
+            Move a sliding window from front (5') to tail, drop the bases in the window if its mean quality < threshold, stop otherwise.
+        - name: --cut_tail
+          alternatives: ["-3"]
+          type: integer
+          description: |
+            Move a sliding window from tail (3') to front, drop the bases in the window if its mean quality < threshold, stop otherwise.
+        - name: --cut_right
+          alternatives: ["-r"]
+          type: integer
+          description: |
+            Move a sliding window from front to tail, if meet one window with mean quality < threshold, drop the bases in the window and the right part, and then stop.
+        - name: --cut_window_size
+          alternatives: ["-W"]
+          type: integer
+          description: |
+            The window size option shared by cut_front, cut_tail or cut_sliding. Range: 1~1000, default: 4.
+          example: 4
+          min: 1
+        - name: --cut_mean_quality
+          alternatives: ["-M"]
+          type: integer
+          description: |
+            The mean quality requirement option shared by cut_front, cut_tail or cut_sliding. Range: 1~36 default: 20 (Q20)
+          example: 20
+          min: 0
+        - name: --cut_front_window_size
+          type: integer
+          description: |
+            The window size option of cut_front, default to cut_window_size if not specified.
+          example: 4
+          min: 1
+        - name: --cut_front_mean_quality
+          type: integer
+          description: |
+            The mean quality requirement option of cut_front, default to cut_mean_quality if not specified.
+          example: 20
+          min: 0
+        - name: --cut_tail_window_size
+          type: integer
+          description: |
+            The window size option of cut_tail, default to cut_window_size if not specified.
+          example: 4
+          min: 1
+        - name: --cut_tail_mean_quality
+          type: integer
+          description: |
+            The mean quality requirement option of cut_tail, default to cut_mean_quality if not specified.
+          example: 20
+          min: 0
+        - name: --cut_right_window_size
+          type: integer
+          description: |
+            The window size option of cut_right, default to cut_window_size if not specified.
+          example: 4
+          min: 1
+        - name: --cut_right_mean_quality
+          type: integer
+          description: |
+            The mean quality requirement option of cut_right, default to cut_mean_quality if not specified.
+          example: 20
+          min: 0
+    - name: Quality filtering arguments
+      arguments:
+        - name: --disable_quality_filtering
+          alternatives: [-Q]
+          type: boolean_true
+          description: |
+            Quality filtering is enabled by default. If this option is specified, quality filtering is disabled.
+        - name: --qualified_quality_phred
+          alternatives: [-q]
+          type: integer
+          description: |
+            The quality value that a base is qualified. Default 15 means phred quality >=Q15 is qualified.
+          example: 15
+          min: 0
+        - name: --unqualified_percent_limit
+          alternatives: [-u]
+          type: integer
+          description: |
+            How many percents of bases are allowed to be unqualified (0~100). Default 40 means 40%.
+          example: 40
+          min: 0
+          max: 100
+        - name: --n_base_limit
+          alternatives: [-n]
+          type: integer
+          description: |
+            If one read's number of N base is >n_base_limit, then this read/pair is discarded. Default is 5.
+          example: 5
+          min: 0
+        - name: --average_qual
+          alternatives: [-e]
+          type: integer
+          description: |
+            If one read's average quality score <avg_qual, then this read/pair is discarded. Default 0 means no requirement.
+          example: 0
+          min: 0
+    - name: Length filtering arguments
+      arguments:
+        - name: --disable_length_filtering
+          alternatives: [-L]
+          type: boolean_true
+          description: |
+            Length filtering is enabled by default. If this option is specified, length filtering is disabled.
+        - name: --length_required
+          alternatives: [-l]
+          type: integer
+          description: |
+            Reads shorter than length_required will be discarded, default is 15.
+          example: 15
+          min: 0
+        - name: --length_limit
+          type: integer
+          description: |
+            Reads longer than length_limit will be discarded, default 0 means no limitation.
+          example: 0
+          min: 0
+    - name: Low complexity filtering arguments
+      arguments:
+        - name: --low_complexity_filter
+          alternatives: [-y]
+          type: boolean_true
+          description: |
+            Enable low complexity filter. The complexity is defined as the percentage of base that is different from its next base (base[i] != base[i+1]).
+        - name: --complexity_threshold
+          alternatives: [-Y]
+          type: integer
+          description: |
+            The threshold for low complexity filter (0~100). Default is 30, which means 30% complexity is required.
+          example: 30
+          min: 0
+    - name: Index filtering arguments
+      arguments:
+        - name: --filter_by_index1
+          type: file
+          description: |
+            Specify a file contains a list of barcodes of index1 to be filtered out, one barcode per line.
+        - name: --filter_by_index2
+          type: file
+          description: |
+            Specify a file contains a list of barcodes of index2 to be filtered out, one barcode per line.
+        - name: --filter_by_index_threshold
+          type: integer
+          description: |
+            The allowed difference of index barcode for index filtering, default 0 means completely identical.
+          example: 0
+          min: 0
+    - name: Overlapped region correction
+      arguments:
+        - type: boolean_true
+          name: --correction
+          alternatives: [-c]
+          description: |
+            Enable base correction in overlapped regions (only for PE data), default is disabled.
+        - name: --overlap_len_require
+          type: integer
+          description: |
+            The minimum length to detect overlapped region of PE reads. This will affect overlap analysis based PE merge, adapter trimming and correction. 30 by default.
+          example: 30
+          min: 0
+        - name: --overlap_diff_limit
+          type: integer
+          description: |
+            The maximum number of mismatched bases to detect overlapped region of PE reads. This will affect overlap analysis based PE merge, adapter trimming and correction. 5 by default.
+          example: 5
+          min: 0
+        - name: --overlap_diff_percent_limit
+          type: integer
+          description: |
+            The maximum percentage of mismatched bases to detect overlapped region of PE reads. This will affect overlap analysis based PE merge, adapter trimming and correction. Default 20 means 20%.
+          example: 20
+          min: 0
+          max: 100
+    - name: UMI arguments
+      arguments:
+        - name: --umi
+          alternatives: [-U]
+          type: boolean_true
+          description: |
+            Enable unique molecular identifier (UMI) preprocessing.
+        - name: --umi_loc
+          type: string
+          description: |
+            Specify the location of UMI, can be (index1/index2/read1/read2/per_index/per_read, default is none.
+          choices: [index1, index2, read1, read2, per_index, per_read]
+        - name: --umi_len
+          type: integer
+          description: |
+            If the UMI is in read1/read2, its length should be provided.
+          example: 0
+          min: 0
+        - name: --umi_prefix
+          type: string
+          description: |
+            If specified, an underline will be used to connect prefix and UMI (i.e. prefix=UMI, UMI=AATTCG, final=UMI_AATTCG). No prefix by default.
+        - name: --umi_skip
+          type: integer
+          description: |
+            If the UMI is in read1/read2, fastp can skip several bases following UMI, default is 0.
+          example: 0
+          min: 0
+        - name: --umi_delim
+          type: string
+          description: |
+            If the UMI is in index1/index2, fastp can use a delimiter to separate UMI from the read sequence, default is none.
+    - name: Overrepresentation analysis arguments
+      arguments:
+        - name: --overrepresentation_analysis
+          alternatives: [-p]
+          type: boolean_true
+          description: |
+            Enable overrepresentation analysis.
+        - name: --overrepresentation_sampling
+          type: integer
+          description: |
+            One in (--overrepresentation_sampling) reads will be computed for overrepresentation analysis (1~10000), smaller is slower, default is 20.
+          example: 20
+          min: 1
+    # # would need to set all outputs to multiple: true
+    # - name: Split arguments
+    #   arguments:
+    #     - name: --split
+    #       alternatives: [-s]
+    #       type: boolean_true
+    #       description: |
+    #         Split output by limiting total split file number with this option (2~999), a sequential number prefix will be added to output name ( 0001.out.fq, 0002.out.fq...), disabled by default.
+    #     - name: --split_by_lines
+    #       alternatives: [-S]
+    #       type: long
+    #       description: |
+    #         Split output by limiting lines of each file with this option(>=1000), a sequential number prefix will be added to output name ( 0001.out.fq, 0002.out.fq...), disabled by default.
+    #     - name: --split_prefix_digits
+    #       type: integer
+    #       description: |
+    #         The digits for the sequential number padding (1~10), default is 4, so the filename will be padded as 0001.xxx, 0 to disable padding.
+    #       example: 4
+  resources:
+    - type: bash_script
+      path: script.sh
+  test_resources:
+    - type: bash_script
+      path: test.sh
+    - type: file
+      path: test_data
+platforms:
+  - type: docker
+    image: quay.io/biocontainers/fastp:0.23.4--hadf994f_2
+    setup:
+      - type: docker
+        run: |
+          fastp --version 2>&1 | sed 's# #: "#;s#$#"#' > /var/software_versions.txt
+  - type: nextflow
diff --git a/src/fastp/help.txt b/src/fastp/help.txt
new file mode 100644
index 00000000..d34917b2
--- /dev/null
+++ b/src/fastp/help.txt
@@ -0,0 +1,93 @@
+```bash
+fastp --help
+```
+
+usage: fastp [options] ... 
+options:
+  -i, --in1                            read1 input file name (string [=])
+  -o, --out1                           read1 output file name (string [=])
+  -I, --in2                            read2 input file name (string [=])
+  -O, --out2                           read2 output file name (string [=])
+      --unpaired1                      for PE input, if read1 passed QC but read2 not, it will be written to unpaired1. Default is to discard it. (string [=])
+      --unpaired2                      for PE input, if read2 passed QC but read1 not, it will be written to unpaired2. If --unpaired2 is same as --unpaired1 (default mode), both unpaired reads will be written to this same file. (string [=])
+      --overlapped_out                 for each read pair, output the overlapped region if it has no any mismatched base. (string [=])
+      --failed_out                     specify the file to store reads that cannot pass the filters. (string [=])
+  -m, --merge                          for paired-end input, merge each pair of reads into a single read if they are overlapped. The merged reads will be written to the file given by --merged_out, the unmerged reads will be written to the files specified by --out1 and --out2. The merging mode is disabled by default.
+      --merged_out                     in the merging mode, specify the file name to store merged output, or specify --stdout to stream the merged output (string [=])
+      --include_unmerged               in the merging mode, write the unmerged or unpaired reads to the file specified by --merge. Disabled by default.
+  -6, --phred64                        indicate the input is using phred64 scoring (it'll be converted to phred33, so the output will still be phred33)
+  -z, --compression                    compression level for gzip output (1 ~ 9). 1 is fastest, 9 is smallest, default is 4. (int [=4])
+      --stdin                          input from STDIN. If the STDIN is interleaved paired-end FASTQ, please also add --interleaved_in.
+      --stdout                         stream passing-filters reads to STDOUT. This option will result in interleaved FASTQ output for paired-end output. Disabled by default.
+      --interleaved_in                 indicate that <in1> is an interleaved FASTQ which contains both read1 and read2. Disabled by default.
+      --reads_to_process               specify how many reads/pairs to be processed. Default 0 means process all reads. (int [=0])
+      --dont_overwrite                 don't overwrite existing files. Overwritting is allowed by default.
+      --fix_mgi_id                     the MGI FASTQ ID format is not compatible with many BAM operation tools, enable this option to fix it.
+  -V, --verbose                        output verbose log information (i.e. when every 1M reads are processed).
+  -A, --disable_adapter_trimming       adapter trimming is enabled by default. If this option is specified, adapter trimming is disabled
+  -a, --adapter_sequence               the adapter for read1. For SE data, if not specified, the adapter will be auto-detected. For PE data, this is used if R1/R2 are found not overlapped. (string [=auto])
+      --adapter_sequence_r2            the adapter for read2 (PE data only). This is used if R1/R2 are found not overlapped. If not specified, it will be the same as <adapter_sequence> (string [=auto])
+      --adapter_fasta                  specify a FASTA file to trim both read1 and read2 (if PE) by all the sequences in this FASTA file (string [=])
+      --detect_adapter_for_pe          by default, the auto-detection for adapter is for SE data input only, turn on this option to enable it for PE data.
+  -f, --trim_front1                    trimming how many bases in front for read1, default is 0 (int [=0])
+  -t, --trim_tail1                     trimming how many bases in tail for read1, default is 0 (int [=0])
+  -b, --max_len1                       if read1 is longer than max_len1, then trim read1 at its tail to make it as long as max_len1. Default 0 means no limitation (int [=0])
+  -F, --trim_front2                    trimming how many bases in front for read2. If it's not specified, it will follow read1's settings (int [=0])
+  -T, --trim_tail2                     trimming how many bases in tail for read2. If it's not specified, it will follow read1's settings (int [=0])
+  -B, --max_len2                       if read2 is longer than max_len2, then trim read2 at its tail to make it as long as max_len2. Default 0 means no limitation. If it's not specified, it will follow read1's settings (int [=0])
+  -D, --dedup                          enable deduplication to drop the duplicated reads/pairs
+      --dup_calc_accuracy              accuracy level to calculate duplication (1~6), higher level uses more memory (1G, 2G, 4G, 8G, 16G, 24G). Default 1 for no-dedup mode, and 3 for dedup mode. (int [=0])
+      --dont_eval_duplication          don't evaluate duplication rate to save time and use less memory.
+  -g, --trim_poly_g                    force polyG tail trimming, by default trimming is automatically enabled for Illumina NextSeq/NovaSeq data
+      --poly_g_min_len                 the minimum length to detect polyG in the read tail. 10 by default. (int [=10])
+  -G, --disable_trim_poly_g            disable polyG tail trimming, by default trimming is automatically enabled for Illumina NextSeq/NovaSeq data
+  -x, --trim_poly_x                    enable polyX trimming in 3' ends.
+      --poly_x_min_len                 the minimum length to detect polyX in the read tail. 10 by default. (int [=10])
+  -5, --cut_front                      move a sliding window from front (5') to tail, drop the bases in the window if its mean quality < threshold, stop otherwise.
+  -3, --cut_tail                       move a sliding window from tail (3') to front, drop the bases in the window if its mean quality < threshold, stop otherwise.
+  -r, --cut_right                      move a sliding window from front to tail, if meet one window with mean quality < threshold, drop the bases in the window and the right part, and then stop.
+  -W, --cut_window_size                the window size option shared by cut_front, cut_tail or cut_sliding. Range: 1~1000, default: 4 (int [=4])
+  -M, --cut_mean_quality               the mean quality requirement option shared by cut_front, cut_tail or cut_sliding. Range: 1~36 default: 20 (Q20) (int [=20])
+      --cut_front_window_size          the window size option of cut_front, default to cut_window_size if not specified (int [=4])
+      --cut_front_mean_quality         the mean quality requirement option for cut_front, default to cut_mean_quality if not specified (int [=20])
+      --cut_tail_window_size           the window size option of cut_tail, default to cut_window_size if not specified (int [=4])
+      --cut_tail_mean_quality          the mean quality requirement option for cut_tail, default to cut_mean_quality if not specified (int [=20])
+      --cut_right_window_size          the window size option of cut_right, default to cut_window_size if not specified (int [=4])
+      --cut_right_mean_quality         the mean quality requirement option for cut_right, default to cut_mean_quality if not specified (int [=20])
+  -Q, --disable_quality_filtering      quality filtering is enabled by default. If this option is specified, quality filtering is disabled
+  -q, --qualified_quality_phred        the quality value that a base is qualified. Default 15 means phred quality >=Q15 is qualified. (int [=15])
+  -u, --unqualified_percent_limit      how many percents of bases are allowed to be unqualified (0~100). Default 40 means 40% (int [=40])
+  -n, --n_base_limit                   if one read's number of N base is >n_base_limit, then this read/pair is discarded. Default is 5 (int [=5])
+  -e, --average_qual                   if one read's average quality score <avg_qual, then this read/pair is discarded. Default 0 means no requirement (int [=0])
+  -L, --disable_length_filtering       length filtering is enabled by default. If this option is specified, length filtering is disabled
+  -l, --length_required                reads shorter than length_required will be discarded, default is 15. (int [=15])
+      --length_limit                   reads longer than length_limit will be discarded, default 0 means no limitation. (int [=0])
+  -y, --low_complexity_filter          enable low complexity filter. The complexity is defined as the percentage of base that is different from its next base (base[i] != base[i+1]).
+  -Y, --complexity_threshold           the threshold for low complexity filter (0~100). Default is 30, which means 30% complexity is required. (int [=30])
+      --filter_by_index1               specify a file contains a list of barcodes of index1 to be filtered out, one barcode per line (string [=])
+      --filter_by_index2               specify a file contains a list of barcodes of index2 to be filtered out, one barcode per line (string [=])
+      --filter_by_index_threshold      the allowed difference of index barcode for index filtering, default 0 means completely identical. (int [=0])
+  -c, --correction                     enable base correction in overlapped regions (only for PE data), default is disabled
+      --overlap_len_require            the minimum length to detect overlapped region of PE reads. This will affect overlap analysis based PE merge, adapter trimming and correction. 30 by default. (int [=30])
+      --overlap_diff_limit             the maximum number of mismatched bases to detect overlapped region of PE reads. This will affect overlap analysis based PE merge, adapter trimming and correction. 5 by default. (int [=5])
+      --overlap_diff_percent_limit     the maximum percentage of mismatched bases to detect overlapped region of PE reads. This will affect overlap analysis based PE merge, adapter trimming and correction. Default 20 means 20%. (int [=20])
+  -U, --umi                            enable unique molecular identifier (UMI) preprocessing
+      --umi_loc                        specify the location of UMI, can be (index1/index2/read1/read2/per_index/per_read, default is none (string [=])
+      --umi_len                        if the UMI is in read1/read2, its length should be provided (int [=0])
+      --umi_prefix                     if specified, an underline will be used to connect prefix and UMI (i.e. prefix=UMI, UMI=AATTCG, final=UMI_AATTCG). No prefix by default (string [=])
+      --umi_skip                       if the UMI is in read1/read2, fastp can skip several bases following UMI, default is 0 (int [=0])
+      --umi_delim                      delimiter to use between the read name and the UMI, default is : (string [=:])
+  -p, --overrepresentation_analysis    enable overrepresented sequence analysis.
+  -P, --overrepresentation_sampling    one in (--overrepresentation_sampling) reads will be computed for overrepresentation analysis (1~10000), smaller is slower, default is 20. (int [=20])
+  -j, --json                           the json format report file name (string [=fastp.json])
+  -h, --html                           the html format report file name (string [=fastp.html])
+  -R, --report_title                   should be quoted with ' or ", default is "fastp report" (string [=fastp report])
+  -w, --thread                         worker thread number, default is 3 (int [=3])
+  -s, --split                          split output by limiting total split file number with this option (2~999), a sequential number prefix will be added to output name ( 0001.out.fq, 0002.out.fq...), disabled by default (int [=0])
+  -S, --split_by_lines                 split output by limiting lines of each file with this option(>=1000), a sequential number prefix will be added to output name ( 0001.out.fq, 0002.out.fq...), disabled by default (long [=0])
+  -d, --split_prefix_digits            the digits for the sequential number padding (1~10), default is 4, so the filename will be padded as 0001.xxx, 0 to disable padding (int [=4])
+      --cut_by_quality5                DEPRECATED, use --cut_front instead.
+      --cut_by_quality3                DEPRECATED, use --cut_tail instead.
+      --cut_by_quality_aggressive      DEPRECATED, use --cut_right instead.
+      --discard_unmerged               DEPRECATED, no effect now, see the introduction for merging.
+  -?, --help                           print this message
diff --git a/src/fastp/script.sh b/src/fastp/script.sh
new file mode 100644
index 00000000..4bb37c87
--- /dev/null
+++ b/src/fastp/script.sh
@@ -0,0 +1,105 @@
+#!/bin/bash
+
+## VIASH START
+## VIASH END
+
+# disable flags
+[[ "$par_disable_adapter_trimming" == "false" ]] && unset par_disable_adapter_trimming
+[[ "$par_detect_adapter_for_pe" == "false" ]] && unset par_detect_adapter_for_pe
+[[ "$par_merge" == "false" ]] && unset par_merge
+[[ "$par_include_unmerged" == "false" ]] && unset par_include_unmerged
+[[ "$par_interleaved_in" == "false" ]] && unset par_interleaved_in
+[[ "$par_fix_mgi_id" == "false" ]] && unset par_fix_mgi_id
+[[ "$par_phred64" == "false" ]] && unset par_phred64
+[[ "$par_dont_overwrite" == "false" ]] && unset par_dont_overwrite
+[[ "$par_verbose" == "false" ]] && unset par_verbose
+[[ "$par_dedup" == "false" ]] && unset par_dedup
+[[ "$par_dont_eval_duplication" == "false" ]] && unset par_dont_eval_duplication
+[[ "$par_trim_poly_g" == "false" ]] && unset par_trim_poly_g
+[[ "$par_disable_trim_poly_g" == "false" ]] && unset par_disable_trim_poly_g
+[[ "$par_trim_poly_x" == "false" ]] && unset par_trim_poly_x
+[[ "$par_disable_quality_filtering" == "false" ]] && unset par_disable_quality_filtering
+[[ "$par_disable_length_filtering" == "false" ]] && unset par_disable_length_filtering
+[[ "$par_low_complexity_filter" == "false" ]] && unset par_low_complexity_filter
+[[ "$par_umi" == "false" ]] && unset par_umi
+[[ "$par_overrepresentation_analysis" == "false" ]] && unset par_overrepresentation_analysis
+
+# run command
+fastp \
+  -i "$par_in1" \
+  -o "$par_out1" \
+  ${par_in2:+--in2 "${par_in2}"} \
+  ${par_out2:+--out2 "${par_out2}"} \
+  ${par_unpaired1:+--unpaired1 "${par_unpaired1}"} \
+  ${par_unpaired2:+--unpaired2 "${par_unpaired2}"} \
+  ${par_failed_out:+--failed_out "${par_failed_out}"} \
+  ${par_overlapped_out:+--overlapped_out "${par_overlapped_out}"} \
+  ${par_json:+--json "${par_json}"} \
+  ${par_html:+--html "${par_html}"} \
+  ${par_report_title:+--report_title "${par_report_title}"} \
+  ${par_disable_adapter_trimming:+--disable_adapter_trimming} \
+  ${par_detect_adapter_for_pe:+--detect_adapter_for_pe} \
+  ${par_adapter_sequence:+--adapter_sequence "${par_adapter_sequence}"} \
+  ${par_adapter_sequence_r2:+--adapter_sequence_r2 "${par_adapter_sequence_r2}"} \
+  ${par_adapter_fasta:+--adapter_fasta "${par_adapter_fasta}"} \
+  ${par_trim_front1:+--trim_front1 "${par_trim_front1}"} \
+  ${par_trim_tail1:+--trim_tail1 "${par_trim_tail1}"} \
+  ${par_max_len1:+--max_len1 "${par_max_len1}"} \
+  ${par_trim_front2:+--trim_front2 "${par_trim_front2}"} \
+  ${par_trim_tail2:+--trim_tail2 "${par_trim_tail2}"} \
+  ${par_max_len2:+--max_len2 "${par_max_len2}"} \
+  ${par_merge:+--merge} \
+  ${par_merged_out:+--merged_out "${par_merged_out}"} \
+  ${par_include_unmerged:+--include_unmerged} \
+  ${par_interleaved_in:+--interleaved_in} \
+  ${par_fix_mgi_id:+--fix_mgi_id} \
+  ${par_phred64:+--phred64} \
+  ${par_compression:+--compression "${par_compression}"} \
+  ${par_dont_overwrite:+--dont_overwrite} \
+  ${par_verbose:+--verbose} \
+  ${par_reads_to_process:+--reads_to_process "${par_reads_to_process}"} \
+  ${par_dedup:+--dedup} \
+  ${par_dup_calc_accuracy:+--dup_calc_accuracy "${par_dup_calc_accuracy}"} \
+  ${par_dont_eval_duplication:+--dont_eval_duplication} \
+  ${par_trim_poly_g:+--trim_poly_g} \
+  ${par_poly_g_min_len:+--poly_g_min_len "${par_poly_g_min_len}"} \
+  ${par_disable_trim_poly_g:+--disable_trim_poly_g} \
+  ${par_trim_poly_x:+--trim_poly_x} \
+  ${par_poly_x_min_len:+--poly_x_min_len "${par_poly_x_min_len}"} \
+  ${par_cut_front:+--cut_front "${par_cut_front}"} \
+  ${par_cut_tail:+--cut_tail "${par_cut_tail}"} \
+  ${par_cut_right:+--cut_right "${par_cut_right}"} \
+  ${par_cut_window_size:+--cut_window_size "${par_cut_window_size}"} \
+  ${par_cut_mean_quality:+--cut_mean_quality "${par_cut_mean_quality}"} \
+  ${par_cut_front_window_size:+--cut_front_window_size "${par_cut_front_window_size}"} \
+  ${par_cut_front_mean_quality:+--cut_front_mean_quality "${par_cut_front_mean_quality}"} \
+  ${par_cut_tail_window_size:+--cut_tail_window_size "${par_cut_tail_window_size}"} \
+  ${par_cut_tail_mean_quality:+--cut_tail_mean_quality "${par_cut_tail_mean_quality}"} \
+  ${par_cut_right_window_size:+--cut_right_window_size "${par_cut_right_window_size}"} \
+  ${par_cut_right_mean_quality:+--cut_right_mean_quality "${par_cut_right_mean_quality}"} \
+  ${par_disable_quality_filtering:+--disable_quality_filtering} \
+  ${par_qualified_quality_phred:+--qualified_quality_phred "${par_qualified_quality_phred}"} \
+  ${par_unqualified_percent_limit:+--unqualified_percent_limit "${par_unqualified_percent_limit}"} \
+  ${par_n_base_limit:+--n_base_limit "${par_n_base_limit}"} \
+  ${par_average_qual:+--average_qual "${par_average_qual}"} \
+  ${par_disable_length_filtering:+--disable_length_filtering} \
+  ${par_length_required:+--length_required "${par_length_required}"} \
+  ${par_length_limit:+--length_limit "${par_length_limit}"} \
+  ${par_low_complexity_filter:+--low_complexity_filter} \
+  ${par_complexity_threshold:+--complexity_threshold "${par_complexity_threshold}"} \
+  ${par_filter_by_index1:+--filter_by_index1 "${par_filter_by_index1}"} \
+  ${par_filter_by_index2:+--filter_by_index2 "${par_filter_by_index2}"} \
+  ${par_filter_by_index_threshold:+--filter_by_index_threshold "${par_filter_by_index_threshold}"} \
+  ${par_correction:+--correction} \
+  ${par_overlap_len_require:+--overlap_len_require "${par_overlap_len_require}"} \
+  ${par_overlap_diff_limit:+--overlap_diff_limit "${par_overlap_diff_limit}"} \
+  ${par_overlap_diff_percent_limit:+--overlap_diff_percent_limit "${par_overlap_diff_percent_limit}"} \
+  ${par_umi:+--umi} \
+  ${par_umi_loc:+--umi_loc "${par_umi_loc}"} \
+  ${par_umi_len:+--umi_len "${par_umi_len}"} \
+  ${par_umi_prefix:+--umi_prefix "${par_umi_prefix}"} \
+  ${par_umi_skip:+--umi_skip "${par_umi_skip}"} \
+  ${par_umi_delim:+--umi_delim "${par_umi_delim}"} \
+  ${par_overrepresentation_analysis:+--overrepresentation_analysis} \
+  ${par_overrepresentation_sampling:+--overrepresentation_sampling "${par_overrepresentation_sampling}"} \
+  ${meta_cpus:+--thread "${meta_cpus}"}
diff --git a/src/fastp/test.sh b/src/fastp/test.sh
new file mode 100644
index 00000000..1b1f6f0c
--- /dev/null
+++ b/src/fastp/test.sh
@@ -0,0 +1,74 @@
+#!/bin/bash
+
+set -e
+
+## VIASH START
+meta_executable="target/docker/fastp/fastp"
+meta_resources_dir="src/fastp"
+## VIASH END
+
+#########################################################################################
+mkdir fastp_se
+cd fastp_se
+
+echo "> Run fastp on SE"
+"$meta_executable" \
+  --in1 "$meta_resources_dir/test_data/se/a.fastq" \
+  --out1 "trimmed.fastq" \
+  --failed_out "failed.fastq" \
+  --json "report.json" \
+  --html "report.html" \
+  --adapter_sequence ACGGCTAGCTA
+
+echo ">> Check if output exists"
+[ ! -f "trimmed.fastq" ] && echo ">> trimmed.fastq does not exist" && exit 1
+[ ! -f "failed.fastq" ] && echo ">> failed.fastq does not exist" && exit 1
+[ ! -f "report.json" ] && echo ">> report.json does not exist" && exit 1
+[ ! -f "report.html" ] && echo ">> report.html does not exist" && exit 1
+
+#########################################################################################
+cd ..
+mkdir fastp_pe_minimal
+cd fastp_pe_minimal
+
+echo ">> Run fastp on PE with minimal parameters"
+"$meta_executable" \
+  --in1 "$meta_resources_dir/test_data/pe/a.1.fastq" \
+  --in2 "$meta_resources_dir/test_data/pe/a.2.fastq" \
+  --out1 "trimmed_1.fastq" \
+  --out2 "trimmed_2.fastq"
+
+echo ">> Check if output exists"
+[ ! -f "trimmed_1.fastq" ] && echo ">> trimmed_1.fastq does not exist" && exit 1
+[ ! -f "trimmed_2.fastq" ] && echo ">> trimmed_2.fastq does not exist" && exit 1
+
+#########################################################################################
+cd ..
+mkdir fastp_pe_many
+cd fastp_pe_many
+
+echo ">> Run fastp on PE with many parameters"
+"$meta_executable" \
+  --in1 "$meta_resources_dir/test_data/pe/a.1.fastq" \
+  --in2 "$meta_resources_dir/test_data/pe/a.2.fastq" \
+  --out1 "trimmed_1.fastq" \
+  --out2 "trimmed_2.fastq" \
+  --failed_out "failed.fastq" \
+  --json "report.json" \
+  --html "report.html" \
+  --adapter_sequence ACGGCTAGCTA \
+  --adapter_sequence_r2 AGATCGGAAGAGCACACGTCTGAACTCCAGTCAC \
+  --merge \
+  --merged_out "merged.fastq"
+
+echo ">> Check if output exists"
+[ ! -f "trimmed_1.fastq" ] && echo ">> trimmed_1.fastq does not exist" && exit 1
+[ ! -f "trimmed_2.fastq" ] && echo ">> trimmed_2.fastq does not exist" && exit 1
+[ ! -f "failed.fastq" ] && echo ">> failed.fastq does not exist" && exit 1
+[ ! -f "report.json" ] && echo ">> report.json does not exist" && exit 1
+[ ! -f "report.html" ] && echo ">> report.html does not exist" && exit 1
+[ ! -f "merged.fastq" ] && echo ">> merged.fastq does not exist" && exit 1
+
+#########################################################################################
+
+echo "> Test successful"
\ No newline at end of file
diff --git a/src/fastp/test_data/pe/a.1.fastq b/src/fastp/test_data/pe/a.1.fastq
new file mode 100644
index 00000000..42735560
--- /dev/null
+++ b/src/fastp/test_data/pe/a.1.fastq
@@ -0,0 +1,4 @@
+@1
+ACGGCAT
++
+!!!!!!!
diff --git a/src/fastp/test_data/pe/a.2.fastq b/src/fastp/test_data/pe/a.2.fastq
new file mode 100644
index 00000000..42735560
--- /dev/null
+++ b/src/fastp/test_data/pe/a.2.fastq
@@ -0,0 +1,4 @@
+@1
+ACGGCAT
++
+!!!!!!!
diff --git a/src/fastp/test_data/script.sh b/src/fastp/test_data/script.sh
new file mode 100755
index 00000000..725eef6d
--- /dev/null
+++ b/src/fastp/test_data/script.sh
@@ -0,0 +1,10 @@
+# fastp test data
+
+# Test data was obtained from https://github.com/snakemake/snakemake-wrappers/tree/master/bio/fastp/test
+
+if [ ! -d /tmp/snakemake-wrappers ]; then
+  git clone --depth 1 --single-branch --branch master https://github.com/snakemake/snakemake-wrappers /tmp/snakemake-wrappers
+fi
+
+cp -r /tmp/snakemake-wrappers/bio/fastp/test/reads/* src/fastp/test_data
+
diff --git a/src/fastp/test_data/se/a.fastq b/src/fastp/test_data/se/a.fastq
new file mode 100644
index 00000000..42735560
--- /dev/null
+++ b/src/fastp/test_data/se/a.fastq
@@ -0,0 +1,4 @@
+@1
+ACGGCAT
++
+!!!!!!!