viash-hub · rcannood · Sep 3, 2024 · Jul 18, 2024 · Jul 18, 2024 · Jul 18, 2024
diff --git a/CHANGELOG.md b/CHANGELOG.md
@@ -13,6 +13,8 @@
 
 * `agat/agat_convert_sp_gff2gtf`: convert any GTF/GFF file into a proper GTF file (PR #76).
 
+* `fastqc`: High throughput sequence quality control analysis tool (PR #92).
+
 ## MINOR CHANGES
 
 * `busco` components: update BUSCO to `5.7.1` (PR #72).

diff --git a/src/fastqc/config.vsh.yaml b/src/fastqc/config.vsh.yaml
@@ -0,0 +1,182 @@
+name: fastqc
+namespace: fastqc
+description: FastQC - A high throughput sequence QC analysis tool.
+keywords: [Quality control, BAM, SAM, FASTQ]
+links:
+  repository: https://github.com/s-andrews/FastQC
+license: GPL-3.0, Apache-2.0
+authors:
+  - __merge__: /src/_authors/theodoro_gasperin.yaml
+    roles: [ author, maintainer ]
+
+argument_groups:
+  - name: Inputs
+    arguments:
+      - name: --input
+        type: file
+        direction: input
+        multiple: true
+        description: Sequence file(s) to be analyzed (e.g. FASTQ, BAM or SAM).
+        required: true
+        example: input.fq
+
+  - name: Outputs
+    arguments:
+      - name: --html
+        type: file
+        direction: output
+        multiple: true
+        description: |
+          Create the HTML report of the results. 
+          '*' wild card must be provided in the output file name. 
+          Wild card will be replaced by the input basename.
+        example: example_*.html
+
+      - name: --zip
+        type: file
+        direction: output
+        multiple: true
+        description: |
+          Create the zip file(s) containing: html report, data, images, icons, summary, etc.
+          '*' wild card must be provided in the output file name.
+          Wild card will be replaced by the input basename.
+        example: example_*.zip     
+
+  - name: Options
+    arguments:  
+      - name: --casava
+        type: boolean_true
+        description: | 
+          Files come from raw casava output. Files in the same sample
+          group (differing only by the group number) will be analysed
+          as a set rather than individually. Sequences with the filter
+          flag set in the header will be excluded from the analysis.
+          Files must have the same names given to them by casava
+          (including being gzipped and ending with .gz) otherwise they
+          won't be grouped together correctly.
+
+      - name: --nano
+        type: boolean_true
+        description: |
+          Files come from nanopore sequences and are in fast5 format. In
+          this mode you can pass in directories to process and the program
+          will take in all fast5 files within those directories and produce
+          a single output file from the sequences found in all files.
+
+      - name: --nofilter
+        type: boolean_true
+        description: |
+          If running with --casava then don't remove reads flagged by
+          casava as poor quality when performing the QC analysis.
+
+      - name: --extract
+        type: boolean_true
+        description: |
+          If set then the zipped output file will be uncompressed in
+          the same directory after it has been created.  By default
+          this option will be set if fastqc is run in non-interactive
+          mode.
+
+      - name: --noextract
+        type: boolean_true
+        description: |
+          Do not uncompress the output file after creating it. 
+          You should set this option if you do not wish to uncompress 
+          the output when running in non-interactive mode.
+
+      - name: --nogroup
+        type: boolean_true
+        description: |
+          Disable grouping of bases for reads >50bp. 
+          All reports will show data for every base in the read. 
+          WARNING: Using this option will cause fastqc to crash 
+          and burn if you use it on really long reads, and your 
+          plots may end up a ridiculous size. You have been warned!
+
+      - name: --min_length
+        type: integer
+        description: |
+          Sets an artificial lower limit on the length of the 
+          sequence to be shown in the report. As long as you 
+          set this to a value greater than or equal to your longest
+          read length then this will be the sequence length used 
+          to create your read groups. This can be useful for making
+          directly comparable statistics from datasets with somewhat 
+          variable read lengths.
+        example: 0
+
+      - name: --format
+        alternatives: -f
+        type: string
+        description: |
+          Bypasses the normal sequence file format detection and 
+          forces the program to use the specified format. 
+          Valid formats are bam, sam, bam_mapped, sam_mapped, and fastq.
+        example: bam
+
+      - name: --contaminants
+        alternatives: -c
+        type: file
+        description: |
+          Specifies a non-default file which contains the list 
+          of contaminants to screen overrepresented sequences against. 
+          The file must contain sets of named contaminants in the form
+          name[tab]sequence. Lines prefixed with a hash will be ignored.
+        example: contaminants.txt
+
+      - name: --adapters
+        alternatives: -a
+        type: file
+        description: |
+          Specifies a non-default file which contains the list of 
+          adapter sequences which will be explicitly searched against 
+          the library. The file must contain sets of named adapters 
+          in the form name[tab]sequence. Lines prefixed with a hash will be ignored.
+        example: adapters.txt
+
+      - name: --limits
+        alternatives: -l
+        type: file
+        description: |
+          Specifies a non-default file which contains 
+          a set of criteria which will be used to determine 
+          the warn/error limits for the various modules. 
+          This file can also be used to selectively remove 
+          some modules from the output altogether. The format 
+          needs to mirror the default limits.txt file found in 
+          the Configuration folder.
+        example: limits.txt
+
+      - name: --kmers
+        alternatives: -k
+        type: integer
+        description: |
+          Specifies the length of Kmer to look for in the Kmer 
+          content module. Specified Kmer length must be between 
+          2 and 10. Default length is 7 if not specified.
+        example: 7
+
+      - name: --quiet
+        alternatives: -q
+        type: boolean_true
+        description: |
+          Suppress all progress messages on stdout and only report errors.
+
+resources:
+  - type: bash_script
+    path: script.sh
+test_resources:
+  - type: bash_script
+    path: test.sh
+
+engines:
+  - type: docker
+    image: biocontainers/fastqc:v0.11.9_cv8
+    setup:
+      - type: docker
+        run: |
+          fastqc --version > /var/software_versions.txt
+
+runners:
+  - type: executable
+  - type: nextflow
diff --git a/src/fastqc/help.txt b/src/fastqc/help.txt
@@ -0,0 +1,125 @@
+```bash
+fastqc --help
+```
+
+            FastQC - A high throughput sequence QC analysis tool
+
+SYNOPSIS
+
+	fastqc seqfile1 seqfile2 .. seqfileN
+
+    fastqc [-o output dir] [--(no)extract] [-f fastq|bam|sam] 
+           [-c contaminant file] seqfile1 .. seqfileN
+
+DESCRIPTION
+
+    FastQC reads a set of sequence files and produces from each one a quality
+    control report consisting of a number of different modules, each one of 
+    which will help to identify a different potential type of problem in your
+    data.
+
+    If no files to process are specified on the command line then the program
+    will start as an interactive graphical application.  If files are provided
+    on the command line then the program will run with no user interaction
+    required.  In this mode it is suitable for inclusion into a standardised
+    analysis pipeline.
+
+    The options for the program are as follows:
+
+    -h --help       Print this help file and exit
+
+    -v --version    Print the version of the program and exit
+
+    -o --outdir     Create all output files in the specified output directory.
+                    Please note that this directory must exist as the program
+                    will not create it.  If this option is not set then the 
+                    output file for each sequence file is created in the same
+                    directory as the sequence file which was processed.
+
+    --casava        Files come from raw casava output. Files in the same sample
+                    group (differing only by the group number) will be analysed
+                    as a set rather than individually. Sequences with the filter
+                    flag set in the header will be excluded from the analysis.
+                    Files must have the same names given to them by casava
+                    (including being gzipped and ending with .gz) otherwise they
+                    won't be grouped together correctly.
+
+    --nano          Files come from nanopore sequences and are in fast5 format. In
+                    this mode you can pass in directories to process and the program
+                    will take in all fast5 files within those directories and produce
+                    a single output file from the sequences found in all files.                    
+
+    --nofilter      If running with --casava then don't remove read flagged by
+                    casava as poor quality when performing the QC analysis.
+
+    --extract       If set then the zipped output file will be uncompressed in
+                    the same directory after it has been created.  By default
+                    this option will be set if fastqc is run in non-interactive
+                    mode.
+
+    -j --java       Provides the full path to the java binary you want to use to
+                    launch fastqc. If not supplied then java is assumed to be in
+                    your path.
+
+    --noextract     Do not uncompress the output file after creating it.  You
+                    should set this option if you do not wish to uncompress
+                    the output when running in non-interactive mode.
+
+    --nogroup       Disable grouping of bases for reads >50bp. All reports will
+                    show data for every base in the read.  WARNING: Using this
+                    option will cause fastqc to crash and burn if you use it on
+                    really long reads, and your plots may end up a ridiculous size.
+                    You have been warned!
+
+    --min_length    Sets an artificial lower limit on the length of the sequence
+                    to be shown in the report.  As long as you set this to a value
+                    greater or equal to your longest read length then this will be
+                    the sequence length used to create your read groups.  This can
+                    be useful for making directly comparable statistics from
+                    datasets with somewhat variable read lengths.
+
+    -f --format     Bypasses the normal sequence file format detection and
+                    forces the program to use the specified format.  Valid
+                    formats are bam,sam,bam_mapped,sam_mapped and fastq
+
+    -t --threads    Specifies the number of files which can be processed
+                    simultaneously.  Each thread will be allocated 250MB of
+                    memory so you shouldn't run more threads than your
+                    available memory will cope with, and not more than
+                    6 threads on a 32 bit machine
+
+    -c              Specifies a non-default file which contains the list of
+    --contaminants  contaminants to screen overrepresented sequences against.
+                    The file must contain sets of named contaminants in the
+                    form name[tab]sequence.  Lines prefixed with a hash will
+                    be ignored.
+
+    -a              Specifies a non-default file which contains the list of
+    --adapters      adapter sequences which will be explicitly searched against
+                    the library. The file must contain sets of named adapters
+                    in the form name[tab]sequence.  Lines prefixed with a hash
+                    will be ignored.
+
+    -l              Specifies a non-default file which contains a set of criteria
+    --limits        which will be used to determine the warn/error limits for the
+                    various modules.  This file can also be used to selectively 
+                    remove some modules from the output altogether.  The format
+                    needs to mirror the default limits.txt file found in the
+                    Configuration folder.
+
+   -k --kmers       Specifies the length of Kmer to look for in the Kmer content
+                    module. Specified Kmer length must be between 2 and 10. Default
+                    length is 7 if not specified.
+
+   -q --quiet       Suppress all progress messages on stdout and only report errors.
+
+   -d --dir         Selects a directory to be used for temporary files written when
+                    generating report images. Defaults to system temp directory if
+                    not specified.
+
+BUGS
+
+    Any bugs in fastqc should be reported either to simon.andrews@babraham.ac.uk
+    or in www.bioinformatics.babraham.ac.uk/bugzilla/
+
+