From b1eaaa997333f7d7398b6a4cb6b1228f3cf3aeed Mon Sep 17 00:00:00 2001
From: DriesSchaumont <5946712+DriesSchaumont@users.noreply.github.com>
Date: Fri, 8 Nov 2024 09:00:17 +0000
Subject: [PATCH] Add fasterq dump

---
 .../sra_tools_fasterq_dump/config.vsh.yaml    | 252 ++++++++++++++++++
 .../sra_tools_fasterq_dump/helpt.txt          |  70 +++++
 .../sra_tools_fasterq_dump/script.sh          |  68 +++++
 3 files changed, 390 insertions(+)
 create mode 100644 src/sra_tools/sra_tools_fasterq_dump/config.vsh.yaml
 create mode 100644 src/sra_tools/sra_tools_fasterq_dump/helpt.txt
 create mode 100644 src/sra_tools/sra_tools_fasterq_dump/script.sh

diff --git a/src/sra_tools/sra_tools_fasterq_dump/config.vsh.yaml b/src/sra_tools/sra_tools_fasterq_dump/config.vsh.yaml
new file mode 100644
index 00000000..255bef01
--- /dev/null
+++ b/src/sra_tools/sra_tools_fasterq_dump/config.vsh.yaml
@@ -0,0 +1,252 @@
+name: sra_tools_fasterq_dump
+namespace: sra_tools
+description:
+  The fasterq-dump tool extracts data in FASTQ- or FASTA-format from SRA-accessions.
+argument_groups:
+    - name: Inputs
+      arguments:
+        - name: --accession
+          description: |
+            SRA accession to download. Running SRA tools' 'prefetch' command first
+            is not required. Mutually exclusive with --prefetch_directory.
+          type: string
+          required: false
+        - name: --prefetch_directory
+          type: file
+          required: false
+          direction: input
+          description: |
+            Directory generated by SRA tool 'prefetch'. Mutually exclusive with --accession.
+    - name: "Output"
+
+    - name: "Options"
+      arguments:
+        - name: --details
+          alternatives: ["-x"]
+          type: boolean_true
+          description: |
+            Print details
+        - name: --progress
+          alternatives: [-p]
+          type: boolean_true
+          description: |
+            Show progress
+        - name: --split_spot
+          alternatives: ["-s"]
+          type: boolean_true
+          description: |
+            Split spots into reads
+        - name: --split_files
+          alternatives: ["-S"]
+          type: boolean_true
+          description: |
+            Write reads into different files
+        - name: --split_3
+          alternatives: ["-3"]
+          type: boolean_true
+          description: |
+            Writes single reads in special file.
+        - name: "--concatenate_reads"
+          type: boolean_true
+          description: |
+            Writes whole spots into one file
+        - name: --skip_technical
+          type: boolean_true
+          description: |
+            Skip technical reads.
+        - name: --include_technical
+          type: boolean_true
+          description: |
+            Include technical reads.
+        - name: --minimal_read_length
+          type: integer
+          min: 1
+          description: |
+            Filter by sequence length.
+          required: false
+        - name: --bases
+          alternatives: [-B]
+          type: string
+          description: |
+            Filter by bases
+          required: false
+        - name: --table
+          type: string
+          description: |
+            Name of consensus-table to use for pacbio reads.
+          required: false
+        - name: --fasta
+          type: boolean_true
+          description: |
+            Produce fasta output
+        - name: --fasta_unsorted
+          type: boolean_true
+          description: |
+            Produce unsorted FASTA output
+        - name: --fasta_reference_table 
+          type: boolean_true
+          description: |
+            Produce FASTA output from REFERENCE table. 
+        - name: --fasta_concat_all
+          type: boolean_true
+          description: |
+            Concatenate all rows and produce FASTA output
+        - name: --internal_ref
+          type: boolean_true
+          description: |
+            Extracts only internal references into the output file.
+            Internal references are non-standard scaffoldings the submitter
+            included in the submission and the bases of them are stored in the accession.
+        - name: --external_ref
+          type: boolean_true
+          description: |
+            Extracts only external references into the output file.
+            External references are canonical RefSeq accessions that are used in the accession.
+        - name: --ref_name
+          type: string
+          multiple: true
+          required: false
+          description: |
+            This command extracts only the named reference. If a reference is not found, the option is ignored.
+            If none of the requested references is found, an empty file is produced. Each reference used by an
+            accession can be named in 2 ways: the canonical name like 'NC_001133.9' or the user-supplied name
+            like 'I' or 'chr1'. The user can use the '--ref_report' option to inspect the names used.
+        - name: --seq_defline
+          type: string
+          required: false
+          example: "@$ac.$si $sn length=$rl"
+          description: |
+            Supply a defline for the sequence sections of FASTQ or FASTA.
+            The format is a text that may contain these variables:
+              $ac ... the accession
+              $sn ... the spot-name
+              $sg ... the spot-group
+              $si ... the spot-id ( the number of the spot )
+              $ri ... the read-id ( the number of a read within a spot )
+              $rl ... the read-length
+            The accession, spot-id, read-id, and read-length are always available - but the spot-group and/or spot-name
+            might be missing or empty. If a variable is missing or empty it does not produce an error - it will be
+            omitted from the defline.
+
+            Defaults for FASTQ:
+              if not splitting: @$ac.$si $sn length=$rl
+              if splitting: @$ac.$si/$ri $sn length=$rl
+            Defaults for FASTA:
+              if not splitting >$ac.$si $sn length=$rl
+              if splitting: >$ac.$si/$ri $sn length=$rl
+
+        - name: --qual_defline
+          type: string
+          required: false
+          example: "+$ac.$si $sn length=$rl"
+          description: |
+            Supply a defline for the quality sections of FASTQ.
+            The format is a text that may contain these variables:
+              $ac ... the accession
+              $sn ... the spot-name
+              $sg ... the spot-group
+              $si ... the spot-id ( the number of the spot )
+              $ri ... the read-id ( the number of a read within a spot )
+              $rl ... the read-length
+            The accession, spot-id, read-id, and read-length are always available - but the spot-group and/or spot-name
+            might be missing or empty. If a variable is missing or empty it does not produce an error - it will be
+            omitted from the defline.
+
+            Defaults for FASTQ:
+              if not splitting: +$ac.$si $sn length=$rl
+              if splitting: +$ac.$si/$ri $sn length=$rl
+        - name: --only_unaligned
+          alternatives: [-U]
+          type: boolean_true
+          description: |
+            Process only unaligned reads.
+        - name: "--only_aligned"
+          alternatives: [-a]
+          type: boolean_true
+          description: |
+            Process only aligned reads.
+        - name: "--log_level"
+          alternatives: ["-L"]
+          type: string
+          choices: [fatal, sys, int, err, warn, info, debug]
+          example: warn
+          required: false
+
+        
+
+        
+
+          
+
+        
+        
+          
+
+
+
+
+          
+          
+
+        
+
+
+# Unused arguments:
+  # -F|--format                      format (special, fastq, default=fastq) 
+  # -o|--outfile                     output-file
+  # -O|--outdir                      output-dir
+  # -b|--bufsize                     size of file-buffer dflt=1MB
+  # -c|--curcache                    size of cursor-cache dflt=10MB
+  # -m|--mem                         memory limit for sorting dflt=100MB
+  # -t|--temp                        where to put temp. files dflt=curr dir
+  # -e|--threads                     how many thread dflt=6
+  # -p|--progress                    show progress
+  # -x|--details                     print details
+  # -s|--split-spot                  split spots into reads
+  # -S|--split-files                 write reads into different files
+  # -3|--split-3                     writes single reads in special file
+  # --concatenate-reads              writes whole spots into one file
+  # -Z|--stdout                      print output to stdout
+  # -f|--force                       force to overwrite existing file(s)
+  # --skip-technical                 skip technical reads
+  # --include-technical              include technical reads
+  # -M|--min-read-len                filter by sequence-len
+  # --table                          which seq-table to use in case of pacbio
+  # -B|--bases                       filter by bases
+  # -A|--append                      append to output-file
+  # --fasta                          produce FASTA output
+  # --fasta-unsorted                 produce FASTA output, unsorted
+  # --fasta-ref-tbl                  produce FASTA output from REFERENCE tbl
+  # --fasta-concat-all               concatenate all rows and produce FASTA
+  # --internal-ref                   extract only internal REFERENCEs
+  # --external-ref                   extract only external REFERENCEs
+  # --ref-name                       extract only these REFERENCEs
+  # --ref-report                     enumerate references
+  # --use-name                       print name instead of seq-id
+  # --seq-defline                    custom defline for sequence:  $ac=accession,
+  #                                  $sn=spot-name,  $sg=spot-group, $si=spot-id,
+  #                                  $ri=read-id, $rl=read-length
+  # --qual-defline                   custom defline for qualities:  same as
+  #                                  seq-defline
+  # -U|--only-unaligned              process only unaligned reads
+  # -a|--only-aligned                process only aligned reads
+  # --disk-limit                     explicitly set disk-limit
+  # --disk-limit-tmp                 explicitly set disk-limit for temp. files
+  # --size-check                     switch to control: on=perform size-check
+  #                                  (default),  off=do not perform size-check,
+  #                                  only=perform size-check only
+  # --ngc <PATH>                     PATH to ngc file
+
+  # -h|--help                        Output brief explanation for the program.
+  # -V|--version                     Display the version of the program then
+  #                                  quit.
+  # -L|--log-level <level>           Logging level as number or enum string. One
+  #                                  of (fatal|sys|int|err|warn|info|debug) or
+  #                                  (0-6) Current/default is warn.
+  # -v|--verbose                     Increase the verbosity of the program
+  #                                  status messages. Use multiple times for more
+  #                                  verbosity. Negates quiet.
+  # -q|--quiet                       Turn off all status messages for the
+  #                                  program. Negated by verbose.
+  # --option-file <file>             Read more options and parameters from the
+  #                                  file.
\ No newline at end of file
diff --git a/src/sra_tools/sra_tools_fasterq_dump/helpt.txt b/src/sra_tools/sra_tools_fasterq_dump/helpt.txt
new file mode 100644
index 00000000..a5eadbfa
--- /dev/null
+++ b/src/sra_tools/sra_tools_fasterq_dump/helpt.txt
@@ -0,0 +1,70 @@
+```
+docker run -t --rm ncbi/sra-tools fasterq-dump --help
+```
+
+Usage:
+  fasterq-dump <path> [options]
+  fasterq-dump <accession> [options]
+
+Options:
+  -F|--format                      format (special, fastq, default=fastq) 
+  -o|--outfile                     output-file 
+  -O|--outdir                      output-dir 
+  -b|--bufsize                     size of file-buffer dflt=1MB 
+  -c|--curcache                    size of cursor-cache dflt=10MB 
+  -m|--mem                         memory limit for sorting dflt=100MB 
+  -t|--temp                        where to put temp. files dflt=curr dir 
+  -e|--threads                     how many thread dflt=6 
+  -p|--progress                    show progress 
+  -x|--details                     print details 
+  -s|--split-spot                  split spots into reads 
+  -S|--split-files                 write reads into different files 
+  -3|--split-3                     writes single reads in special file 
+  --concatenate-reads              writes whole spots into one file 
+  -Z|--stdout                      print output to stdout 
+  -f|--force                       force to overwrite existing file(s) 
+  --skip-technical                 skip technical reads 
+  --include-technical              include technical reads 
+  -M|--min-read-len                filter by sequence-len 
+  --table                          which seq-table to use in case of pacbio 
+  -B|--bases                       filter by bases 
+  -A|--append                      append to output-file 
+  --fasta                          produce FASTA output 
+  --fasta-unsorted                 produce FASTA output, unsorted 
+  --fasta-ref-tbl                  produce FASTA output from REFERENCE tbl 
+  --fasta-concat-all               concatenate all rows and produce FASTA 
+  --internal-ref                   extract only internal REFERENCEs 
+  --external-ref                   extract only external REFERENCEs 
+  --ref-name                       extract only these REFERENCEs 
+  --ref-report                     enumerate references 
+  --use-name                       print name instead of seq-id 
+  --seq-defline                    custom defline for sequence:  $ac=accession, 
+                                   $sn=spot-name,  $sg=spot-group, $si=spot-id,  
+                                   $ri=read-id, $rl=read-length 
+  --qual-defline                   custom defline for qualities:  same as 
+                                   seq-defline 
+  -U|--only-unaligned              process only unaligned reads 
+  -a|--only-aligned                process only aligned reads 
+  --disk-limit                     explicitly set disk-limit 
+  --disk-limit-tmp                 explicitly set disk-limit for temp. files 
+  --size-check                     switch to control: on=perform size-check 
+                                   (default),  off=do not perform size-check,  
+                                   only=perform size-check only 
+  --ngc <PATH>                     PATH to ngc file 
+
+  -h|--help                        Output brief explanation for the program. 
+  -V|--version                     Display the version of the program then 
+                                   quit. 
+  -L|--log-level <level>           Logging level as number or enum string. One 
+                                   of (fatal|sys|int|err|warn|info|debug) or 
+                                   (0-6) Current/default is warn. 
+  -v|--verbose                     Increase the verbosity of the program 
+                                   status messages. Use multiple times for more 
+                                   verbosity. Negates quiet. 
+  -q|--quiet                       Turn off all status messages for the 
+                                   program. Negated by verbose. 
+  --option-file <file>             Read more options and parameters from the 
+                                   file. 
+for more information visit:
+   https://github.com/ncbi/sra-tools/wiki/HowTo:-fasterq-dump
+   https://github.com/ncbi/sra-tools/wiki/08.-prefetch-and-fasterq-dump
\ No newline at end of file
diff --git a/src/sra_tools/sra_tools_fasterq_dump/script.sh b/src/sra_tools/sra_tools_fasterq_dump/script.sh
new file mode 100644
index 00000000..bc011d0e
--- /dev/null
+++ b/src/sra_tools/sra_tools_fasterq_dump/script.sh
@@ -0,0 +1,68 @@
+#!/usr/bin/env bash
+# Wrapper for fasterq-dump: translates par_* variables into command-line options.
+set -eo pipefail
+
+# Flags that hold the literal string "false" are unset, so that the
+# ${var:+--flag} expansions below only emit a flag when it is enabled.
+unset_if_false=(
+    par_details
+    par_progress
+    par_split_spot
+    par_split_files
+    par_split_3
+    par_concatenate_reads
+    par_skip_technical
+    par_include_technical
+    par_fasta
+    par_fasta_unsorted
+    par_fasta_reference_table
+    par_fasta_concat_all
+    par_internal_ref
+    par_external_ref
+    par_only_unaligned
+    par_only_aligned
+)
+
+for par in "${unset_if_false[@]}"; do
+    if [[ "${!par}" == "false" ]]; then unset "$par"; fi
+done
+
+# Exactly one of the two input sources must be given.
+if [[ -z "$par_accession" && -z "$par_prefetch_directory" ]]; then
+    echo "Either 'accession' or 'prefetch_directory' must be specified." >&2
+    exit 1
+fi
+if [[ -n "$par_accession" && -n "$par_prefetch_directory" ]]; then
+    echo "'accession' and 'prefetch_directory' are mutually exclusive arguments." >&2
+    exit 1
+fi
+input="${par_accession:-$par_prefetch_directory}"
+
+# NOTE(review): par_ref_name is declared in config.vsh.yaml but not forwarded.
+fasterq-dump \
+    ${par_details:+--details} \
+    ${par_progress:+--progress} \
+    ${par_split_spot:+--split-spot} \
+    ${par_split_files:+--split-files} \
+    ${par_concatenate_reads:+--concatenate-reads} \
+    ${par_split_3:+--split-3} \
+    ${par_skip_technical:+--skip-technical} \
+    ${par_include_technical:+--include-technical} \
+    ${par_minimal_read_length:+--min-read-len "$par_minimal_read_length"} \
+    ${par_table:+--table "$par_table"} \
+    ${par_bases:+--bases "$par_bases"} \
+    ${par_fasta:+--fasta} \
+    ${par_fasta_unsorted:+--fasta-unsorted} \
+    ${par_fasta_reference_table:+--fasta-ref-tbl} \
+    ${par_fasta_concat_all:+--fasta-concat-all} \
+    ${par_internal_ref:+--internal-ref} \
+    ${par_external_ref:+--external-ref} \
+    ${par_seq_defline:+--seq-defline "$par_seq_defline"} \
+    ${par_qual_defline:+--qual-defline "$par_qual_defline"} \
+    ${par_only_unaligned:+--only-unaligned} \
+    ${par_only_aligned:+--only-aligned} \
+    ${par_log_level:+--log-level "$par_log_level"} \
+    ${meta_memory_mb:+--mem "${meta_memory_mb}M"} \
+    --force \
+    "$input"
+    
\ No newline at end of file