From b1eaaa997333f7d7398b6a4cb6b1228f3cf3aeed Mon Sep 17 00:00:00 2001
From: DriesSchaumont <5946712+DriesSchaumont@users.noreply.github.com>
Date: Fri, 8 Nov 2024 09:00:17 +0000
Subject: [PATCH] Add fasterq dump

---
 .../sra_tools_fasterq_dump/config.vsh.yaml    | 252 ++++++++++++++++++
 .../sra_tools_fasterq_dump/helpt.txt          |  70 +++++
 .../sra_tools_fasterq_dump/script.sh          |  68 +++++
 3 files changed, 390 insertions(+)
 create mode 100644 src/sra_tools/sra_tools_fasterq_dump/config.vsh.yaml
 create mode 100644 src/sra_tools/sra_tools_fasterq_dump/helpt.txt
 create mode 100644 src/sra_tools/sra_tools_fasterq_dump/script.sh

diff --git a/src/sra_tools/sra_tools_fasterq_dump/config.vsh.yaml b/src/sra_tools/sra_tools_fasterq_dump/config.vsh.yaml
new file mode 100644
index 00000000..255bef01
--- /dev/null
+++ b/src/sra_tools/sra_tools_fasterq_dump/config.vsh.yaml
@@ -0,0 +1,252 @@
+name: sra_tools_fasterq_dump
+namespace: sra_tools
+description:
+  The fasterq-dump tool extracts data in FASTQ- or FASTA-format from SRA-accessions.
+argument_groups:
+  - name: Inputs
+    arguments:
+      - name: --accession
+        description: |
+          SRA-accession to download. Does not require running the SRA tools 'prefetch'
+          command first to download the file. Mutually exclusive with --prefetch_directory.
+        type: string
+        required: false
+      - name: --prefetch_directory
+        type: file
+        required: false
+        direction: input
+        description: |
+          Directory generated by SRA tool 'prefetch'. Mutually exclusive with --accession.
+  - name: "Output"
+
+  - name: "Options"
+    arguments:
+      - name: --details
+        alternatives: ["-x"]
+        type: boolean_true
+        description: |
+          Print details
+      - name: --progress
+        alternatives: [-p]
+        type: boolean_true
+        description: |
+          Show progress
+      - name: --split_spot
+        alternatives: ["-s"]
+        type: boolean_true
+        description: |
+          Split spots into reads
+      - name: --split_files
+        alternatives: ["-S"]  # upper-case: lower-case -s is already taken by --split_spot
+        type: boolean_true
+        description: |
+          Write reads into different files
+      - name: --split_3
+        alternatives: ["-3"]
+        type: boolean_true
+        description: |
+          Writes single reads in special file.
+      - name: "--concatenate_reads"
+        type: boolean_true
+        description: |
+          Writes whole spots into one file
+      - name: --skip_technical
+        type: boolean_true
+        description: |
+          Skip technical reads.
+      - name: --include_technical
+        type: boolean_true
+        description: |
+          Include technical reads.
+      - name: --minimal_read_length
+        type: integer
+        min: 1
+        description: |
+          Filter by sequence length.
+        required: false
+      - name: --bases
+        alternatives: [-B]
+        type: string
+        description: |
+          Filter by bases
+        required: false
+      - name: --table
+        type: string
+        description: |
+          Name of consensus-table to use for pacbio reads.
+        required: false
+      - name: --fasta
+        type: boolean_true
+        description: |
+          Produce fasta output
+      - name: --fasta_unsorted
+        type: boolean_true
+        description: |
+          Produce unsorted FASTA output
+      - name: --fasta_reference_table
+        type: boolean_true
+        description: |
+          Produce FASTA output from REFERENCE table.
+      - name: --fasta_concat_all
+        type: boolean_true
+        description: |
+          Concatenate all rows and produce FASTA output
+      - name: --internal_ref
+        type: boolean_true
+        description: |
+          Extracts only internal references into the output file.
+          Internal references are non-standard scaffoldings the submitter
+          included in the submission and the bases of them are stored in the accession.
+      - name: --external_ref
+        type: boolean_true
+        description: |
+          Extracts only external references into the output file.
+          External references are canonical RefSeq accessions that are used in the accession.
+      - name: --ref_name
+        type: string
+        multiple: true
+        required: false
+        description: |
+          This command extracts only the named reference. If a reference is not found, the option is ignored.
+          If none of the requested references is found, an empty file is produced. Each reference used by an
+          accession can be named in 2 ways: the canonical name like 'NC_001133.9' or the user-supplied name
+          like 'I' or 'chr1'. The user can use the '--ref_report' option to inspect the names used.
+      - name: --seq_defline
+        type: string
+        required: false
+        example: "@$ac.$si $sn length=$rl"
+        description: |
+          Supply a defline for the sequence sections of FASTQ or FASTA.
+          The format is a text that may contain these variables:
+            $ac ... the accession
+            $sn ... the spot-name
+            $sg ... the spot-group
+            $si ... the spot-id ( the number of the spot )
+            $ri ... the read-id ( the number of a read within a spot )
+            $rl ... the read-length
+          The accession, spot-id, read-id, and read-length are always available - but the spot-group and/or spot-name
+          might be missing or empty. If a variable is missing or empty it does not produce an error - it will be
+          omitted from the defline.
+
+          Defaults for FASTQ:
+            if not splitting: @$ac.$si $sn length=$rl
+            if splitting: @$ac.$si/$ri $sn length=$rl
+          Defaults for FASTA:
+            if not splitting: >$ac.$si $sn length=$rl
+            if splitting: >$ac.$si/$ri $sn length=$rl
+
+      - name: --qual_defline
+        type: string
+        required: false
+        example: "+$ac.$si $sn length=$rl"
+        description: |
+          Supply a defline for the quality sections of FASTQ.
+          The format is a text that may contain these variables:
+            $ac ... the accession
+            $sn ... the spot-name
+            $sg ... the spot-group
+            $si ... the spot-id ( the number of the spot )
+            $ri ...
the read-id ( the number of a read within a spot )
+            $rl ... the read-length
+          The accession, spot-id, read-id, and read-length are always available - but the spot-group and/or spot-name
+          might be missing or empty. If a variable is missing or empty it does not produce an error - it will be
+          omitted from the defline.
+
+          Defaults for FASTQ:
+            if not splitting: +$ac.$si $sn length=$rl
+            if splitting: +$ac.$si/$ri $sn length=$rl
+      - name: --only_unaligned
+        alternatives: [-U]
+        type: boolean_true
+        description: |
+          Process only unaligned reads.
+      - name: "--only_aligned"
+        alternatives: [-a]
+        type: boolean_true
+        description: |
+          Process only aligned reads.
+      - name: "--log_level"  # snake_case like the other arguments
+        alternatives: ["-L"]
+        type: string
+        choices: [fatal, sys, int, err, warn, info, debug]
+        example: warn
+        required: false
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+# Unused arguments:
+#   -F|--format             format (special, fastq, default=fastq)
+#   -o|--outfile            output-file
+#   -O|--outdir             output-dir
+#   -b|--bufsize            size of file-buffer dflt=1MB
+#   -c|--curcache           size of cursor-cache dflt=10MB
+#   -m|--mem                memory limit for sorting dflt=100MB
+#   -t|--temp               where to put temp.
files dflt=curr dir
+#   -e|--threads            how many thread dflt=6
+#   -p|--progress           show progress
+#   -x|--details            print details
+#   -s|--split-spot         split spots into reads
+#   -S|--split-files        write reads into different files
+#   -3|--split-3            writes single reads in special file
+#   --concatenate-reads     writes whole spots into one file
+#   -Z|--stdout             print output to stdout
+#   -f|--force              force to overwrite existing file(s)
+#   --skip-technical        skip technical reads
+#   --include-technical     include technical reads
+#   -M|--min-read-len       filter by sequence-len
+#   --table                 which seq-table to use in case of pacbio
+#   -B|--bases              filter by bases
+#   -A|--append             append to output-file
+#   --fasta                 produce FASTA output
+#   --fasta-unsorted        produce FASTA output, unsorted
+#   --fasta-ref-tbl         produce FASTA output from REFERENCE tbl
+#   --fasta-concat-all      concatenate all rows and produce FASTA
+#   --internal-ref          extract only internal REFERENCEs
+#   --external-ref          extract only external REFERENCEs
+#   --ref-name              extract only these REFERENCEs
+#   --ref-report            enumerate references
+#   --use-name              print name instead of seq-id
+#   --seq-defline           custom defline for sequence: $ac=accession,
+#                           $sn=spot-name, $sg=spot-group, $si=spot-id,
+#                           $ri=read-id, $rl=read-length
+#   --qual-defline          custom defline for qualities: same as
+#                           seq-defline
+#   -U|--only-unaligned     process only unaligned reads
+#   -a|--only-aligned       process only aligned reads
+#   --disk-limit            explicitly set disk-limit
+#   --disk-limit-tmp        explicitly set disk-limit for temp. files
+#   --size-check            switch to control: on=perform size-check
+#                           (default), off=do not perform size-check,
+#                           only=perform size-check only
+#   --ngc                   PATH to ngc file
+
+#   -h|--help               Output brief explanation for the program.
+#   -V|--version            Display the version of the program then
+#                           quit.
+#   -L|--log-level          Logging level as number or enum string. One
+#                           of (fatal|sys|int|err|warn|info|debug) or
+#                           (0-6) Current/default is warn.
+#   -v|--verbose            Increase the verbosity of the program
+#                           status messages. Use multiple times for more
+#                           verbosity.
Negates quiet.
+#   -q|--quiet              Turn off all status messages for the
+#                           program. Negated by verbose.
+#   --option-file           Read more options and parameters from the
+#                           file.
\ No newline at end of file
diff --git a/src/sra_tools/sra_tools_fasterq_dump/helpt.txt b/src/sra_tools/sra_tools_fasterq_dump/helpt.txt
new file mode 100644
index 00000000..a5eadbfa
--- /dev/null
+++ b/src/sra_tools/sra_tools_fasterq_dump/helpt.txt
@@ -0,0 +1,70 @@
+```
+docker run -t --rm ncbi/sra-tools fasterq-dump --help
+```
+
+Usage:
+  fasterq-dump [options]
+  fasterq-dump [options]
+
+Options:
+  -F|--format             format (special, fastq, default=fastq)
+  -o|--outfile            output-file
+  -O|--outdir             output-dir
+  -b|--bufsize            size of file-buffer dflt=1MB
+  -c|--curcache           size of cursor-cache dflt=10MB
+  -m|--mem                memory limit for sorting dflt=100MB
+  -t|--temp               where to put temp. files dflt=curr dir
+  -e|--threads            how many thread dflt=6
+  -p|--progress           show progress
+  -x|--details            print details
+  -s|--split-spot         split spots into reads
+  -S|--split-files        write reads into different files
+  -3|--split-3            writes single reads in special file
+  --concatenate-reads     writes whole spots into one file
+  -Z|--stdout             print output to stdout
+  -f|--force              force to overwrite existing file(s)
+  --skip-technical        skip technical reads
+  --include-technical     include technical reads
+  -M|--min-read-len       filter by sequence-len
+  --table                 which seq-table to use in case of pacbio
+  -B|--bases              filter by bases
+  -A|--append             append to output-file
+  --fasta                 produce FASTA output
+  --fasta-unsorted        produce FASTA output, unsorted
+  --fasta-ref-tbl         produce FASTA output from REFERENCE tbl
+  --fasta-concat-all      concatenate all rows and produce FASTA
+  --internal-ref          extract only internal REFERENCEs
+  --external-ref          extract only external REFERENCEs
+  --ref-name              extract only these REFERENCEs
+  --ref-report            enumerate references
+  --use-name              print name instead of seq-id
+  --seq-defline           custom defline for sequence: $ac=accession,
+                          $sn=spot-name, $sg=spot-group,
$si=spot-id,
+                          $ri=read-id, $rl=read-length
+  --qual-defline          custom defline for qualities: same as
+                          seq-defline
+  -U|--only-unaligned     process only unaligned reads
+  -a|--only-aligned       process only aligned reads
+  --disk-limit            explicitly set disk-limit
+  --disk-limit-tmp        explicitly set disk-limit for temp. files
+  --size-check            switch to control: on=perform size-check
+                          (default), off=do not perform size-check,
+                          only=perform size-check only
+  --ngc                   PATH to ngc file
+
+  -h|--help               Output brief explanation for the program.
+  -V|--version            Display the version of the program then
+                          quit.
+  -L|--log-level          Logging level as number or enum string. One
+                          of (fatal|sys|int|err|warn|info|debug) or
+                          (0-6) Current/default is warn.
+  -v|--verbose            Increase the verbosity of the program
+                          status messages. Use multiple times for more
+                          verbosity. Negates quiet.
+  -q|--quiet              Turn off all status messages for the
+                          program. Negated by verbose.
+  --option-file           Read more options and parameters from the
+                          file.
+for more information visit:
+   https://github.com/ncbi/sra-tools/wiki/HowTo:-fasterq-dump
+   https://github.com/ncbi/sra-tools/wiki/08.-prefetch-and-fasterq-dump
\ No newline at end of file
diff --git a/src/sra_tools/sra_tools_fasterq_dump/script.sh b/src/sra_tools/sra_tools_fasterq_dump/script.sh
new file mode 100644
index 00000000..bc011d0e
--- /dev/null
+++ b/src/sra_tools/sra_tools_fasterq_dump/script.sh
@@ -0,0 +1,80 @@
+#!/usr/bin/env bash
+
+# Wrapper around `fasterq-dump`: turns the par_* variables into command-line
+# flags and runs the tool on either an accession or a prefetch directory.
+
+set -eo pipefail
+
+# Flag variables (declared as boolean_true in the config) whose value is the
+# string "false" are unset here, so the ${var:+--flag} expansions below only
+# emit flags that were actually requested.
+unset_if_false=(
+  par_details
+  par_progress
+  par_split_spot
+  par_split_files
+  par_split_3
+  par_concatenate_reads
+  par_skip_technical
+  par_include_technical
+  par_fasta
+  par_fasta_unsorted
+  par_fasta_reference_table
+  par_fasta_concat_all
+  par_internal_ref
+  par_external_ref
+  par_only_unaligned
+  par_only_aligned
+)
+
+for par in "${unset_if_false[@]}"; do
+  if [[ "${!par}" == "false" ]]; then
+    unset "$par"
+  fi
+done
+
+# Exactly one of the two input sources must be given.
+if [[ -z "$par_accession" && -z "$par_prefetch_directory" ]]; then
+  echo "Either 'accession' or 'prefetch_directory' must be specified." >&2
+  exit 1
+fi
+
+if [[ -n "$par_accession" && -n "$par_prefetch_directory" ]]; then
+  echo "'accession' and 'prefetch_directory' are mutually exclusive arguments." >&2
+  exit 1
+fi
+
+# ':-' (not '-') so that an empty accession also falls back to the directory.
+input="${par_accession:-$par_prefetch_directory}"
+
+# TODO(review): --ref_name is declared in config.vsh.yaml but is not forwarded
+# to fasterq-dump here.
+#
+# Valued options quote their value inside the expansion so that values with
+# spaces (e.g. deflines) reach fasterq-dump as a single argument.
+fasterq-dump \
+  ${par_details:+--details} \
+  ${par_progress:+--progress} \
+  ${par_split_spot:+--split-spot} \
+  ${par_split_files:+--split-files} \
+  ${par_concatenate_reads:+--concatenate-reads} \
+  ${par_split_3:+--split-3} \
+  ${par_skip_technical:+--skip-technical} \
+  ${par_include_technical:+--include-technical} \
+  ${par_minimal_read_length:+--min-read-len "$par_minimal_read_length"} \
+  ${par_table:+--table "$par_table"} \
+  ${par_bases:+--bases "$par_bases"} \
+  ${par_fasta:+--fasta} \
+  ${par_fasta_unsorted:+--fasta-unsorted} \
+  ${par_fasta_reference_table:+--fasta-ref-tbl} \
+  ${par_fasta_concat_all:+--fasta-concat-all} \
+  ${par_internal_ref:+--internal-ref} \
+  ${par_external_ref:+--external-ref} \
+  ${par_seq_defline:+--seq-defline "$par_seq_defline"} \
+  ${par_qual_defline:+--qual-defline "$par_qual_defline"} \
+  ${par_only_unaligned:+--only-unaligned} \
+  ${par_only_aligned:+--only-aligned} \
+  ${par_log_level:+--log-level "$par_log_level"} \
+  ${meta_memory_mb:+--mem "${meta_memory_mb}M"} \
+  --force \
+  "$input"