diff --git a/CHANGELOG.md b/CHANGELOG.md index c8d0a08b..151b7a41 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -43,10 +43,14 @@ - `samtools/samtools_index`: Index SAM/BAM/CRAM files (PR #35). - `samtools/samtools_sort`: Sort SAM/BAM/CRAM files (PR #36). - `samtools/samtools_stats`: Reports alignment summary statistics for a BAM file (PR #39). - - `samtools/samtools_stats`: Indexes FASTA files to enable random access to fasta and fastq files (PR #41). + - `samtools/samtools_faidx`: Indexes FASTA files to enable random access to fasta and fastq files (PR #41). + - `samtools/samtools_collate`: Shuffles and groups reads in SAM/BAM/CRAM files together by their names (PR #42). + - `samtools/samtools_view`: Views and converts SAM/BAM/CRAM files (PR #48). + - `samtools/samtools_fastq`: Converts a SAM/BAM/CRAM file to FASTQ (PR #52). * `falco`: A C++ drop-in replacement of FastQC to assess the quality of sequence read data (PR #43). + ## MAJOR CHANGES ## MINOR CHANGES @@ -55,6 +59,10 @@ * Update to Viash 0.8.5 (PR #25). +* Update to Viash 0.9.0-RC3 (PR #51). + ## DOCUMENTATION ## BUG FIXES + +* Add escaping character before leading hashtag in the description field of the config file (PR #50). \ No newline at end of file diff --git a/README.md b/README.md index 1639481f..ecf807ca 100644 --- a/README.md +++ b/README.md @@ -26,34 +26,25 @@ We encourage contributions from the community. To contribute: ## Contribution Guidelines -- **Documentation of Functionality**: The purpose and functionality of - each component should be adequately described. -- **Documentation of Inputs and Outputs**: All input and output - arguments should have a description and example (with extension). -- **Docker Image**: A Docker image (with optional additional - dependencies) should be provided. -- **Write unit tests**: A unit test with possibly test resources needs - to be provided. 
-- **Provide test resources**: If the unit test requires test resources, - these should be provided in the `test_resources` section of the - component. -- **Versioning**: If the component uses custom software (not installed - via Apt, Apk, Yum, Pip, Conda, or R), a Bash script `version.sh` needs - to be provided that outputs the version of the software. -- **File format specifications**: If a component returns a directory or - data structure such as AnnData or MuData, a specification of the file - format should be provided. +The contribution guidelines describes which steps you should follow to +contribute a component to this repository. + +1. Find a component to contribute +2. Add config template +3. Fill in the metadata +4. Find a suitable container +5. Create help file +6. Create or fetch test data +7. Add arguments for the input files +8. Add arguments for the output files +9. Add arguments for the other arguments +10. Add a Docker engine +11. Write a runner script +12. Create test script +13. Create a `/var/software_versions.txt` file See the [CONTRIBUTING](CONTRIBUTING.md) file for more details. -## Repository Structure - -… - -## Installation and Usage - -… - ## Support and Community For support, questions, or to join our community: diff --git a/README.qmd b/README.qmd index f7e5139b..656cdac7 100644 --- a/README.qmd +++ b/README.qmd @@ -21,29 +21,24 @@ We encourage contributions from the community. To contribute: ## Contribution Guidelines +The contribution guidelines describes which steps you should follow to contribute a component to this repository. 
+ ```{r echo=FALSE} lines <- readr::read_lines("CONTRIBUTING.md") -index_start <- grep("^## ", lines) +index_start <- grep("^### Step [0-9]*:", lines) + index_end <- c(index_start[-1] - 1, length(lines)) -name <- gsub("^## ", "", lines[index_start]) -description <- lines[index_start + 2] +name <- gsub("^### Step [0-9]*: *", "", lines[index_start]) knitr::asis_output( - paste(paste0(" * **", name, "**: ", description, "\n"), collapse = "") + paste(paste0(" 1. ", name, "\n"), collapse = "") ) ``` See the [CONTRIBUTING](CONTRIBUTING.md) file for more details. -## Repository Structure - -... - -## Installation and Usage - -... ## Support and Community diff --git a/_viash.yaml b/_viash.yaml index a72a1ab7..397a8d69 100644 --- a/_viash.yaml +++ b/_viash.yaml @@ -7,4 +7,7 @@ links: issue_tracker: https://github.com/viash-hub/biobase/issues repository: https://github.com/viash-hub/biobase -viash_version: 0.9.0-RC2 \ No newline at end of file +viash_version: 0.9.0-RC3 + +config_mods: | + .requirements.commands := ['ps'] diff --git a/src/bcl_convert/config.vsh.yaml b/src/bcl_convert/config.vsh.yaml index 7a9bce24..d0cedfa7 100644 --- a/src/bcl_convert/config.vsh.yaml +++ b/src/bcl_convert/config.vsh.yaml @@ -50,20 +50,20 @@ argument_groups: example: true - name: --bcl_num_parallel_tiles type: integer - description: "# of tiles to process in parallel (default 1)" + description: "\\# of tiles to process in parallel (default 1)" example: 1 - name: --bcl_num_conversion_threads type: integer - description: "# of threads for conversion (per tile, default # cpu threads)" + description: "\\# of threads for conversion (per tile, default # cpu threads)" example: 1 - name: --bcl_num_compression_threads type: integer - description: "# of threads for fastq.gz output compression (per tile, default # cpu threads, or HW+12)" + description: "\\# of threads for fastq.gz output compression (per tile, default # cpu threads, or HW+12)" example: 1 - name: --bcl_num_decompression_threads type: 
integer description: - "# of threads for bcl/cbcl input decompression (per tile, default half # cpu threads, or HW+8). + "\\# of threads for bcl/cbcl input decompression (per tile, default half # cpu threads, or HW+8). Only applies when preloading files" example: 1 @@ -79,7 +79,7 @@ argument_groups: example: true - name: --num_unknown_barcodes_reported type: integer - description: "# of Top Unknown Barcodes to output (1000 by default)" + description: "\\# of Top Unknown Barcodes to output (1000 by default)" example: 1000 - name: --bcl_validate_sample_sheet_only type: boolean diff --git a/src/falco/config.vsh.yaml b/src/falco/config.vsh.yaml index 61cbb0d5..4d9cf656 100644 --- a/src/falco/config.vsh.yaml +++ b/src/falco/config.vsh.yaml @@ -177,7 +177,7 @@ engines: image: debian:trixie-slim setup: - type: apt - packages: [wget, build-essential, g++, zlib1g-dev] + packages: [wget, build-essential, g++, zlib1g-dev, procps] - type: docker run: | wget https://github.com/smithlabcode/falco/releases/download/v1.2.2/falco-1.2.2.tar.gz -O /tmp/falco.tar.gz && \ diff --git a/src/gffread/config.vsh.yaml b/src/gffread/config.vsh.yaml index c7f3d551..d2c41a87 100644 --- a/src/gffread/config.vsh.yaml +++ b/src/gffread/config.vsh.yaml @@ -1,5 +1,4 @@ name: gffread -namespace: gffread description: Validate, filter, convert and perform various other operations on GFF files. keywords: [gff, conversion, validation, filtering] links: diff --git a/src/multiqc/config.vsh.yaml b/src/multiqc/config.vsh.yaml index 7f29d109..0a3a784b 100644 --- a/src/multiqc/config.vsh.yaml +++ b/src/multiqc/config.vsh.yaml @@ -161,6 +161,12 @@ argument_groups: type: boolean_true description: | Disable coloured log output. + - name: "--cl_config" + type: string + required: false + description: | + YAML formatted string that allows to customize MultiQC behaviour like input file detection. 
+ example: "qualimap_config: { general_stats_coverage: [20,40,200] }" - name: "Output format" arguments: diff --git a/src/multiqc/script.sh b/src/multiqc/script.sh index 6b04ff5b..6353eb11 100755 --- a/src/multiqc/script.sh +++ b/src/multiqc/script.sh @@ -1,7 +1,7 @@ #!/bin/bash # disable flags -[[ "$par_ignore_symlinks" == "false" ]] && unset ignore_symlinks +[[ "$par_ignore_symlinks" == "false" ]] && unset par_ignore_symlinks [[ "$par_dirs" == "false" ]] && unset par_dirs [[ "$par_full_names" == "false" ]] && unset par_full_names [[ "$par_fn_as_s_name" == "false" ]] && unset par_fn_as_s_name @@ -99,6 +99,7 @@ multiqc \ ${include_modules} \ ${par_include_modules:+--include-modules "$par_include_modules"} \ ${par_data_format:+--data-format "$par_data_format"} \ + ${par_cl_config:+--cl-config "$par_cl_config"} \ ${par_zip_data_dir:+--zip-data-dir} \ ${par_pdf:+--pdf} \ ${par_interactive:+--interactive} \ diff --git a/src/samtools/samtools_collate/config.vsh.yaml b/src/samtools/samtools_collate/config.vsh.yaml new file mode 100644 index 00000000..669f4cdf --- /dev/null +++ b/src/samtools/samtools_collate/config.vsh.yaml @@ -0,0 +1,94 @@ +name: samtools_collate +namespace: samtools +description: Shuffles and groups reads in SAM/BAM/CRAM files together by their names. +keywords: [collate, counts, bam, sam, cram] +links: + homepage: https://www.htslib.org/ + documentation: https://www.htslib.org/doc/samtools-collate.html + repository: https://github.com/samtools/samtools +references: + doi: [10.1093/bioinformatics/btp352, 10.1093/gigascience/giab008] +license: MIT/Expat + +argument_groups: + - name: Inputs + arguments: + - name: --input + type: file + description: The input BAM file. + required: true + - name: --reference + type: file + description: Reference sequence FASTA FILE. + + - name: Outputs + arguments: + - name: --output + alternatives: -o + type: file + description: The output filename.
+ required: true + direction: output + + - name: Options + arguments: + - name: --uncompressed + alternatives: -u + type: boolean_true + description: Output uncompressed BAM. + - name: --fast + alternatives: -f + type: boolean_true + description: Fast mode, only primary alignments. + - name: --working_reads + alternatives: -r + type: integer + description: Working reads stored (for use with -f). + default: 10000 + - name: --compression + alternatives: -l + type: integer + description: Compression level. + default: 1 + - name: --nb_tmp_files + alternatives: -n + type: integer + description: Number of temporary files. + default: 64 + - name: --tmp_prefix + alternatives: -T + type: string + description: Write temporary files to PREFIX.nnnn.bam. + - name: --no_pg + type: boolean_true + description: Do not add a PG line. + - name: --input_fmt_option + type: string + description: Specify a single input file format option in the form of OPTION or OPTION=VALUE. + - name: --output_fmt + type: string + description: Specify output format (SAM, BAM, CRAM). + - name: --output_fmt_option + type: string + description: Specify a single output file format option in the form of OPTION or OPTION=VALUE. + + +resources: + - type: bash_script + path: script.sh +test_resources: + - type: bash_script + path: test.sh + - type: file + path: test_data +engines: + - type: docker + image: quay.io/biocontainers/samtools:1.19.2--h50ea8bc_1 + setup: + - type: docker + run: | + samtools --version 2>&1 | grep -E '^(samtools|Using htslib)' | \ + sed 's#Using ##;s# \([0-9\.]*\)$#: \1#' > /var/software_versions.txt +runners: +- type: executable +- type: nextflow diff --git a/src/samtools/samtools_collate/help.txt b/src/samtools/samtools_collate/help.txt new file mode 100644 index 00000000..16190f4b --- /dev/null +++ b/src/samtools/samtools_collate/help.txt @@ -0,0 +1,31 @@ +``` +samtools collate +``` +Usage: samtools collate [options...] 
[]
+
+Options:
+ -O Output to stdout
+ -o Output file name (use prefix if not set)
+ -u Uncompressed BAM output
+ -f Fast (only primary alignments)
+ -r Working reads stored (with -f) [10000]
+ -l INT Compression level [1]
+ -n INT Number of temporary files [64]
+ -T PREFIX
+ Write temporary files to PREFIX.nnnn.bam
+ --no-PG do not add a PG line
+ --input-fmt-option OPT[=VAL]
+ Specify a single input file format option in the form
+ of OPTION or OPTION=VALUE
+ --output-fmt FORMAT[,OPT[=VAL]]...
+ Specify output format (SAM, BAM, CRAM)
+ --output-fmt-option OPT[=VAL]
+ Specify a single output file format option in the form
+ of OPTION or OPTION=VALUE
+ --reference FILE
+ Reference sequence FASTA FILE [null]
+ -@, --threads INT
+ Number of additional threads to use [0]
+ --verbosity INT
+ Set level of verbosity
+ is required unless the -o or -O options are used.
\ No newline at end of file
diff --git a/src/samtools/samtools_collate/script.sh b/src/samtools/samtools_collate/script.sh
new file mode 100644
index 00000000..25847a52
--- /dev/null
+++ b/src/samtools/samtools_collate/script.sh
@@ -0,0 +1,31 @@
+#!/bin/bash
+
+## VIASH START
+## VIASH END
+
+set -e
+
+# Flags that arrive as the literal string "false" are unset so that the
+# ${var:+...} expansions below leave them out of the command line.
+[[ "$par_uncompressed" == "false" ]] && unset par_uncompressed
+[[ "$par_fast" == "false" ]] && unset par_fast
+[[ "$par_no_pg" == "false" ]] && unset par_no_pg
+
+# Long option names are used where samtools collate has no dedicated short
+# option: -T is the temporary-file prefix and -O means "output to stdout".
+samtools collate \
+  "$par_input" \
+  ${par_output:+-o "$par_output"} \
+  ${par_reference:+--reference "$par_reference"} \
+  ${par_uncompressed:+-u} \
+  ${par_fast:+-f} \
+  ${par_working_reads:+-r "$par_working_reads"} \
+  ${par_compression:+-l "$par_compression"} \
+  ${par_nb_tmp_files:+-n "$par_nb_tmp_files"} \
+  ${par_tmp_prefix:+-T "$par_tmp_prefix"} \
+  ${par_no_pg:+--no-PG} \
+  ${par_input_fmt_option:+--input-fmt-option "$par_input_fmt_option"} \
+  ${par_output_fmt:+--output-fmt "$par_output_fmt"} \
+  ${par_output_fmt_option:+--output-fmt-option "$par_output_fmt_option"}
+
+exit 0
diff --git a/src/samtools/samtools_collate/test.sh b/src/samtools/samtools_collate/test.sh
new file mode 100644
index
00000000..c5a2e6e6
--- /dev/null
+++ b/src/samtools/samtools_collate/test.sh
@@ -0,0 +1,72 @@
+#!/bin/bash
+
+# Tests for samtools_collate: every test runs the component on the sorted BAM
+# in test_data and compares the records of the produced file (as printed by
+# `samtools view`) with the matching expected BAM stored in test_data.
+
+test_dir="${meta_resources_dir}/test_data"
+out_dir="${meta_resources_dir}/out"
+
+############################################################################################
+
+echo ">>> Test 1: $meta_functionality_name"
+"$meta_executable" \
+  --input "$test_dir/test.paired_end.sorted.bam" \
+  --output "$out_dir/collated.bam"
+
+echo ">>> Checking whether output exists"
+[ ! -f "$out_dir/collated.bam" ] && echo "File 'collated.bam' does not exist!" && exit 1
+
+echo ">>> Checking whether output is non-empty"
+[ ! -s "$out_dir/collated.bam" ] && echo "File 'collated.bam' is empty!" && exit 1
+
+echo ">>> Checking whether output is correct"
+# A brace group is used instead of a subshell so that `exit 1` ends this script.
+diff <(samtools view "$out_dir/collated.bam") \
+  <(samtools view "$test_dir/collated.bam") || \
+  { echo "Output file collated.bam does not match expected output"; exit 1; }
+
+############################################################################################
+
+echo ">>> Test 2: $meta_functionality_name with --fast option"
+"$meta_executable" \
+  --fast \
+  --input "$test_dir/test.paired_end.sorted.bam" \
+  --output "$out_dir/fast_collated.bam"
+
+echo ">>> Checking whether output exists"
+[ ! -f "$out_dir/fast_collated.bam" ] && echo "File 'fast_collated.bam' does not exist!" && exit 1
+
+echo ">>> Checking whether output is non-empty"
+[ ! -s "$out_dir/fast_collated.bam" ] && echo "File 'fast_collated.bam' is empty!" && exit 1
+
+echo ">>> Checking whether output is correct"
+diff <(samtools view "$out_dir/fast_collated.bam") \
+  <(samtools view "$test_dir/fast_collated.bam") || \
+  { echo "Output file fast_collated.bam does not match expected output"; exit 1; }
+
+
+############################################################################################
+
+echo ">>> Test 3: $meta_functionality_name with compression"
+"$meta_executable" \
+  --compression 8 \
+  --input "$test_dir/test.paired_end.sorted.bam" \
+  --output "$out_dir/comp_collated.bam"
+
+echo ">>> Checking whether output exists"
+[ ! -f "$out_dir/comp_collated.bam" ] && echo "File 'comp_collated.bam' does not exist!" && exit 1
+
+echo ">>> Checking whether output is non-empty"
+[ ! -s "$out_dir/comp_collated.bam" ] && echo "File 'comp_collated.bam' is empty!" && exit 1
+
+echo ">>> Checking whether output is correct"
+diff <(samtools view "$out_dir/comp_collated.bam") \
+  <(samtools view "$test_dir/comp_collated.bam") || \
+  { echo "Output file comp_collated.bam does not match expected output"; exit 1; }
+
+############################################################################################
+
+echo ">>> All tests passed successfully."
+ +exit 0 diff --git a/src/samtools/samtools_collate/test_data/collated.bam b/src/samtools/samtools_collate/test_data/collated.bam new file mode 100644 index 00000000..f6d5eab9 Binary files /dev/null and b/src/samtools/samtools_collate/test_data/collated.bam differ diff --git a/src/samtools/samtools_collate/test_data/comp_collated.bam b/src/samtools/samtools_collate/test_data/comp_collated.bam new file mode 100644 index 00000000..1f26cee4 Binary files /dev/null and b/src/samtools/samtools_collate/test_data/comp_collated.bam differ diff --git a/src/samtools/samtools_collate/test_data/fast_collated.bam b/src/samtools/samtools_collate/test_data/fast_collated.bam new file mode 100644 index 00000000..bb78fe5a Binary files /dev/null and b/src/samtools/samtools_collate/test_data/fast_collated.bam differ diff --git a/src/samtools/samtools_collate/test_data/script.sh b/src/samtools/samtools_collate/test_data/script.sh new file mode 100755 index 00000000..f97a7efe --- /dev/null +++ b/src/samtools/samtools_collate/test_data/script.sh @@ -0,0 +1,4 @@ +#!/bin/bash + +# download test data from nf-core module +wget https://github.com/nf-core/test-datasets/raw/modules/data/genomics/sarscov2/illumina/bam/test.paired_end.sorted.bam \ No newline at end of file diff --git a/src/samtools/samtools_collate/test_data/test.paired_end.sorted.bam b/src/samtools/samtools_collate/test_data/test.paired_end.sorted.bam new file mode 100644 index 00000000..85cccf14 Binary files /dev/null and b/src/samtools/samtools_collate/test_data/test.paired_end.sorted.bam differ diff --git a/src/samtools/samtools_fastq/config.vsh.yaml b/src/samtools/samtools_fastq/config.vsh.yaml new file mode 100644 index 00000000..39e926f0 --- /dev/null +++ b/src/samtools/samtools_fastq/config.vsh.yaml @@ -0,0 +1,189 @@ +name: samtools_fastq +namespace: samtools +description: Converts a SAM, BAM or CRAM to FASTQ format.
+keywords: [fastq, bam, sam, cram] +links: + homepage: https://www.htslib.org/ + documentation: https://www.htslib.org/doc/samtools-fastq.html + repository: https://github.com/samtools/samtools +references: + doi: [10.1093/bioinformatics/btp352, 10.1093/gigascience/giab008] +license: MIT/Expat + +argument_groups: + - name: Inputs + arguments: + - name: --input + type: file + description: input SAM/BAM/CRAM file + required: true + - name: Outputs + arguments: + - name: --output + type: file + description: output FASTQ file + required: true + direction: output + - name: Options + arguments: + - name: --no_suffix + alternatives: -n + type: boolean_true + description: | + By default, either '/1' or '/2' is added to the end of read names where the corresponding + READ1 or READ2 FLAG bit is set. Using -n causes read names to be left as they are. + - name: --suffix + alternatives: -N + type: boolean_true + description: | + Always add either '/1' or '/2' to the end of read names even when put into different files. + - name: --use_oq + alternatives: -O + type: boolean_true + description: | + Use quality values from OQ tags in preference to standard quality string if available. + - name: --singleton + alternatives: -s + type: file + description: write singleton reads to FILE. + - name: --copy_tags + alternatives: -t + type: boolean_true + description: | + Copy RG, BC and QT tags to the FASTQ header line, if they exist. + - name: --copy_tags_list + alternatives: -T + type: string + description: | + Specify a comma-separated list of tags to copy to the FASTQ header line, if they exist. + TAGLIST can be blank or * to indicate all tags should be copied to the output. If using *, + be careful to quote it to avoid unwanted shell expansion. + - name: --read1 + alternatives: -1 + type: file + description: | + Write reads with the READ1 FLAG set (and READ2 not set) to FILE instead of outputting them. + If the -s option is used, only paired reads will be written to this file. 
+ direction: output + - name: --read2 + alternatives: -2 + type: file + description: | + Write reads with the READ2 FLAG set (and READ1 not set) to FILE instead of outputting them. + If the -s option is used, only paired reads will be written to this file. + direction: output + - name: --output_reads + alternatives: -o + type: file + description: | + Write reads with either READ1 FLAG or READ2 flag set to FILE instead of outputting them to stdout. + This is equivalent to -1 FILE -2 FILE. + direction: output + - name: --output_reads_both + alternatives: -0 + type: file + description: | + Write reads where the READ1 and READ2 FLAG bits set are either both set or both unset to FILE + instead of outputting them. + direction: output + - name: --filter_flags + alternatives: -f + type: integer + description: | + Only output alignments with all bits set in INT present in the FLAG field. INT can be specified + in hex by beginning with `0x' (i.e. /^0x[0-9A-F]+/) or in octal by beginning with `0' + (i.e. /^0[0-7]+/). + default: 0 + - name: --excl_flags + alternatives: -F + type: string + description: | + Do not output alignments with any bits set in INT present in the FLAG field. INT can be specified + in hex by beginning with `0x' (i.e. /^0x[0-9A-F]+/) or in octal by beginning with `0' + (i.e. /^0[0-7]+/). This defaults to 0x900 representing filtering of secondary and + supplementary alignments. + default: 0x900 + - name: --incl_flags + alternatives: --rf + type: string + description: | + Only output alignments with any bits set in INT present in the FLAG field. INT can be specified + in hex by beginning with `0x' (i.e. /^0x[0-9A-F]+/), in octal by beginning with `0' + (i.e. /^0[0-7]+/), as a decimal number not beginning with '0' or as a comma-separated list of + flag names. + default: 0 + - name: --excl_flags_all + alternatives: -G + type: integer + description: | + Only EXCLUDE reads with all of the bits set in INT present in the FLAG field. 
INT can be specified + in hex by beginning with `0x' (i.e. /^0x[0-9A-F]+/) or in octal by beginning with `0' + (i.e. /^0[0-7]+/). + default: 0 + - name: --aux_tag + alternatives: -d + type: string + description: | + Only output alignments containing an auxiliary tag matching both TAG and VAL. If VAL is omitted + then any value is accepted. The tag types supported are i, f, Z, A and H. "B" arrays are not + supported. This is comparable to the method used in samtools view --tag. The option may be specified + multiple times and is equivalent to using the --aux_tag_file option. + - name: --aux_tag_file + alternatives: -D + type: string + description: | + Only output alignments containing an auxiliary tag matching TAG and having a value listed in FILE. + The format of the file is one line per value. This is equivalent to specifying --aux_tag multiple times. + - name: --casava + alternatives: -i + type: boolean_true + description: add Illumina Casava 1.8 format entry to header (eg 1:N:0:ATCACG) + - name: --compression + alternatives: -c + type: integer + description: set compression level when writing gz or bgzf fastq files. + default: 0 + - name: --index1 + alternatives: --i1 + type: file + description: write first index reads to FILE. + - name: --index2 + alternatives: --i2 + type: file + description: write second index reads to FILE. + - name: --barcode_tag + type: string + description: Auxiliary tag to find index reads in. + default: BC + - name: --quality_tag + type: string + description: Auxiliary tag to find index quality in. + default: QT + - name: --index_format + type: string + description: | + string to describe how to parse the barcode and quality tags. For example: + [i14i8]: the first 14 characters are index 1, the next 8 characters are index 2. + [n8i14]: ignore the first 8 characters, and use the next 14 characters for index 1. 
+ If the tag contains a separator, then the numeric part can be replaced with '*' to mean + 'read until the separator or end of tag', for example: [n*i*]. + +resources: + - type: bash_script + path: script.sh +test_resources: + - type: bash_script + path: test.sh + - type: file + path: test_data +engines: + - type: docker + image: quay.io/biocontainers/samtools:1.19.2--h50ea8bc_1 + setup: + - type: docker + run: | + samtools --version 2>&1 | grep -E '^(samtools|Using htslib)' | \ + sed 's#Using ##;s# \([0-9\.]*\)$#: \1#' > /var/software_versions.txt +runners: +- type: executable +- type: nextflow diff --git a/src/samtools/samtools_fastq/help.txt b/src/samtools/samtools_fastq/help.txt new file mode 100644 index 00000000..39ed0d00 --- /dev/null +++ b/src/samtools/samtools_fastq/help.txt @@ -0,0 +1,80 @@ +``` +samtools fastq +``` + +Usage: samtools fastq [options...] + +Description: +Converts a SAM, BAM or CRAM to FASTQ format. + +Options: + -0 FILE write reads designated READ_OTHER to FILE + -1 FILE write reads designated READ1 to FILE + -2 FILE write reads designated READ2 to FILE + -o FILE write reads designated READ1 or READ2 to FILE + note: if a singleton file is specified with -s, only + paired reads will be written to the -1 and -2 files. 
+ -d, --tag TAG[:VAL] + only include reads containing TAG, optionally with value VAL + -f, --require-flags INT + only include reads with all of the FLAGs in INT present [0] + -F, --excl[ude]-flags INT + only include reads with none of the FLAGs in INT present [0x900] + --rf, --incl[ude]-flags INT + only include reads with any of the FLAGs in INT present [0] + -G INT only EXCLUDE reads with all of the FLAGs in INT present [0] + -n don't append /1 and /2 to the read name + -N always append /1 and /2 to the read name + -O output quality in the OQ tag if present + -s FILE write singleton reads designated READ1 or READ2 to FILE + -t copy RG, BC and QT tags to the FASTQ header line + -T TAGLIST copy arbitrary tags to the FASTQ header line, '*' for all + -v INT default quality score if not given in file [1] + -i add Illumina Casava 1.8 format entry to header (eg 1:N:0:ATCACG) + -c INT compression level [0..9] to use when writing bgzf files [1] + --i1 FILE write first index reads to FILE + --i2 FILE write second index reads to FILE + --barcode-tag TAG + Barcode tag [BC] + --quality-tag TAG + Quality tag [QT] + --index-format STR + How to parse barcode and quality tags + + --input-fmt-option OPT[=VAL] + Specify a single input file format option in the form + of OPTION or OPTION=VALUE + --reference FILE + Reference sequence FASTA FILE [null] + -@, --threads INT + Number of additional threads to use [0] + --verbosity INT + Set level of verbosity + +The files will be automatically compressed if the file names have a .gz +or .bgzf extension. The input to this program must be collated by name. +Run 'samtools collate' or 'samtools sort -n' to achieve this. + +Reads are designated READ1 if FLAG READ1 is set and READ2 is not set. +Reads are designated READ2 if FLAG READ1 is not set and READ2 is set. +Otherwise reads are designated READ_OTHER (both flags set or both flags unset). +Run 'samtools flags' for more information on flag codes and meanings. 
+ +The index-format string describes how to parse the barcode and quality tags. +It is made up of 'i' or 'n' followed by a length or '*'. For example: + i14i8 The first 14 characters are index 1, the next 8 are index 2 + n8i14 Ignore the first 8 characters, and use the next 14 for index 1 + +If the tag contains a separator, then the numeric part can be replaced with +'*' to mean 'read until the separator or end of tag', for example: + i*i* Break the tag at the separator into index 1 and index 2 + n*i* Ignore the left part of the tag until the separator, + then use the second part of the tag as index 1 + +Examples: +To get just the paired reads in separate files, use: + samtools fastq -1 pair1.fq -2 pair2.fq -0 /dev/null -s /dev/null -n in.bam + +To get all non-supplementary/secondary reads in a single file, redirect +the output: + samtools fastq in.bam > all_reads.fq \ No newline at end of file diff --git a/src/samtools/samtools_fastq/script.sh b/src/samtools/samtools_fastq/script.sh new file mode 100644 index 00000000..367432f9 --- /dev/null +++ b/src/samtools/samtools_fastq/script.sh @@ -0,0 +1,40 @@ +#!/bin/bash + +## VIASH START +## VIASH END + +set -e + +[[ "$par_no_suffix" == "false" ]] && unset par_no_suffix +[[ "$par_suffix" == "false" ]] && unset par_suffix +[[ "$par_use_oq" == "false" ]] && unset par_use_oq +[[ "$par_copy_tags" == "false" ]] && unset par_copy_tags +[[ "$par_casava" == "false" ]] && unset par_casava + +samtools fastq \ + ${par_no_suffix:+-n} \ + ${par_suffix:+-N} \ + ${par_use_oq:+-O} \ + ${par_singleton:+-s "$par_singleton"} \ + ${par_copy_tags:+-t} \ + ${par_copy_tags_list:+-T "$par_copy_tags_list"} \ + ${par_read1:+-1 "$par_read1"} \ + ${par_read2:+-2 "$par_read2"} \ + ${par_output_reads:+-o "$par_output_reads"} \ + ${par_output_reads_both:+-0 "$par_output_reads_both"} \ + ${par_filter_flags:+-f "$par_filter_flags"} \ + ${par_excl_flags:+-F "$par_excl_flags"} \ + ${par_incl_flags:+--rf "$par_incl_flags"} \ + ${par_excl_flags_all:+-G 
"$par_excl_flags_all"} \ + ${par_aux_tag:+-d "$par_aux_tag"} \ + ${par_aux_tag_file:+-D "$par_aux_tag_file"} \ + ${par_casava:+-i} \ + ${par_compression:+-c "$par_compression"} \ + ${par_index1:+--i1 "$par_index1"} \ + ${par_index2:+--i2 "$par_index2"} \ + ${par_barcode_tag:+--barcode-tag "$par_barcode_tag"} \ + ${par_quality_tag:+--quality-tag "$par_quality_tag"} \ + ${par_index_format:+--index-format "$par_index_format"} \ + "$par_input" \ + > "$par_output" + diff --git a/src/samtools/samtools_fastq/test.sh b/src/samtools/samtools_fastq/test.sh new file mode 100644 index 00000000..32ee3f5e --- /dev/null +++ b/src/samtools/samtools_fastq/test.sh @@ -0,0 +1,96 @@ +#!/bin/bash + +test_dir="${meta_resources_dir}/test_data" +out_dir="${meta_resources_dir}/out_data" + +############################################################################################ + +echo ">>> Test 1: Convert all reads from a bam file to fastq format" +"$meta_executable" \ + --input "$test_dir/a.bam" \ + --output "$out_dir/a.fq" + +echo ">>> Check if output file exists" +[ ! -f "$out_dir/a.fq" ] && echo "Output file a.fq does not exist" && exit 1 + +echo ">>> Check if output is empty" +[ ! -s "$out_dir/a.fq" ] && echo "Output file a.fq is empty" && exit 1 + +echo ">>> Check if output matches expected output" +diff "$out_dir/a.fq" "$test_dir/a.fq" || + (echo "Output file a.fq does not match expected output" && exit 1) + +rm "$out_dir/a.fq" + +############################################################################################ + +echo ">>> Test 2: Convert all reads from a sam file to fastq format" +"$meta_executable" \ + --input "$test_dir/a.sam" \ + --output "$out_dir/a.fq" + +echo ">>> Check if output file exists" +[ ! -f "$out_dir/a.fq" ] && echo "Output file a.fq does not exist" && exit 1 + +echo ">>> Check if output is empty" +[ ! 
-s "$out_dir/a.fq" ] && echo "Output file a.fq is empty" && exit 1 + +echo ">>> Check if output matches expected output" +diff "$out_dir/a.fq" "$test_dir/a.fq" || + { echo "Output file a.fq does not match expected output"; exit 1; } + +rm "$out_dir/a.fq" + +############################################################################################ + +echo ">>> Test 3: Output reads from bam file to separate files" + +"$meta_executable" \ + --input "$test_dir/a.bam" \ + --read1 "$out_dir/a.1.fq" \ + --read2 "$out_dir/a.2.fq" \ + --output "$out_dir/a.fq" + +echo ">>> Check if output files exist" +[ ! -f "$out_dir/a.1.fq" ] && echo "Output file a.1.fq does not exist" && exit 1 +[ ! -f "$out_dir/a.2.fq" ] && echo "Output file a.2.fq does not exist" && exit 1 +[ ! -f "$out_dir/a.fq" ] && echo "Output file a.fq does not exist" && exit 1 + +echo ">>> Check if output files are empty" +[ ! -s "$out_dir/a.1.fq" ] && echo "Output file a.1.fq is empty" && exit 1 +[ ! -s "$out_dir/a.2.fq" ] && echo "Output file a.2.fq is empty" && exit 1 +# output should be empty since input has no singleton reads + +echo ">>> Check if output files match expected output" +diff "$out_dir/a.1.fq" "$test_dir/a.1.fq" || + { echo "Output file a.1.fq does not match expected output"; exit 1; } +diff "$out_dir/a.2.fq" "$test_dir/a.2.fq" || + { echo "Output file a.2.fq does not match expected output"; exit 1; } + +rm "$out_dir/a.1.fq" "$out_dir/a.2.fq" "$out_dir/a.fq" + +############################################################################################ + +echo ">>> Test 4: Output only forward reads from sam file to fastq format" + +"$meta_executable" \ + --input "$test_dir/a.sam" \ + --excl_flags "0x80" \ + --output "$out_dir/half.fq" + +echo ">>> Check if output file exists" +[ ! -f "$out_dir/half.fq" ] && echo "Output file half.fq does not exist" && exit 1 + +echo ">>> Check if output is empty" +[ !
-s "$out_dir/half.fq" ] && echo "Output file half.fq is empty" && exit 1 + +echo ">>> Check if output matches expected output" +diff "$out_dir/half.fq" "$test_dir/half.fq" || + { echo "Output file half.fq does not match expected output"; exit 1; } + +rm "$out_dir/half.fq" + +############################################################################################ + +echo "All tests succeeded!" +exit 0 \ No newline at end of file diff --git a/src/samtools/samtools_fastq/test_data/a.1.fq b/src/samtools/samtools_fastq/test_data/a.1.fq new file mode 100644 index 00000000..03eaa725 --- /dev/null +++ b/src/samtools/samtools_fastq/test_data/a.1.fq @@ -0,0 +1,12 @@ +@a1 +AAAAAAAAAA ++ +********** +@b1 +AAAAAAAAAA ++ +********** +@c1 +AAAAAAAAAA ++ +********** diff --git a/src/samtools/samtools_fastq/test_data/a.2.fq b/src/samtools/samtools_fastq/test_data/a.2.fq new file mode 100644 index 00000000..03eaa725 --- /dev/null +++ b/src/samtools/samtools_fastq/test_data/a.2.fq @@ -0,0 +1,12 @@ +@a1 +AAAAAAAAAA ++ +********** +@b1 +AAAAAAAAAA ++ +********** +@c1 +AAAAAAAAAA ++ +********** diff --git a/src/samtools/samtools_fastq/test_data/a.bam b/src/samtools/samtools_fastq/test_data/a.bam new file mode 100644 index 00000000..dba1268a Binary files /dev/null and b/src/samtools/samtools_fastq/test_data/a.bam differ diff --git a/src/samtools/samtools_fastq/test_data/a.fq b/src/samtools/samtools_fastq/test_data/a.fq new file mode 100644 index 00000000..d12c62ca --- /dev/null +++ b/src/samtools/samtools_fastq/test_data/a.fq @@ -0,0 +1,24 @@ +@a1/1 +AAAAAAAAAA ++ +********** +@b1/1 +AAAAAAAAAA ++ +********** +@c1/1 +AAAAAAAAAA ++ +********** +@a1/2 +AAAAAAAAAA ++ +********** +@b1/2 +AAAAAAAAAA ++ +********** +@c1/2 +AAAAAAAAAA ++ +********** diff --git a/src/samtools/samtools_fastq/test_data/a.sam b/src/samtools/samtools_fastq/test_data/a.sam new file mode 100644 index 00000000..aa8c77b3 --- /dev/null +++ b/src/samtools/samtools_fastq/test_data/a.sam @@ -0,0 +1,7 @@ +@SQ SN:xx LN:20
+a1 99 xx 1 1 10M = 11 20 AAAAAAAAAA ********** +b1 99 xx 1 1 10M = 11 20 AAAAAAAAAA ********** +c1 99 xx 1 1 10M = 11 20 AAAAAAAAAA ********** +a1 147 xx 11 1 10M = 1 -20 TTTTTTTTTT ********** +b1 147 xx 11 1 10M = 1 -20 TTTTTTTTTT ********** +c1 147 xx 11 1 10M = 1 -20 TTTTTTTTTT ********** diff --git a/src/samtools/samtools_fastq/test_data/half.fq b/src/samtools/samtools_fastq/test_data/half.fq new file mode 100644 index 00000000..85a2b1c4 --- /dev/null +++ b/src/samtools/samtools_fastq/test_data/half.fq @@ -0,0 +1,12 @@ +@a1/1 +AAAAAAAAAA ++ +********** +@b1/1 +AAAAAAAAAA ++ +********** +@c1/1 +AAAAAAAAAA ++ +********** diff --git a/src/samtools/samtools_fastq/test_data/script.sh b/src/samtools/samtools_fastq/test_data/script.sh new file mode 100755 index 00000000..b59bc1bd --- /dev/null +++ b/src/samtools/samtools_fastq/test_data/script.sh @@ -0,0 +1,11 @@ +#!/bin/bash + +# dowload test data from snakemake wrapper +if [ ! -d /tmp/fastq_source ]; then + git clone --depth 1 --single-branch --branch master https://github.com/snakemake/snakemake-wrappers.git /tmp/fastq_source +fi + +cp -r /tmp/fastq_source/bio/samtools/fastx/test/*.sam src/samtools/samtools_fastq/test_data/ +cp -r /tmp/fastq_source/bio/samtools/fastq/interleaved/test/mapped/*.bam src/samtools/samtools_fastq/test_data/ +cp -r /tmp/fastq_source/bio/samtools/fastq/interleaved/test/reads/*.fq src/samtools/samtools_fastq/test_data/ +cp -r /tmp/fastq_source/bio/samtools/fastq/separate/test/reads/*.fq src/samtools/samtools_fastq/test_data/ \ No newline at end of file diff --git a/src/samtools/samtools_idxstats/config.vsh.yaml b/src/samtools/samtools_idxstats/config.vsh.yaml index d5e32077..30f21348 100644 --- a/src/samtools/samtools_idxstats/config.vsh.yaml +++ b/src/samtools/samtools_idxstats/config.vsh.yaml @@ -7,7 +7,7 @@ links: documentation: https://www.htslib.org/doc/samtools-idxstats.html repository: https://github.com/samtools/samtools references: - doi: 10.1093/bioinformatics/btp352, 
10.1093/gigascience/giab008 + doi: [10.1093/bioinformatics/btp352, 10.1093/gigascience/giab008] license: MIT/Expat argument_groups: diff --git a/src/samtools/samtools_sort/config.vsh.yaml b/src/samtools/samtools_sort/config.vsh.yaml index 7cd9ec48..a78800da 100644 --- a/src/samtools/samtools_sort/config.vsh.yaml +++ b/src/samtools/samtools_sort/config.vsh.yaml @@ -4,7 +4,7 @@ description: Sort SAM/BAM/CRAM file. keywords: [sort, bam, sam, cram] links: homepage: https://www.htslib.org/ - documentation: https://www.htslib.org/doc/samtools-idxstats.html + documentation: https://www.htslib.org/doc/samtools-sort.html repository: https://github.com/samtools/samtools references: doi: [10.1093/bioinformatics/btp352, 10.1093/gigascience/giab008] diff --git a/src/samtools/samtools_stats/config.vsh.yaml b/src/samtools/samtools_stats/config.vsh.yaml index 554a11e8..0d8f57a4 100644 --- a/src/samtools/samtools_stats/config.vsh.yaml +++ b/src/samtools/samtools_stats/config.vsh.yaml @@ -4,10 +4,10 @@ description: Reports alignment summary statistics for a BAM file. keywords: [statistics, counts, bam, sam, cram] links: homepage: https://www.htslib.org/ - documentation: https://www.htslib.org/doc/samtools-idxstats.html + documentation: https://www.htslib.org/doc/samtools-stats.html repository: https://github.com/samtools/samtools references: - doi: 10.1093/bioinformatics/btp352, 10.1093/gigascience/giab008 + doi: [10.1093/bioinformatics/btp352, 10.1093/gigascience/giab008] license: MIT/Expat argument_groups: diff --git a/src/samtools/samtools_view/config.vsh.yaml b/src/samtools/samtools_view/config.vsh.yaml new file mode 100644 index 00000000..206b87ac --- /dev/null +++ b/src/samtools/samtools_view/config.vsh.yaml @@ -0,0 +1,351 @@ +name: samtools_view +namespace: samtools +description: Views and converts SAM/BAM/CRAM files. 
+keywords: [view, convert, bam, sam, cram] +links: + homepage: https://www.htslib.org/ + documentation: https://www.htslib.org/doc/samtools-view.html + repository: https://github.com/samtools/samtools +references: + doi: [10.1093/bioinformatics/btp352, 10.1093/gigascience/giab008] +license: MIT/Expat + +argument_groups: + - name: Inputs + arguments: + - name: --input + type: file + description: Input SAM, BAM, or CRAM file. + required: true + must_exist: true + - name: --fai_reference + alternatives: -t + type: file + description: | + A tab-delimited FILE. Each line must contain the reference name in the first column + and the length of the reference in the second column, with one line for each distinct + reference. Any additional fields beyond the second column are ignored. This file also + defines the order of the reference sequences in sorting. If you run: `samtools faidx <ref.fa>`, + the resulting index file <ref.fa>.fai can be used as this FILE. + - name: --reference + alternatives: -T + type: file + description: | + A FASTA format reference FILE, optionally compressed by bgzip and ideally indexed by samtools faidx. + If an index is not present one will be generated for you, if the reference file is local. + If the reference file is not local, but is accessed instead via an https://, s3:// or other URL, + the index file will need to be supplied by the server alongside the reference. It is possible to + have the reference and index files in different locations by supplying both to this option separated + by the string "##idx##", for example: + --reference ftp://x.com/ref.fa##idx##ftp://y.com/index.fa.fai + However, note that only the location of the reference will be stored in the output file header. + If this method is used to make CRAM files, the cram reader may not be able to find the index, + and may not be able to decode the file unless it can get the references it needs using a different + method.
+ - name: --target_file + alternatives: -L + type: file + description: | + Only output alignments overlapping the input BED FILE [null]. + - name: --region_file + type: file + description: | + Use an index and multi-region iterator to only output alignments overlapping the input BED FILE. + Equivalent to --use_index --target_file FILE. + - name: --qname_file + alternatives: -N + type: file + description: | + Output only alignments with read names listed in FILE. If FILE starts with ^ then the operation is + negated and only outputs alignment with read groups not listed in FILE. It is not permissible to mix + both the filter-in and filter-out style syntax in the same command. + must_exist: true + - name: --read_group_file + alternatives: -R + type: file + description: | + Output alignments in read groups listed in FILE [null]. If FILE starts with ^ then the operation is + negated and only outputs alignment with read names not listed in FILE. It is not permissible to mix + both the filter-in and filter-out style syntax in the same command. Note that records with no RG tag + will also be output when using this option. This behaviour may change in a future release. + must_exist: true + - name: --use_index + alternatives: -M + type: boolean_true + description: | + Use the multi-region iterator on the union of a BED file and command-line region arguments. + This avoids re-reading the same regions of files so can sometimes be much faster. Note this also + removes duplicate sequences. Without this a sequence that overlaps multiple regions specified on + the command line will be reported multiple times. The usage of a BED file is optional and its path + has to be preceded by --target_file option. + + - name: Outputs + arguments: + - name: --output + alternatives: -o + type: file + description: Output to FILE instead of [stdout]. 
+ required: true + direction: output + example: output.bam + - name: --bam + alternatives: -b + type: boolean_true + description: Output in the BAM format. + - name: --cram + alternatives: -C + type: boolean_true + description: | + Output in the CRAM format (requires --reference). + - name: --fast + type: boolean_true + description: | + Enable fast compression. This also changes the default output format to BAM, + but this can be overridden by the explicit format options or using a filename + with a known suffix. + - name: --uncompressed + alternatives: -u + type: boolean_true + description: | + Output uncompressed data. This also changes the default output format to BAM, + but this can be overridden by the explicit format options or using a filename + with a known suffix. + This option saves time spent on compression/decompression and is thus preferred + when the output is piped to another samtools command. + - name: --with_header + type: boolean_true + description: | + Include the header in the output. + - name: --header_only + alternatives: -H + type: boolean_true + description: | + Output the header only. + - name: --no_header + type: boolean_true + description: | + When producing SAM format, output alignment records but not headers. + This is the default; the option can be used to reset the effect of + --with_header/--header_only. + - name: --count + alternatives: -c + type: boolean_true + description: | + Instead of printing the alignments, only count them and print the total number. + All filter options, such as --require_flags, --excl_flags, and --min_MQ, are taken + into account. The --unmap option is ignored in this mode. + - name: --output_unselected + alternatives: -U + type: file + description: | + Write alignments that are not selected by the various filter options to FILE. + When this option is used, all alignments (or all alignments intersecting the regions + specified) are written to either the output file or this file, but never both. 
+ - name: --unmap + alternatives: -p + type: boolean_true + description: | + Set the UNMAP flag on alignments that are not selected by the filter options. + These alignments are then written to the normal output. This is not compatible + with --output_unselected. + - name: --read_group + alternatives: -r + type: string + description: | + Output alignments in read group STR [null]. Note that records with no RG tag will also be output + when using this option. This behaviour may change in a future release. + - name: --tag + alternatives: -d + type: string + description: | + Only output alignments with tag STR1 and associated value STR2, which can be a string or an integer + [null]. + The value can be omitted, in which case only the tag is considered. + Note that this option does not specify a tag type. For example, use --tag XX:42 to select alignments + with an XX:i:42 field, not --tag XX:i:42. + - name: --tag_file + alternatives: -D + type: file + description: | + Only output alignments with tag STR and associated values listed in FILE. + must_exist: true + - name: --min_MQ + alternatives: -q + type: integer + description: | + Skip alignments with MAPQ smaller than INT. + default: 0 + - name: --library + alternatives: -l + type: string + description: | + Only output alignments in library STR. + - name: --min_qlen + alternatives: -m + type: integer + description: | + Only output alignments with number of CIGAR bases consuming query sequence >= INT. + default: 0 + - name: --expr + alternatives: -e + type: string + description: | + Only include alignments that match the filter expression STR. The syntax for these expressions is + described in the main samtools. + - name: --require_flags + alternatives: -f + type: string + description: | + Only output alignments with all bits set in FLAG present in the FLAG field. FLAG can be specified + in hex by beginning with `0x' (i.e. /^0x[0-9A-F]+/), in octal by beginning with `0' (i.e. 
/^0[0-7]+/), + as a decimal number not beginning with '0' or as a comma-separated list of flag names. + - name: --excl_flags + alternatives: -F + type: string + description: | + Do not output alignments with any bits set in FLAG present in the FLAG field. FLAG can be specified + in hex by beginning with `0x' (i.e. /^0x[0-9A-F]+/), in octal by beginning with `0' (i.e. /^0[0-7]+/), + as a decimal number not beginning with '0' or as a comma-separated list of flag names. + - name: --excl_all_flags + alternatives: -G + type: string + description: | + Do not output alignments with all bits set in FLAG present in the FLAG field. This is the opposite of + --require_flags such that --require_flags 12 --excl_all_flags 12 is the same as no filtering at all. + FLAG can be specified in hex by beginning with `0x' (i.e. /^0x[0-9A-F]+/), in octal by beginning with `0' + (i.e. /^0[0-7]+/), as a decimal number not beginning with '0' or as a comma-separated list of flag names. + - name: --incl_flags + alternatives: --rf + type: string + description: | + Only output alignments with any bit set in FLAG present in the FLAG field. FLAG can be specified in hex + by beginning with `0x' (i.e. /^0x[0-9A-F]+/), in octal by beginning with `0' (i.e. /^0[0-7]+/), as a decimal + number not beginning with '0' or as a comma-separated list of flag names. + - name: --remove_tag + alternatives: -x + type: string + description: | + Read tag(s) to exclude from output (repeatable) [null]. This can be a single tag or a comma separated list. + Alternatively the option itself can be repeated multiple times. + If the list starts with a `^' then it is negated and treated as a request to remove all tags except those in STR. + The list may be empty, so --remove_tag ^ will remove all tags. + Note that tags will only be removed from reads that pass filtering. + - name: --keep_tag + type: string + description: | + This keeps only tags listed in STR and is directly equivalent to --remove_tag ^STR.
Specifying an empty list + will remove all tags. If both --keep_tag and --remove_tag are specified then --keep_tag has precedence. + Note that tags will only be removed from reads that pass filtering. + - name: --remove_B + alternatives: -B + type: boolean_true + description: | + Collapse the backward CIGAR operation. + - name: --add_flags + type: string + description: | + Adds flag(s) to read. FLAG can be specified in hex by beginning with `0x' (i.e. /^0x[0-9A-F]+/), in octal + by beginning with `0' (i.e. /^0[0-7]+/), as a decimal number not beginning with '0' or as a comma-separated + list of flag names. + - name: --remove_flags + type: string + description: | + Remove flag(s) from read. FLAG is specified in the same way as with the --add_flags option. + - name: --subsample + type: double + description: | + Output only a proportion of the input alignments, as specified by 0.0 <= FLOAT <= 1.0, which gives the fraction + of templates/pairs to be kept. This subsampling acts in the same way on all of the alignment records in the same + template or read pair, so it never keeps a read but not its mate. + - name: --subsample_seed + type: integer + description: | + Subsampling seed used to influence which subset of reads is kept. When subsampling data that has previously + been subsampled, be sure to use a different seed value from those used previously; otherwise more reads will + be retained than expected. + default: 0 + - name: --fetch_pairs + alternatives: -P + type: boolean_true + description: | + Retrieve pairs even when the mate is outside of the requested region. Enabling this option also turns on the + multi-region iterator (-M). A region to search must be specified, either on the command-line, or using the + --target_file option. The input file must be an indexed regular file. 
+ This option first scans the requested region, using the RNEXT and PNEXT fields of the records that have the + PAIRED flag set and pass other filtering options to find where paired reads are located. These locations are + used to build an expanded region list, and a set of QNAMEs to allow from the new regions. It will then make + a second pass, collecting all reads from the originally-specified region list together with reads from additional + locations that match the allowed set of QNAMEs. Any other filtering options used will be applied to all reads + found during this second pass. + As this option links reads using RNEXT and PNEXT, it is important that these fields are set accurately. Use + 'samtools fixmate' to correct them if necessary. + Note that this option does not work with the --count, --output-unselected or --unmap options. + - name: --customized_index + alternatives: -X + type: boolean_true + description: | + Include customized index file as a part of arguments. See EXAMPLES section for sample of usage. + - name: --sanitize + alternatives: -z + type: string + description: | + Perform some sanity checks on the state of SAM record fields, fixing up common mistakes made by aligners. + These include soft-clipping alignments when they extend beyond the end of the reference, marking records as + unmapped when they have reference * or position 0, and ensuring unmapped alignments have no CIGAR or mapping + quality for unmapped alignments and no MD, NM, CG or SM tags. + FLAGs is a comma-separated list of keywords chosen from the following list. + + unmap: The UNMAPPED BAM flag. This is set for reads with position <= 0, reference name "*" or reads starting + beyond the end of the reference. Note CIGAR "*" is permitted for mapped data so does not trigger this. + + pos: Position and reference name fields. These may be cleared when a sequence is unmapped due to the + coordinates being beyond the end of the reference. 
Selecting this may change the sort order of the file, + so it is not a part of the on compound argument. + mqual: Mapping quality. This is set to zero for unmapped reads. + cigar: Modifies CIGAR fields, either by adding soft-clips for reads that overlap the end of the reference or + by clearing it for unmapped reads. + aux: For unmapped data, some auxiliary fields are meaningless and will be removed. These include NM, MD, CG and SM. + off: Perform no sanity fixing. This is the default + on: Sanitize data in a way that guarantees the same sort order. This is everything except for pos. + all: All sanitizing options, including pos. + - name: --no_PG + type: boolean_true + description: | + Do not add a @PG line to the header of the output file. + - name: --input_fmt_option + type: string + description: | + Specify a single input file format option in the form of OPTION or OPTION=VALUE. + - name: --output_fmt + alternatives: -O + type: string + description: | + Specify output format (SAM, BAM, CRAM). + - name: --output_fmt_option + type: string + description: | + Specify a single output file format option in the form of OPTION or OPTION=VALUE. + - name: --write_index + type: boolean_true + description: | + Automatically index the output files. 
+ +resources: + - type: bash_script + path: script.sh +test_resources: + - type: bash_script + path: test.sh + - type: file + path: test_data +engines: + - type: docker + image: quay.io/biocontainers/samtools:1.19.2--h50ea8bc_1 + setup: + - type: docker + run: | + samtools --version 2>&1 | grep -E '^(samtools|Using htslib)' | \ + sed 's#Using ##;s# \([0-9\.]*\)$#: \1#' > /var/software_versions.txt +runners: +- type: executable +- type: nextflow \ No newline at end of file diff --git a/src/samtools/samtools_view/help.txt b/src/samtools/samtools_view/help.txt new file mode 100644 index 00000000..753b1bc6 --- /dev/null +++ b/src/samtools/samtools_view/help.txt @@ -0,0 +1,80 @@ +``` +samtools view +``` + +Usage: samtools view [options] || [region ...] + +Output options: + -b, --bam Output BAM + -C, --cram Output CRAM (requires -T) + -1, --fast Use fast BAM compression (and default to --bam) + -u, --uncompressed Uncompressed BAM output (and default to --bam) + -h, --with-header Include header in SAM output + -H, --header-only Print SAM header only (no alignments) + --no-header Print SAM alignment records only [default] + -c, --count Print only the count of matching records + -o, --output FILE Write output to FILE [standard output] + -U, --unoutput FILE, --output-unselected FILE + Output reads not selected by filters to FILE + -p, --unmap Set flag to UNMAP on reads not selected + then write to output file. 
+ -P, --fetch-pairs Retrieve complete pairs even when outside of region +Input options: + -t, --fai-reference FILE FILE listing reference names and lengths + -M, --use-index Use index and multi-region iterator for regions + --region[s]-file FILE Use index to include only reads overlapping FILE + -X, --customized-index Expect extra index file argument after + +Filtering options (Only include in output reads that...): + -L, --target[s]-file FILE ...overlap (BED) regions in FILE + -N, --qname-file [^]FILE ...whose read name is listed in FILE ("^" negates) + -r, --read-group STR ...are in read group STR + -R, --read-group-file [^]FILE + ...are in a read group listed in FILE + -d, --tag STR1[:STR2] ...have a tag STR1 (with associated value STR2) + -D, --tag-file STR:FILE ...have a tag STR whose value is listed in FILE + -q, --min-MQ INT ...have mapping quality >= INT + -l, --library STR ...are in library STR + -m, --min-qlen INT ...cover >= INT query bases (as measured via CIGAR) + -e, --expr STR ...match the filter expression STR + -f, --require-flags FLAG ...have all of the FLAGs present + -F, --excl[ude]-flags FLAG ...have none of the FLAGs present + --rf, --incl-flags, --include-flags FLAG + ...have some of the FLAGs present + -G FLAG EXCLUDE reads with all of the FLAGs present + --subsample FLOAT Keep only FLOAT fraction of templates/read pairs + --subsample-seed INT Influence WHICH reads are kept in subsampling [0] + -s INT.FRAC Same as --subsample 0.FRAC --subsample-seed INT + +Processing options: + --add-flags FLAG Add FLAGs to reads + --remove-flags FLAG Remove FLAGs from reads + -x, --remove-tag STR + Comma-separated read tags to strip (repeatable) [null] + --keep-tag STR + Comma-separated read tags to preserve (repeatable) [null]. + Equivalent to "-x ^STR" + -B, --remove-B Collapse the backward CIGAR operation + -z, --sanitize FLAGS Perform sanitity checking and fixing on records. + FLAGS is comma separated (see manual). 
[off] + +General options: + -?, --help Print long help, including note about region specification + -S Ignored (input format is auto-detected) + --no-PG Do not add a PG line + --input-fmt-option OPT[=VAL] + Specify a single input file format option in the form + of OPTION or OPTION=VALUE + -O, --output-fmt FORMAT[,OPT[=VAL]]... + Specify output format (SAM, BAM, CRAM) + --output-fmt-option OPT[=VAL] + Specify a single output file format option in the form + of OPTION or OPTION=VALUE + -T, --reference FILE + Reference sequence FASTA FILE [null] + -@, --threads INT + Number of additional threads to use [0] + --write-index + Automatically index the output files [off] + --verbosity INT + Set level of verbosity diff --git a/src/samtools/samtools_view/script.sh b/src/samtools/samtools_view/script.sh new file mode 100644 index 00000000..c3911b48 --- /dev/null +++ b/src/samtools/samtools_view/script.sh @@ -0,0 +1,71 @@ +#!/bin/bash + +## VIASH START +## VIASH END + +set -e + +[[ "$par_bam" == "false" ]] && unset par_bam +[[ "$par_cram" == "false" ]] && unset par_cram +[[ "$par_fast" == "false" ]] && unset par_fast +[[ "$par_uncompressed" == "false" ]] && unset par_uncompressed +[[ "$par_with_header" == "false" ]] && unset par_with_header +[[ "$par_header_only" == "false" ]] && unset par_header_only +[[ "$par_no_header" == "false" ]] && unset par_no_header +[[ "$par_count" == "false" ]] && unset par_count +[[ "$par_unmap" == "false" ]] && unset par_unmap +[[ "$par_use_index" == "false" ]] && unset par_use_index +[[ "$par_fetch_pairs" == "false" ]] && unset par_fetch_pairs +[[ "$par_customized_index" == "false" ]] && unset par_customized_index +[[ "$par_no_PG" == "false" ]] && unset par_no_PG +[[ "$par_write_index" == "false" ]] && unset par_write_index +[[ "$par_remove_B" == "false" ]] && unset par_remove_B + +samtools view \ + ${par_bam:+-b} \ + ${par_cram:+-C} \ + ${par_fast:+--fast} \ + ${par_uncompressed:+-u} \ + ${par_with_header:+--with-header} \ + 
${par_header_only:+-H} \ + ${par_no_header:+--no-header} \ + ${par_count:+-c} \ + ${par_output:+-o "$par_output"} \ + ${par_output_unselected:+-U "$par_output_unselected"} \ + ${par_unmap:+-p} \ + ${par_fetch_pairs:+-P} \ + ${par_fai_reference:+-t "$par_fai_reference"} \ + ${par_use_index:+-M} \ + ${par_region_file:+--region-file "$par_region_file"} \ + ${par_customized_index:+-X} \ + ${par_target_file:+-L "$par_target_file"} \ + ${par_qname_file:+-N "$par_qname_file"} \ + ${par_read_group:+-r "$par_read_group"} \ + ${par_read_group_file:+-R "$par_read_group_file"} \ + ${par_tag:+-d "$par_tag"} \ + ${par_tag_file:+-D "$par_tag_file"} \ + ${par_min_MQ:+-q "$par_min_MQ"} \ + ${par_library:+-l "$par_library"} \ + ${par_min_qlen:+-m "$par_min_qlen"} \ + ${par_expr:+-e "$par_expr"} \ + ${par_require_flags:+-f "$par_require_flags"} \ + ${par_excl_flags:+-F "$par_excl_flags"} \ + ${par_incl_flags:+--rf "$par_incl_flags"} \ + ${par_excl_all_flags:+-G "$par_excl_all_flags"} \ + ${par_subsample:+--subsample "$par_subsample"} \ + ${par_subsample_seed:+--subsample-seed "$par_subsample_seed"} \ + ${par_add_flags:+--add-flags "$par_add_flags"} \ + ${par_remove_flags:+--remove-flags "$par_remove_flags"} \ + ${par_remove_tag:+-x "$par_remove_tag"} \ + ${par_keep_tag:+--keep-tag "$par_keep_tag"} \ + ${par_remove_B:+-B} \ + ${par_sanitize:+-z "$par_sanitize"} \ + ${par_input_fmt_option:+--input-fmt-option "$par_input_fmt_option"} \ + ${par_output_fmt:+-O "$par_output_fmt"} \ + ${par_output_fmt_option:+--output-fmt-option "$par_output_fmt_option"} \ + ${par_reference:+-T "$par_reference"} \ + ${par_write_index:+--write-index} \ + ${par_no_PG:+--no-PG} \ + "$par_input" + +exit 0 diff --git a/src/samtools/samtools_view/test.sh b/src/samtools/samtools_view/test.sh new file mode 100644 index 00000000..1de29a7c --- /dev/null +++ b/src/samtools/samtools_view/test.sh @@ -0,0 +1,87 @@ +#!/bin/bash + +test_dir="${meta_resources_dir}/test_data"
+temp_dir="${meta_resources_dir}/out" + +############################################################################################ + +echo ">>> Test 1: Import SAM to BAM when @SQ lines are present in the header" +"$meta_executable" \ + --bam \ + --output "$temp_dir/a.bam" \ + --input "$test_dir/a.sam" + +echo ">>> Checking whether output exists" +[ ! -f "$temp_dir/a.bam" ] && echo "File 'a.bam' does not exist!" && exit 1 + +echo ">>> Checking whether output is non-empty" +[ ! -s "$temp_dir/a.bam" ] && echo "File 'a.bam' is empty!" && exit 1 + +echo ">>> Checking whether output is correct" +# compare output of "samtools view" for both files +diff <(samtools view "$temp_dir/a.bam") <(samtools view "$test_dir/a.bam") || \ + { echo "Output file a.bam does not match expected output"; exit 1; } + +############################################################################################ + +echo ">>> Test 2: ${meta_functionality_name} with CRAM format output" + +"$meta_executable" \ + --cram \ + --output "$temp_dir/a.cram" \ + --input "$test_dir/a.sam" + +echo ">>> Checking whether output exists" +[ ! -f "$temp_dir/a.cram" ] && echo "File 'a.cram' does not exist!" && exit 1 + +echo ">>> Checking whether output is non-empty" +[ ! -s "$temp_dir/a.cram" ] && echo "File 'a.cram' is empty!" && exit 1 + +echo ">>> Checking whether output is correct" +# compare output of "samtools view" for both files +diff <(samtools view "$temp_dir/a.cram") <(samtools view "$test_dir/a.cram") || \ + { echo "Output file a.cram does not match expected output"; exit 1; } + +############################################################################################ + +echo ">>> Test 3: ${meta_functionality_name} with --count option" + +"$meta_executable" \ + --count \ + --output "$temp_dir/a.count" \ + --input "$test_dir/a.sam" + +echo ">>> Checking whether output exists" +[ ! -f "$temp_dir/a.count" ] && echo "File 'a.count' does not exist!"
&& exit 1 + +echo ">>> Checking whether output is non-empty" +[ ! -s "$temp_dir/a.count" ] && echo "File 'a.count' is empty!" && exit 1 + +echo ">>> Checking whether output is correct" +diff "$temp_dir/a.count" "$test_dir/a.count" || \ + { echo "Output file a.count does not match expected output"; exit 1; } + +############################################################################################ + +echo ">>> Test 4: ${meta_functionality_name} including only the forward reads from read pairs" + +"$meta_executable" \ + --output "$temp_dir/a.forward" \ + --excl_flags "0x80" \ + --input "$test_dir/a.sam" + +echo ">>> Checking whether output exists" +[ ! -f "$temp_dir/a.forward" ] && echo "File 'a.forward' does not exist!" && exit 1 + +echo ">>> Checking whether output is non-empty" +[ ! -s "$temp_dir/a.forward" ] && echo "File 'a.forward' is empty!" && exit 1 + +echo ">>> Checking whether output is correct" +diff "$temp_dir/a.forward" "$test_dir/a.forward" || \ + { echo "Output file a.forward does not match expected output"; exit 1; } + +############################################################################################ + +echo ">>> All test passed successfully" +rm -rf "${temp_dir}" +exit 0 \ No newline at end of file diff --git a/src/samtools/samtools_view/test_data/a.bam b/src/samtools/samtools_view/test_data/a.bam new file mode 100644 index 00000000..95b85b72 Binary files /dev/null and b/src/samtools/samtools_view/test_data/a.bam differ diff --git a/src/samtools/samtools_view/test_data/a.count b/src/samtools/samtools_view/test_data/a.count new file mode 100644 index 00000000..1e8b3149 --- /dev/null +++ b/src/samtools/samtools_view/test_data/a.count @@ -0,0 +1 @@ +6 diff --git a/src/samtools/samtools_view/test_data/a.cram b/src/samtools/samtools_view/test_data/a.cram new file mode 100644 index 00000000..57fb3269 Binary files /dev/null and b/src/samtools/samtools_view/test_data/a.cram differ diff --git a/src/samtools/samtools_view/test_data/a.forward
b/src/samtools/samtools_view/test_data/a.forward new file mode 100644 index 00000000..766d4f20 --- /dev/null +++ b/src/samtools/samtools_view/test_data/a.forward @@ -0,0 +1,3 @@ +a1 99 xx 1 1 10M = 11 20 AAAAAAAAAA ********** +b1 99 xx 1 1 10M = 11 20 AAAAAAAAAA ********** +c1 99 xx 1 1 10M = 11 20 AAAAAAAAAA ********** diff --git a/src/samtools/samtools_view/test_data/a.sam b/src/samtools/samtools_view/test_data/a.sam new file mode 100644 index 00000000..aa8c77b3 --- /dev/null +++ b/src/samtools/samtools_view/test_data/a.sam @@ -0,0 +1,7 @@ +@SQ SN:xx LN:20 +a1 99 xx 1 1 10M = 11 20 AAAAAAAAAA ********** +b1 99 xx 1 1 10M = 11 20 AAAAAAAAAA ********** +c1 99 xx 1 1 10M = 11 20 AAAAAAAAAA ********** +a1 147 xx 11 1 10M = 1 -20 TTTTTTTTTT ********** +b1 147 xx 11 1 10M = 1 -20 TTTTTTTTTT ********** +c1 147 xx 11 1 10M = 1 -20 TTTTTTTTTT ********** diff --git a/src/samtools/samtools_view/test_data/script.sh b/src/samtools/samtools_view/test_data/script.sh new file mode 100755 index 00000000..90918e44 --- /dev/null +++ b/src/samtools/samtools_view/test_data/script.sh @@ -0,0 +1,8 @@ +#!/bin/bash + +# download test data from snakemake wrapper +if [ ! -d /tmp/view_source ]; then + git clone --depth 1 --single-branch --branch master https://github.com/snakemake/snakemake-wrappers.git /tmp/view_source +fi + +cp -r /tmp/view_source/bio/samtools/view/test/*.sam src/samtools/samtools_view/test_data \ No newline at end of file