From d3b20532f669b09319bc62e551febfa0b1111a8f Mon Sep 17 00:00:00 2001 From: emmarousseau Date: Sat, 13 Apr 2024 08:28:56 +0100 Subject: [PATCH] Samtools sort (#36) * Initial version of samtools sort, no tests * Add tests, final touches * Update changelog * Update src/samtools/samtools_sort/config.vsh.yaml Remove "must_exist: false" since that is the default value Co-authored-by: Robrecht Cannoodt * Clean up test script, update changelog * Minor changes, paths, config and script --------- Co-authored-by: Robrecht Cannoodt --- CHANGELOG.md | 1 + src/samtools/samtools_sort/config.vsh.yaml | 149 ++++++++++++++++++ src/samtools/samtools_sort/help.txt | 40 +++++ src/samtools/samtools_sort/script.sh | 43 +++++ src/samtools/samtools_sort/test.sh | 79 ++++++++++ src/samtools/samtools_sort/test_data/a.bam | Bin 0 -> 184 bytes .../test_data/output/a_ref.sorted.bam | Bin 0 -> 301 bytes .../test_data/output/ascii_ref.sorted.bam | Bin 0 -> 325 bytes .../output/compressed_ref.sorted.bam | Bin 0 -> 312 bytes .../samtools_sort/test_data/script.sh | 8 + .../test_data/text/a_ref.sorted.txt | 6 + .../test_data/text/ascii_ref.sorted.txt | 6 + .../test_data/text/compressed_ref.sorted.txt | 6 + 13 files changed, 338 insertions(+) create mode 100644 src/samtools/samtools_sort/config.vsh.yaml create mode 100644 src/samtools/samtools_sort/help.txt create mode 100644 src/samtools/samtools_sort/script.sh create mode 100644 src/samtools/samtools_sort/test.sh create mode 100644 src/samtools/samtools_sort/test_data/a.bam create mode 100644 src/samtools/samtools_sort/test_data/output/a_ref.sorted.bam create mode 100644 src/samtools/samtools_sort/test_data/output/ascii_ref.sorted.bam create mode 100644 src/samtools/samtools_sort/test_data/output/compressed_ref.sorted.bam create mode 100755 src/samtools/samtools_sort/test_data/script.sh create mode 100644 src/samtools/samtools_sort/test_data/text/a_ref.sorted.txt create mode 100644 src/samtools/samtools_sort/test_data/text/ascii_ref.sorted.txt 
create mode 100644 src/samtools/samtools_sort/test_data/text/compressed_ref.sorted.txt diff --git a/CHANGELOG.md b/CHANGELOG.md index 43787a4b..ba7bf0e3 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -39,6 +39,7 @@ - `samtools/flagstat`: Counts the number of alignments in SAM/BAM/CRAM files for each FLAG type (PR #31). - `samtools/idxstats`: Reports alignment summary statistics for a SAM/BAM/CRAM file (PR #32). - `samtools/samtools_index`: Index SAM/BAM/CRAM files (PR #35). + - `samtools/samtools_sort`: Sort SAM/BAM/CRAM files (PR #36). - `samtools/samtools_stats`: Reports alignment summary statistics for a BAM file (PR #39). ## MAJOR CHANGES diff --git a/src/samtools/samtools_sort/config.vsh.yaml b/src/samtools/samtools_sort/config.vsh.yaml new file mode 100644 index 00000000..7cd9ec48 --- /dev/null +++ b/src/samtools/samtools_sort/config.vsh.yaml @@ -0,0 +1,149 @@ +name: samtools_sort +namespace: samtools +description: Sort SAM/BAM/CRAM file. +keywords: [sort, bam, sam, cram] +links: + homepage: https://www.htslib.org/ + documentation: https://www.htslib.org/doc/samtools-sort.html + repository: https://github.com/samtools/samtools +references: + doi: [10.1093/bioinformatics/btp352, 10.1093/gigascience/giab008] +license: MIT/Expat + +argument_groups: + - name: Inputs + arguments: + - name: --input + type: file + description: SAM/BAM/CRAM input file. + required: true + must_exist: true + - name: Outputs + arguments: + - name: --output + type: file + description: | + Write final output to file. + required: true + direction: output + example: out.bam + - name: --output_fmt + alternatives: -O + type: string + description: | + Specify output format (SAM, BAM, CRAM). + example: BAM + - name: --output_fmt_option + type: string + description: | + Specify a single output file format option in the form + of OPTION or OPTION=VALUE. + - name: --reference + type: file + description: | + Reference sequence FASTA FILE. 
+ example: ref.fa + - name: --write_index + type: boolean_true + description: | + Automatically index the output files. + - name: --prefix + alternatives: -T + type: string + description: | + Write temporary files to PREFIX.nnnn.bam. + - name: --no_PG + type: boolean_true + description: | + Do not add a PG line. + - name: --template_coordinate + type: boolean_true + description: | + Sort by template-coordinate. + - name: --input_fmt_option + type: string + description: | + Specify a single input file format option in the form + of OPTION or OPTION=VALUE. + - name: Options + arguments: + - name: --compression + alternatives: -l + type: integer + description: | + Set compression level, from 0 (uncompressed) to 9 (best). + default: 0 + - name: --uncompressed + alternatives: -u + type: boolean_true + description: | + Output uncompressed data (equivalent to --compression 0). + - name: --minimiser + alternatives: -M + type: boolean_true + description: | + Use minimiser for clustering unaligned/unplaced reads. + - name: --not_reverse + alternatives: -R + type: boolean_true + description: | + Do not use reverse strand (only compatible with --minimiser). + - name: --kmer_size + alternatives: -K + type: integer + description: | + Kmer size to use for minimiser. + example: 20 + - name: --order + alternatives: -I + type: file + description: | + Order minimisers by their position in FILE FASTA. + example: ref.fa + - name: --window + alternatives: -w + type: integer + description: | + Window size for minimiser indexing via --order ref.fa. + example: 100 + - name: --homopolymers + alternatives: -H + type: boolean_true + description: | + Squash homopolymers when computing minimiser. + - name: --natural_sort + alternatives: -n + type: boolean_true + description: | + Sort by read name (natural): cannot be used with samtools index. + - name: --ascii_sort + alternatives: -N + type: boolean_true + description: | + Sort by read name (ASCII): cannot be used with samtools index. 
+ - name: --tag + alternatives: -t + type: string + description: | + Sort by value of TAG. Uses position as secondary index + (or read name if --natural_sort is set). + +resources: + - type: bash_script + path: script.sh +test_resources: + - type: bash_script + path: test.sh + - type: file + path: test_data +engines: + - type: docker + image: quay.io/biocontainers/samtools:1.19.2--h50ea8bc_1 + setup: + - type: docker + run: | + samtools --version 2>&1 | grep -E '^(samtools|Using htslib)' | \ + sed 's#Using ##;s# \([0-9\.]*\)$#: \1#' > /var/software_versions.txt +runners: +- type: executable +- type: nextflow \ No newline at end of file diff --git a/src/samtools/samtools_sort/help.txt b/src/samtools/samtools_sort/help.txt new file mode 100644 index 00000000..27cd86a0 --- /dev/null +++ b/src/samtools/samtools_sort/help.txt @@ -0,0 +1,40 @@ +``` +samtools sort +``` + +Usage: samtools sort [options...] [in.bam] +Options: + -l INT Set compression level, from 0 (uncompressed) to 9 (best) + -u Output uncompressed data (equivalent to -l 0) + -m INT Set maximum memory per thread; suffix K/M/G recognized [768M] + -M Use minimiser for clustering unaligned/unplaced reads + -R Do not use reverse strand (only compatible with -M) + -K INT Kmer size to use for minimiser [20] + -I FILE Order minimisers by their position in FILE FASTA + -w INT Window size for minimiser indexing via -I ref.fa [100] + -H Squash homopolymers when computing minimiser + -n Sort by read name (natural): cannot be used with samtools index + -N Sort by read name (ASCII): cannot be used with samtools index + -t TAG Sort by value of TAG. 
Uses position as secondary index (or read name if -n is set) + -o FILE Write final output to FILE rather than standard output + -T PREFIX Write temporary files to PREFIX.nnnn.bam + --no-PG + Do not add a PG line + --template-coordinate + Sort by template-coordinate + --input-fmt-option OPT[=VAL] + Specify a single input file format option in the form + of OPTION or OPTION=VALUE + -O, --output-fmt FORMAT[,OPT[=VAL]]... + Specify output format (SAM, BAM, CRAM) + --output-fmt-option OPT[=VAL] + Specify a single output file format option in the form + of OPTION or OPTION=VALUE + --reference FILE + Reference sequence FASTA FILE [null] + -@, --threads INT + Number of additional threads to use [0] + --write-index + Automatically index the output files [off] + --verbosity INT + Set level of verbosity \ No newline at end of file diff --git a/src/samtools/samtools_sort/script.sh b/src/samtools/samtools_sort/script.sh new file mode 100644 index 00000000..94836c18 --- /dev/null +++ b/src/samtools/samtools_sort/script.sh @@ -0,0 +1,43 @@ +#!/bin/bash + +## VIASH START +## VIASH END + +set -e + +[[ "$par_uncompressed" == "false" ]] && unset par_uncompressed +[[ "$par_minimiser" == "false" ]] && unset par_minimiser +[[ "$par_not_reverse" == "false" ]] && unset par_not_reverse +[[ "$par_homopolymers" == "false" ]] && unset par_homopolymers +[[ "$par_natural_sort" == "false" ]] && unset par_natural_sort +[[ "$par_ascii_sort" == "false" ]] && unset par_ascii_sort +[[ "$par_template_coordinate" == "false" ]] && unset par_template_coordinate +[[ "$par_write_index" == "false" ]] && unset par_write_index +[[ "$par_no_PG" == "false" ]] && unset par_no_PG + + +samtools sort \ + ${par_compression:+-l "$par_compression"} \ + ${par_uncompressed:+-u} \ + ${par_minimiser:+-M} \ + ${par_not_reverse:+-R} \ + ${par_kmer_size:+-K "$par_kmer_size"} \ + ${par_order:+-I "$par_order"} \ + ${par_window:+-w "$par_window"} \ + ${par_homopolymers:+-H} \ + ${par_natural_sort:+-n} \ + ${par_ascii_sort:+-N} 
\ + ${par_tag:+-t "$par_tag"} \ + ${par_input_fmt_option:+--input-fmt-option "$par_input_fmt_option"} \ + ${par_template_coordinate:+--template-coordinate} \ + ${par_write_index:+--write-index} \ + ${par_prefix:+-T "$par_prefix"} \ + ${par_no_PG:+--no-PG} \ + ${par_output_fmt:+-O "$par_output_fmt"} \ + ${par_output_fmt_option:+--output-fmt-option "$par_output_fmt_option"} \ + ${par_reference:+--reference "$par_reference"} \ + -o "$par_output" \ + "$par_input" + +# save text files containing the output of samtools view for later comparison +samtools view "$par_output" -o "$par_output".txt \ No newline at end of file diff --git a/src/samtools/samtools_sort/test.sh b/src/samtools/samtools_sort/test.sh new file mode 100644 index 00000000..d8425dc9 --- /dev/null +++ b/src/samtools/samtools_sort/test.sh @@ -0,0 +1,79 @@ +#!/bin/bash +set -e # abort on the first failing command so a failed run or comparison fails the whole test +test_dir="${meta_resources_dir}/test_data" +out_dir="${meta_resources_dir}/test_data/text" + +# Files are compared using the "samtools view" output. +############################################################################################ + +echo ">>> Test 1: Sorting a BAM file" + +"$meta_executable" \ + --input "$test_dir/a.bam" \ + --output "$test_dir/a.sorted.bam" + +echo ">>> Check if output file exists" +[ ! -f "$test_dir/a.sorted.bam" ] \ + && echo "Output file a.sorted.bam does not exist" && exit 1 + +echo ">>> Check if output is empty" +[ ! 
-s "$test_dir/a.sorted.bam" ] \ + && echo "Output file a.sorted.bam is empty" && exit 1 + +echo ">>> Check if output matches expected output" +diff -a "$test_dir/a.sorted.bam.txt" "$out_dir/a_ref.sorted.txt" \ + || { echo "Output file a.sorted.bam does not match expected output"; exit 1; } + +rm "$test_dir/a.sorted.bam" "$test_dir/a.sorted.bam.txt" + +############################################################################################ + +echo ">>> Test 2: Sorting a BAM file according to ascii order" + +"$meta_executable" \ + --input "$test_dir/a.bam" \ + --ascii_sort \ + --output "$test_dir/ascii.sorted.bam" + +echo ">>> Check if output file exists" +[ ! -f "$test_dir/ascii.sorted.bam" ] \ + && echo "Output file ascii.sorted.bam does not exist" && exit 1 + +echo ">>> Check if output is empty" +[ ! -s "$test_dir/ascii.sorted.bam" ] \ + && echo "Output file ascii.sorted.bam is empty" && exit 1 + +echo ">>> Check if output matches expected output" +diff -a "$test_dir/ascii.sorted.bam.txt" "$out_dir/ascii_ref.sorted.txt" \ + || { echo "Output file ascii.sorted.bam does not match expected output"; exit 1; } + +rm "$test_dir/ascii.sorted.bam" "$test_dir/ascii.sorted.bam.txt" + +############################################################################################ + +echo ">>> Test 3: Sorting a BAM file with compression" + +"$meta_executable" \ + --input "$test_dir/a.bam" \ + --compression 5 \ + --output "$test_dir/compressed.sorted.bam" + +echo ">>> Check if output file exists" +[ ! -f "$test_dir/compressed.sorted.bam" ] \ + && echo "Output file compressed.sorted.bam does not exist" && exit 1 + +echo ">>> Check if output is empty" +[ ! 
-s "$test_dir/compressed.sorted.bam" ] \ + && echo "Output file compressed.sorted.bam is empty" && exit 1 + +echo ">>> Check if output matches expected output" # +diff "$test_dir/compressed.sorted.bam.txt" "$out_dir/compressed_ref.sorted.txt" \ + || (echo "Output file compressed.sorted.bam does not match expected output" && exit 1) + +rm "$test_dir/compressed.sorted.bam" "$test_dir/compressed.sorted.bam.txt" + +############################################################################################ + + +echo "All tests succeeded!" +exit 0 \ No newline at end of file diff --git a/src/samtools/samtools_sort/test_data/a.bam b/src/samtools/samtools_sort/test_data/a.bam new file mode 100644 index 0000000000000000000000000000000000000000..dba1268acbd8446e4fde54d7da33434597fbe635 GIT binary patch literal 184 zcmb2|=3rp}f&Xj_PR>jWb_~TuUs6R95)ukH_@3~5+q`PUgD)R98yP)FV(BtuE_7vW z=9s|5aI{h|P#vgC9!+};gK@G0Lz-KrbIh-tVq12fpEAOZjf CvNY8I literal 0 HcmV?d00001 diff --git a/src/samtools/samtools_sort/test_data/output/a_ref.sorted.bam b/src/samtools/samtools_sort/test_data/output/a_ref.sorted.bam new file mode 100644 index 0000000000000000000000000000000000000000..da4edc86ce845f32e87096851b93ac7d1e381b46 GIT binary patch literal 301 zcmb2|=3rp}f&Xj_PR>jW%NdG`zNF4Mb>P5*2M-TPK1)wsk$z0_P5#t5^W&#XnmIjU z(y6BcGAAV8q=yC`eEB#%HRZrlfd}VazDx?8Gb4J^{Q1FWBpvNHGNa_av_YI!_+Wof!sMr3TLFQ`<(V$q&ZVreVm$7< RRX2ta(;{gGW^nj`2mtJmbie=r literal 0 HcmV?d00001 diff --git a/src/samtools/samtools_sort/test_data/output/ascii_ref.sorted.bam b/src/samtools/samtools_sort/test_data/output/ascii_ref.sorted.bam new file mode 100644 index 0000000000000000000000000000000000000000..58e4f57e7061005d529f9fbeb58cde781386eb0f GIT binary patch literal 325 zcmb2|=3rp}f&Xj_PR>jWdl`It9Qm3IL|FG1hF+6v;AS{<=*YSN&nqn2%Q~Y!wXX6# zkQ6mdp9=h zMun!m;JFpWD(NOOt1~yM{_c^=SsT==$SVEaTu%RROU|eJuceo+l{c^t*e7~ynrT!| zNdE47ZW&sd-n#1^CUjcuN%)=q@W4G5hGh&7WX>==U_K=&@#Z|x(eh}n&tfo6c3?eZ4;(n~;o1NH{{!aA 
rg1`)`k2hz!Y@6$ua(kxFwz(%G-m)Gy^j@+&}~X>U@0T literal 0 HcmV?d00001 diff --git a/src/samtools/samtools_sort/test_data/output/compressed_ref.sorted.bam b/src/samtools/samtools_sort/test_data/output/compressed_ref.sorted.bam new file mode 100644 index 0000000000000000000000000000000000000000..d10c2c0079e633c0edc9ee46b03ca58e8a790ecf GIT binary patch literal 312 zcmb2|=3rp}f&Xj_PR>jWn;3k14sta)h`7$*B6Yp?z@LU$PL7*532ALODRAr56CZ&L z|Fn4q|1bBq>8`rFxBc+VJimk744<42t_xc7vFmF3MWv{}64#7QHpM< z+?sye$>OST((?$_`rpDH*9(u8J)K_EKCR-)N7eoblMAn8xN04o*fw#J*zbc<&wKLL z>^wd>r=%wM@BCHEJ$D@-gIR951il1$3!Anv*jajFTN0(%k$N-P%&v^fcyo zH#R!9G&U+;oRG-K%-p;H2o%(1W(Y1c?#!8cvZA!;V5wi*>y(6qgo@Aq|JSd`wzdKS ekIOS%ww+5^W5syfcdKp;Bc?^t49wu50ucafe|m-h literal 0 HcmV?d00001 diff --git a/src/samtools/samtools_sort/test_data/script.sh b/src/samtools/samtools_sort/test_data/script.sh new file mode 100755 index 00000000..a7a5b13c --- /dev/null +++ b/src/samtools/samtools_sort/test_data/script.sh @@ -0,0 +1,8 @@ +#!/bin/bash + +# dowload test data from snakemake wrapper +if [ ! 
-d /tmp/sort_source ]; then + git clone --depth 1 --single-branch --branch master https://github.com/snakemake/snakemake-wrappers.git /tmp/sort_source +fi + +cp -r /tmp/sort_source/bio/samtools/sort/test/mapped/* src/samtools/samtools_sort/test_data diff --git a/src/samtools/samtools_sort/test_data/text/a_ref.sorted.txt b/src/samtools/samtools_sort/test_data/text/a_ref.sorted.txt new file mode 100644 index 00000000..ce8d0527 --- /dev/null +++ b/src/samtools/samtools_sort/test_data/text/a_ref.sorted.txt @@ -0,0 +1,6 @@ +a1 99 xx 1 1 10M = 11 20 AAAAAAAAAA ********** +b1 99 xx 1 1 10M = 11 20 AAAAAAAAAA ********** +c1 99 xx 1 1 10M = 11 20 AAAAAAAAAA ********** +a1 147 xx 11 1 10M = 1 -20 TTTTTTTTTT ********** +b1 147 xx 11 1 10M = 1 -20 TTTTTTTTTT ********** +c1 147 xx 11 1 10M = 1 -20 TTTTTTTTTT ********** diff --git a/src/samtools/samtools_sort/test_data/text/ascii_ref.sorted.txt b/src/samtools/samtools_sort/test_data/text/ascii_ref.sorted.txt new file mode 100644 index 00000000..00cdbc69 --- /dev/null +++ b/src/samtools/samtools_sort/test_data/text/ascii_ref.sorted.txt @@ -0,0 +1,6 @@ +a1 99 xx 1 1 10M = 11 20 AAAAAAAAAA ********** +a1 147 xx 11 1 10M = 1 -20 TTTTTTTTTT ********** +b1 99 xx 1 1 10M = 11 20 AAAAAAAAAA ********** +b1 147 xx 11 1 10M = 1 -20 TTTTTTTTTT ********** +c1 99 xx 1 1 10M = 11 20 AAAAAAAAAA ********** +c1 147 xx 11 1 10M = 1 -20 TTTTTTTTTT ********** diff --git a/src/samtools/samtools_sort/test_data/text/compressed_ref.sorted.txt b/src/samtools/samtools_sort/test_data/text/compressed_ref.sorted.txt new file mode 100644 index 00000000..ce8d0527 --- /dev/null +++ b/src/samtools/samtools_sort/test_data/text/compressed_ref.sorted.txt @@ -0,0 +1,6 @@ +a1 99 xx 1 1 10M = 11 20 AAAAAAAAAA ********** +b1 99 xx 1 1 10M = 11 20 AAAAAAAAAA ********** +c1 99 xx 1 1 10M = 11 20 AAAAAAAAAA ********** +a1 147 xx 11 1 10M = 1 -20 TTTTTTTTTT ********** +b1 147 xx 11 1 10M = 1 -20 TTTTTTTTTT ********** +c1 147 xx 11 1 10M = 1 -20 TTTTTTTTTT **********