diff --git a/CHANGELOG.md b/CHANGELOG.md index dcc783c9..5f720035 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -16,16 +16,17 @@ * `rsem/rsem_calculate_expression`: Calculate expression levels (PR #93). -* `nanoplot`: Plotting tool for long read sequencing data and alignments (PR #95). +* `rseqc`: + - `rseqc/bam_stat`: Generate statistics from a bam file (PR #155). +* `nanoplot`: Plotting tool for long read sequencing data and alignments (PR #95). -## BREAKING CHANGES +## BUG FIXES * `falco`: Fix a typo in the `--reverse_complement` argument (PR #157). -## BUG FIXES +* `cutadapt`: Fix the the non-functional `action` parameter (PR #161). -* `cutadapt`: fix the the non-functional `action` parameter (PR #161). ## MINOR CHANGES diff --git a/src/rseqc/rseqc_bamstat/config.vsh.yaml b/src/rseqc/rseqc_bamstat/config.vsh.yaml new file mode 100644 index 00000000..6d607e2f --- /dev/null +++ b/src/rseqc/rseqc_bamstat/config.vsh.yaml @@ -0,0 +1,59 @@ +name: rseqc_bamstat +namespace: rseqc +keywords: [ rnaseq, genomics ] +description: Generate statistics from a bam file. +links: + homepage: https://rseqc.sourceforge.net/ + documentation: https://rseqc.sourceforge.net/#bam-stat-py + issue_tracker: https://github.com/MonashBioinformaticsPlatform/RSeQC/issues + repository: https://github.com/MonashBioinformaticsPlatform/RSeQC +references: + doi: 10.1093/bioinformatics/bts356 +license: GPL-3.0 +authors: + - __merge__: /src/_authors/emma_rousseau.yaml + roles: [ author, maintainer ] + +argument_groups: +- name: "Input" + arguments: + - name: "--input_file" + alternatives: -i + type: file + required: true + description: Input alignment file in BAM or SAM format. + - name: "--mapq" + alternatives: -q + type: integer + example: 30 + description: | + Minimum mapping quality (phred scaled) to determine uniquely mapped reads. Default: '30'. + +- name: "Output" + arguments: + - name: "--output" + type: file + direction: output + description: Output file (txt) with mapping quality statistics. + +resources: + - type: bash_script + path: script.sh +test_resources: + - type: bash_script + path: test.sh + - type: file + path: test_data + +engines: +- type: docker + image: python:3.10 + setup: + - type: python + packages: [ RSeQC ] + - type: docker + run: | + echo "RSeQC bam_stat.py: $(bam_stat.py --version | cut -d' ' -f2-)" > /var/software_versions.txt +runners: +- type: executable +- type: nextflow diff --git a/src/rseqc/rseqc_bamstat/help.txt b/src/rseqc/rseqc_bamstat/help.txt new file mode 100644 index 00000000..b4e9c1d9 --- /dev/null +++ b/src/rseqc/rseqc_bamstat/help.txt @@ -0,0 +1,18 @@ +``` +bam_stat.py -h +``` + +Usage: bam_stat.py [options] + +Summarizing mapping statistics of a BAM or SAM file. + + + +Options: + --version show program's version number and exit + -h, --help show this help message and exit + -i INPUT_FILE, --input-file=INPUT_FILE + Alignment file in BAM or SAM format. + -q MAP_QUAL, --mapq=MAP_QUAL + Minimum mapping quality (phred scaled) to determine + "uniquely mapped" reads. default=30 \ No newline at end of file diff --git a/src/rseqc/rseqc_bamstat/script.sh b/src/rseqc/rseqc_bamstat/script.sh new file mode 100644 index 00000000..32927bb6 --- /dev/null +++ b/src/rseqc/rseqc_bamstat/script.sh @@ -0,0 +1,9 @@ +#!/bin/bash + + +set -eo pipefail + +bam_stat.py \ + --input-file "${par_input_file}" \ + ${par_mapq:+--mapq "${par_mapq}"} \ +> $par_output diff --git a/src/rseqc/rseqc_bamstat/test.sh b/src/rseqc/rseqc_bamstat/test.sh new file mode 100644 index 00000000..f9180da8 --- /dev/null +++ b/src/rseqc/rseqc_bamstat/test.sh @@ -0,0 +1,49 @@ +#!/bin/bash + +# define input and output for script + +input_bam="sample.bam" +output_summary="mapping_quality.txt" + +# run executable and tests +echo "> Running $meta_functionality_name." + +"$meta_executable" \ + --input_file "$meta_resources_dir/test_data/$input_bam" \ + --output "$output_summary" + +exit_code=$? +[[ $exit_code != 0 ]] && echo "Non zero exit code: $exit_code" && exit 1 + +echo ">> Checking whether output is present" +[ ! -f "$output_summary" ] && echo "$output_summary file missing" && exit 1 +[ ! -s "$output_summary" ] && echo "$output_summary file is empty" && exit 1 + +echo ">> Checking whether output is correct" +diff "$meta_resources_dir/test_data/ref_output.txt" "$meta_resources_dir/$output_summary" || { echo "Output is not correct"; exit 1; } + +############################################################################# + +echo ">>> Test 2: Test with non-default mapping quality threshold" + +output_summary="mapping_quality_mapq_50.txt" + +# run executable and tests +echo "> Running $meta_functionality_name." + +"$meta_executable" \ + --input_file "$meta_resources_dir/test_data/$input_bam" \ + --output "$output_summary" \ + --mapq 50 + +exit_code=$? +[[ $exit_code != 0 ]] && echo "Non zero exit code: $exit_code" && exit 1 + +echo ">> Checking whether output is present" +[ ! -f "$output_summary" ] && echo "$output_summary file missing" && exit 1 +[ ! -s "$output_summary" ] && echo "$output_summary file is empty" && exit 1 + +echo ">> Checking whether output is correct" +diff "$meta_resources_dir/test_data/ref_output_mapq.txt" "$meta_resources_dir/$output_summary" || { echo "Output is not correct"; exit 1; } + +exit 0 \ No newline at end of file diff --git a/src/rseqc/rseqc_bamstat/test_data/ref_output.txt b/src/rseqc/rseqc_bamstat/test_data/ref_output.txt new file mode 100644 index 00000000..6b939096 --- /dev/null +++ b/src/rseqc/rseqc_bamstat/test_data/ref_output.txt @@ -0,0 +1,22 @@ + +#================================================== +#All numbers are READ count +#================================================== + +Total records: 90 + +QC failed: 0 +Optical/PCR duplicate: 0 +Non primary hits 0 +Unmapped reads: 1 +mapq < mapq_cut (non-unique): 0 + +mapq >= mapq_cut (unique): 89 +Read-1: 45 +Read-2: 44 +Reads map to '+': 44 +Reads map to '-': 45 +Non-splice reads: 89 +Splice reads: 0 +Reads mapped in proper pairs: 88 +Proper-paired reads map to different chrom:0 diff --git a/src/rseqc/rseqc_bamstat/test_data/ref_output_mapq.txt b/src/rseqc/rseqc_bamstat/test_data/ref_output_mapq.txt new file mode 100644 index 00000000..be8af62f --- /dev/null +++ b/src/rseqc/rseqc_bamstat/test_data/ref_output_mapq.txt @@ -0,0 +1,22 @@ + +#================================================== +#All numbers are READ count +#================================================== + +Total records: 90 + +QC failed: 0 +Optical/PCR duplicate: 0 +Non primary hits 0 +Unmapped reads: 1 +mapq < mapq_cut (non-unique): 6 + +mapq >= mapq_cut (unique): 83 +Read-1: 42 +Read-2: 41 +Reads map to '+': 44 +Reads map to '-': 39 +Non-splice reads: 83 +Splice reads: 0 +Reads mapped in proper pairs: 83 +Proper-paired reads map to different chrom:0 diff --git a/src/rseqc/rseqc_bamstat/test_data/sample.bam b/src/rseqc/rseqc_bamstat/test_data/sample.bam new file mode 100644 index 00000000..ed1e2433 Binary files /dev/null and b/src/rseqc/rseqc_bamstat/test_data/sample.bam differ