diff --git a/CHANGELOG.md b/CHANGELOG.md
index 6bd21a1e..2f4c0c71 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -31,6 +31,8 @@
   - `bedtools/bedtools_sort`: Sorts a feature file (bed/gff/vcf) by chromosome and other criteria (PR #98).
   - `bedtools/bedtools_bamtofastq`: Convert BAM alignments to FASTQ files (PR #101).
   - `bedtools/bedtools_bedtobam`: Converts genomic feature records (bed/gff/vcf) to BAM format (PR #111).
+
+* `qualimap/qualimap_rnaseq`: RNA-seq QC analysis using qualimap (PR #74).
 
 ## MINOR CHANGES
 
diff --git a/src/qualimap/qualimap_rnaseq/config.vsh.yaml b/src/qualimap/qualimap_rnaseq/config.vsh.yaml
new file mode 100644
index 00000000..ffc807ab
--- /dev/null
+++ b/src/qualimap/qualimap_rnaseq/config.vsh.yaml
@@ -0,0 +1,103 @@
+name: qualimap_rnaseq
+namespace: qualimap
+keywords: ['RNA-seq', 'quality control', 'QC Report']
+description: |
+  Qualimap RNA-seq QC reports quality control metrics and bias estimations
+  which are specific for whole transcriptome sequencing, including reads genomic
+  origin, junction analysis, transcript coverage and 5’-3’ bias computation.
+links:
+  homepage: 'http://qualimap.conesalab.org/'
+  documentation: 'http://qualimap.conesalab.org/doc_html/analysis.html#rna-seq-qc'
+  issue_tracker: 'https://bitbucket.org/kokonech/qualimap/issues?status=new&status=open'
+  repository: 'https://bitbucket.org/kokonech/qualimap/commits/branch/master'
+references:
+  doi: '10.1093/bioinformatics/btv566'
+license: 'GPL-2.0'
+authors:
+  - __merge__: '/src/_authors/dorien_roosen.yaml'
+    roles: [author, maintainer]
+argument_groups:
+  - name: 'Input'
+    arguments:
+      - name: '--bam'
+        description: 'Path to the sequence alignment file in BAM format, produced by a splicing-aware aligner.'
+        type: file
+        required: true
+        example: 'alignment.bam'
+      - name: '--gtf'
+        description: 'Path to genomic annotations in Ensembl GTF format.'
+        type: file
+        required: true
+        example: 'annotations.gtf'
+
+  - name: "Output"  # files written by the component (direction: output)
+    arguments:
+      - name: "--qc_results"
+        direction: output
+        type: file
+        required: true
+        example: rnaseq_qc_results.txt
+        description: Text file containing the RNAseq QC results.
+      - name: "--counts"
+        type: file
+        required: false
+        direction: output
+        description: Output file for computed counts.
+      - name: "--report"
+        type: file
+        direction: output
+        required: false
+        example: report.html
+        description: Report output file. Supported formats are PDF or HTML.
+
+  - name: "Optional"  # none of the arguments below is required
+    arguments:
+      - name: "--num_pr_bases"
+        type: integer
+        required: false
+        min: 1
+        description: Number of upstream/downstream nucleotide bases to compute 5'-3' bias (default = 100).
+      - name: "--num_tr_bias"
+        type: integer
+        required: false
+        min: 1
+        description: Number of top highly expressed transcripts to compute 5'-3' bias (default = 1000).
+      - name: "--algorithm"
+        type: string
+        required: false
+        choices: ["uniquely-mapped-reads", "proportional"]
+        description: Counting algorithm (uniquely-mapped-reads (default) or proportional).
+      - name: "--sequencing_protocol"
+        type: string
+        required: false
+        choices: ["non-strand-specific", "strand-specific-reverse", "strand-specific-forward"]
+        description: Sequencing library protocol (strand-specific-forward, strand-specific-reverse or non-strand-specific (default)).
+      - name: "--paired"
+        type: boolean_true
+        description: Setting this flag for paired-end experiments will result in counting fragments instead of reads.
+      - name: "--sorted"
+        type: boolean_true
+        description: Setting this flag indicates that the input file is already sorted by name. If flag is not set, additional sorting by name will be performed. Only required for paired-end analysis.
+      - name: "--java_memory_size"
+        type: string
+        required: false
+        description: Maximum Java heap memory size, default = 4G.
+
+resources:
+  - type: bash_script
+    path: script.sh
+test_resources:
+  - type: bash_script
+    path: test.sh
+  - path: test_data/
+
+engines:
+  - type: docker
+    image: quay.io/biocontainers/qualimap:2.3--hdfd78af_0
+    setup:
+      - type: docker
+        run: |
+          echo QualiMap: $(qualimap 2>&1 | grep QualiMap | sed 's/^.*QualiMap//') > /var/software_versions.txt
+runners:
+  - type: executable
+  - type: nextflow
diff --git a/src/qualimap/qualimap_rnaseq/help.txt b/src/qualimap/qualimap_rnaseq/help.txt
new file mode 100644
index 00000000..c6493ed9
--- /dev/null
+++ b/src/qualimap/qualimap_rnaseq/help.txt
@@ -0,0 +1,52 @@
+QualiMap v.2.3
+Built on 2023-05-19 16:57
+
+usage: qualimap <tool> [options]
+
+To launch GUI leave <tool> empty.
+
+Available tools:
+
+    bamqc            Evaluate NGS mapping to a reference genome
+    rnaseq           Evaluate RNA-seq alignment data
+    counts           Counts data analysis (further RNA-seq data evaluation)
+    multi-bamqc      Compare QC reports from multiple NGS mappings
+    clustering       Cluster epigenomic signals
+    comp-counts      Compute feature counts
+
+Special arguments:
+
+    --java-mem-size  Use this argument to set Java memory heap size. Example:
+                     qualimap bamqc -bam very_large_alignment.bam --java-mem-size=4G
+
+usage: qualimap rnaseq [-a <arg>] -bam <arg> -gtf <arg> [-npb <arg>] [-ntb
+       <arg>] [-oc <arg>] [-outdir <arg>] [-outfile <arg>] [-outformat <arg>]
+       [-p <arg>] [-pe] [-s]
+ -a,--algorithm <arg>             Counting algorithm:
+                                  uniquely-mapped-reads(default) or
+                                  proportional.
+ -bam <arg>                       Input mapping file in BAM format.
+ -gtf <arg>                       Annotations file in Ensembl GTF format.
+ -npb,--num-pr-bases <arg>        Number of upstream/downstream nucleotide bases
+                                  to compute 5'-3' bias (default is 100).
+ -ntb,--num-tr-bias <arg>         Number of top highly expressed transcripts to
+                                  compute 5'-3' bias (default is 1000).
+ -oc <arg>                        Output file for computed counts. If only name
+                                  of the file is provided, then the file will be
+                                  saved in the output folder.
+ -outdir <arg>                    Output folder for HTML report and raw data.
+ -outfile <arg>                   Output file for PDF report (default value is
+                                  report.pdf).
+ -outformat <arg>                 Format of the output report (PDF, HTML or both
+                                  PDF:HTML, default is HTML).
+ -p,--sequencing-protocol <arg>   Sequencing library protocol:
+                                  strand-specific-forward,
+                                  strand-specific-reverse or non-strand-specific
+                                  (default)
+ -pe,--paired                     Setting this flag for paired-end experiments
+                                  will result in counting fragments instead of
+                                  reads
+ -s,--sorted                      This flag indicates that the input file is
+                                  already sorted by name. If not set, additional
+                                  sorting by name will be performed. Only
+                                  required for paired-end analysis.
\ No newline at end of file
diff --git a/src/qualimap/qualimap_rnaseq/script.sh b/src/qualimap/qualimap_rnaseq/script.sh
new file mode 100644
index 00000000..351e5159
--- /dev/null
+++ b/src/qualimap/qualimap_rnaseq/script.sh
@@ -0,0 +1,70 @@
+#!/bin/bash
+
+# Runs `qualimap rnaseq` into a scratch directory and then moves the requested
+# outputs (QC results, optional report, optional counts) to the locations
+# given by the par_* variables.
+
+set -eo pipefail
+
+# Scratch directory that receives qualimap's output; removed when the script exits.
+tmp_dir=$(mktemp -d -p "$meta_temp_dir" qualimap_XXXXXXXXX)
+trap 'rm -rf "$tmp_dir"' EXIT
+
+# Handle output parameters
+if [ -n "$par_report" ]; then
+  outfile=$(basename "$par_report")
+  report_extension="${outfile##*.}"
+  # The extension doubles as the -outformat value; anything else would leave no report to move.
+  if [ "$report_extension" != "html" ] && [ "$report_extension" != "pdf" ]; then
+    echo "Unsupported report format '$report_extension': --report must end in .html or .pdf" >&2
+    exit 1
+  fi
+fi
+
+if [ -n "$par_counts" ]; then
+  counts=$(basename "$par_counts")
+fi
+
+# Java heap size: an explicit --java_memory_size takes precedence over the
+# value derived from meta_memory_mb.
+if [ -n "$par_java_memory_size" ]; then
+  java_mem_size="$par_java_memory_size"
+elif [ -n "$meta_memory_mb" ]; then
+  java_mem_size="${meta_memory_mb}M"
+fi
+
+# disable flags: an unset variable makes the ${var:+...} expansions below expand to nothing
+[[ "$par_paired" == "false" ]] && unset par_paired
+[[ "$par_sorted" == "false" ]] && unset par_sorted
+
+# Run qualimap
+qualimap rnaseq \
+  ${java_mem_size:+--java-mem-size="$java_mem_size"} \
+  ${par_algorithm:+--algorithm "$par_algorithm"} \
+  ${par_sequencing_protocol:+--sequencing-protocol "$par_sequencing_protocol"} \
+  -bam "$par_bam" \
+  -gtf "$par_gtf" \
+  -outdir "$tmp_dir" \
+  ${par_num_pr_bases:+--num-pr-bases "$par_num_pr_bases"} \
+  ${par_num_tr_bias:+--num-tr-bias "$par_num_tr_bias"} \
+  ${par_report:+-outformat "$report_extension"} \
+  ${par_paired:+--paired} \
+  ${par_sorted:+--sorted} \
+  ${par_report:+-outfile "$outfile"} \
+  ${par_counts:+-oc "$counts"}
+
+# Move output files
+mv "$tmp_dir/rnaseq_qc_results.txt" "$par_qc_results"
+
+if [ -n "$par_report" ]; then
+  # The HTML report is picked up as qualimapReport.html; the PDF report under the -outfile name.
+  if [ "$report_extension" = "html" ]; then
+    mv "$tmp_dir/qualimapReport.html" "$par_report"
+  else
+    mv "$tmp_dir/$outfile" "$par_report"
+  fi
+fi
+
+if [ -n "$par_counts" ]; then
+  mv "$tmp_dir/$counts" "$par_counts"
+fi
diff --git a/src/qualimap/qualimap_rnaseq/test.sh b/src/qualimap/qualimap_rnaseq/test.sh
new file mode 100755
index 00000000..2e1b647b
--- /dev/null
+++ b/src/qualimap/qualimap_rnaseq/test.sh
@@ -0,0 +1,119 @@
+#!/bin/bash
+
+set -e
+
+#############################################
+# helper functions
+# Each helper prints a message and exits with status 1 when its check fails.
+
+# Fail unless "$1" is an existing regular file.
+assert_file_exists() {
+  [ -f "$1" ] || { echo "File '$1' does not exist" && exit 1; }
+}
+# Fail if "$1" is an existing regular file.
+assert_file_doesnt_exist() {
+  [ ! -f "$1" ] || { echo "File '$1' exists but shouldn't" && exit 1; }
+}
+# Fail unless "$1" exists and has a size greater than zero.
+assert_file_not_empty() {
+  [ -s "$1" ] || { echo "File '$1' is empty but shouldn't be" && exit 1; }
+}
+# Fail unless file "$1" contains the literal string "$2" (fixed-string match, no regex).
+assert_file_contains() {
+  grep -qF -- "$2" "$1" || { echo "File '$1' does not contain '$2'" && exit 1; }
+}
+#############################################
+
+
+test_dir="$meta_resources_dir/test_data"
+
+mkdir "run_qualimap_rnaseq_html"
+cd "run_qualimap_rnaseq_html"
+
+echo "> Running qualimap with html output report"
+
+"$meta_executable" \
+  --bam "$test_dir/a.bam" \
+  --gtf "$test_dir/annotation.gtf" \
+  --report report.html \
+  --counts counts.txt \
+  --qc_results output.txt
+
+echo ">> Checking output"
+assert_file_exists "report.html"
+assert_file_exists "counts.txt"
+assert_file_exists "output.txt"
+assert_file_doesnt_exist "report.pdf"
+
+echo ">> Checking if output is empty"
+assert_file_not_empty "report.html"
+assert_file_not_empty "counts.txt"
+assert_file_not_empty "output.txt"
+
+echo ">> Checking output contents"
+assert_file_contains "output.txt" ">>>>>>> Input"
+assert_file_contains "output.txt" ">>>>>>> Reads alignment"
+assert_file_contains "output.txt" ">>>>>>> Reads genomic origin"
+assert_file_contains "output.txt" ">>>>>>> Transcript coverage profile"
+assert_file_contains "output.txt" ">>>>>>> Junction analysis"
+
+assert_file_contains "counts.txt" "ENSG00000125841.12"
+
+assert_file_contains "report.html" "Qualimap report: RNA Seq QC"
+assert_file_contains "report.html" "<h3>Input</h3>"
+assert_file_contains "report.html" "<h3>Reads alignment</h3>"
+assert_file_contains "report.html" "<h3>Reads genomic origin</h3>"
+assert_file_contains "report.html" "<h3>Transcript coverage profile</h3>"
+assert_file_contains "report.html" "<h3>Junction analysis</h3>"
+
+
+cd ..
+rm -r run_qualimap_rnaseq_html
+
+mkdir "run_qualimap_rnaseq_pdf"
+cd "run_qualimap_rnaseq_pdf"
+
+echo "> Running qualimap with pdf output report"
+
+"$meta_executable" \
+  --bam "$test_dir/a.bam" \
+  --gtf "$test_dir/annotation.gtf" \
+  --report report.pdf \
+  --counts counts.txt \
+  --qc_results output.txt
+
+echo ">> Checking output"
+assert_file_exists "report.pdf"
+assert_file_exists "counts.txt"
+assert_file_exists "output.txt"
+assert_file_doesnt_exist "report.html"
+
+echo ">> Checking if output is empty"
+assert_file_not_empty "report.pdf"
+assert_file_not_empty "counts.txt"
+assert_file_not_empty "output.txt"
+
+cd ..
+rm -r run_qualimap_rnaseq_pdf
+
+mkdir "run_qualimap_rnaseq"
+cd "run_qualimap_rnaseq"
+
+echo "> Running qualimap without report and counts output"
+
+"$meta_executable" \
+  --bam "$test_dir/a.bam" \
+  --gtf "$test_dir/annotation.gtf" \
+  --qc_results output.txt
+
+echo ">> Checking output"
+assert_file_doesnt_exist "report.pdf"
+assert_file_doesnt_exist "report.html"
+assert_file_doesnt_exist "counts.txt"
+assert_file_exists "output.txt"
+
+echo ">> Checking if output is empty"
+assert_file_not_empty "output.txt"
+
+cd ..
+rm -r run_qualimap_rnaseq
diff --git a/src/qualimap/qualimap_rnaseq/test_data/a.bam b/src/qualimap/qualimap_rnaseq/test_data/a.bam
new file mode 100644
index 00000000..c8ea1065
Binary files /dev/null and b/src/qualimap/qualimap_rnaseq/test_data/a.bam differ
diff --git a/src/qualimap/qualimap_rnaseq/test_data/annotation.gtf b/src/qualimap/qualimap_rnaseq/test_data/annotation.gtf
new file mode 100644
index 00000000..976de753
--- /dev/null
+++ b/src/qualimap/qualimap_rnaseq/test_data/annotation.gtf
@@ -0,0 +1,10 @@
+chr20 HAVANA transcript 347024 354868 . + . 
gene_id "ENSG00000125841.12"; transcript_id "ENST00000382291.7"; gene_type "protein_coding"; gene_name "NRSN2"; transcript_type "protein_coding"; transcript_name "NRSN2-202"; level 2; protein_id "ENSP00000371728.3"; transcript_support_level "2"; tag "basic"; tag "appris_principal_1"; tag "CCDS"; ccdsid "CCDS12996.1"; havana_gene "OTTHUMG00000031628.5"; havana_transcript "OTTHUMT00000077446.1"; +chr20 HAVANA exon 347024 347142 . + . gene_id "ENSG00000125841.12"; transcript_id "ENST00000382291.7"; gene_type "protein_coding"; gene_name "NRSN2"; transcript_type "protein_coding"; transcript_name "NRSN2-202"; exon_number 1; exon_id "ENSE00001831391.1"; level 2; protein_id "ENSP00000371728.3"; transcript_support_level "2"; tag "basic"; tag "appris_principal_1"; tag "CCDS"; ccdsid "CCDS12996.1"; havana_gene "OTTHUMG00000031628.5"; havana_transcript "OTTHUMT00000077446.1"; +chr20 HAVANA exon 349249 349363 . + . gene_id "ENSG00000125841.12"; transcript_id "ENST00000382291.7"; gene_type "protein_coding"; gene_name "NRSN2"; transcript_type "protein_coding"; transcript_name "NRSN2-202"; exon_number 2; exon_id "ENSE00001491647.1"; level 2; protein_id "ENSP00000371728.3"; transcript_support_level "2"; tag "basic"; tag "appris_principal_1"; tag "CCDS"; ccdsid "CCDS12996.1"; havana_gene "OTTHUMG00000031628.5"; havana_transcript "OTTHUMT00000077446.1"; +chr20 HAVANA exon 349638 349832 . + . gene_id "ENSG00000125841.12"; transcript_id "ENST00000382291.7"; gene_type "protein_coding"; gene_name "NRSN2"; transcript_type "protein_coding"; transcript_name "NRSN2-202"; exon_number 3; exon_id "ENSE00003710328.1"; level 2; protein_id "ENSP00000371728.3"; transcript_support_level "2"; tag "basic"; tag "appris_principal_1"; tag "CCDS"; ccdsid "CCDS12996.1"; havana_gene "OTTHUMG00000031628.5"; havana_transcript "OTTHUMT00000077446.1"; +chr20 HAVANA CDS 349644 349832 . 
+ 0 gene_id "ENSG00000125841.12"; transcript_id "ENST00000382291.7"; gene_type "protein_coding"; gene_name "NRSN2"; transcript_type "protein_coding"; transcript_name "NRSN2-202"; exon_number 3; exon_id "ENSE00003710328.1"; level 2; protein_id "ENSP00000371728.3"; transcript_support_level "2"; tag "basic"; tag "appris_principal_1"; tag "CCDS"; ccdsid "CCDS12996.1"; havana_gene "OTTHUMG00000031628.5"; havana_transcript "OTTHUMT00000077446.1"; +chr20 HAVANA start_codon 349644 349646 . + 0 gene_id "ENSG00000125841.12"; transcript_id "ENST00000382291.7"; gene_type "protein_coding"; gene_name "NRSN2"; transcript_type "protein_coding"; transcript_name "NRSN2-202"; exon_number 3; exon_id "ENSE00003710328.1"; level 2; protein_id "ENSP00000371728.3"; transcript_support_level "2"; tag "basic"; tag "appris_principal_1"; tag "CCDS"; ccdsid "CCDS12996.1"; havana_gene "OTTHUMG00000031628.5"; havana_transcript "OTTHUMT00000077446.1"; +chr20 HAVANA exon 353210 354868 . + . gene_id "ENSG00000125841.12"; transcript_id "ENST00000382291.7"; gene_type "protein_coding"; gene_name "NRSN2"; transcript_type "protein_coding"; transcript_name "NRSN2-202"; exon_number 4; exon_id "ENSE00001822456.1"; level 2; protein_id "ENSP00000371728.3"; transcript_support_level "2"; tag "basic"; tag "appris_principal_1"; tag "CCDS"; ccdsid "CCDS12996.1"; havana_gene "OTTHUMG00000031628.5"; havana_transcript "OTTHUMT00000077446.1"; +chr20 HAVANA CDS 353210 353632 . + 0 gene_id "ENSG00000125841.12"; transcript_id "ENST00000382291.7"; gene_type "protein_coding"; gene_name "NRSN2"; transcript_type "protein_coding"; transcript_name "NRSN2-202"; exon_number 4; exon_id "ENSE00001822456.1"; level 2; protein_id "ENSP00000371728.3"; transcript_support_level "2"; tag "basic"; tag "appris_principal_1"; tag "CCDS"; ccdsid "CCDS12996.1"; havana_gene "OTTHUMG00000031628.5"; havana_transcript "OTTHUMT00000077446.1"; +chr20 HAVANA stop_codon 353633 353635 . 
+ 0 gene_id "ENSG00000125841.12"; transcript_id "ENST00000382291.7"; gene_type "protein_coding"; gene_name "NRSN2"; transcript_type "protein_coding"; transcript_name "NRSN2-202"; exon_number 4; exon_id "ENSE00001822456.1"; level 2; protein_id "ENSP00000371728.3"; transcript_support_level "2"; tag "basic"; tag "appris_principal_1"; tag "CCDS"; ccdsid "CCDS12996.1"; havana_gene "OTTHUMG00000031628.5"; havana_transcript "OTTHUMT00000077446.1";
+chr20 HAVANA UTR 347024 347142 . + . gene_id "ENSG00000125841.12"; transcript_id "ENST00000382291.7"; gene_type "protein_coding"; gene_name "NRSN2"; transcript_type "protein_coding"; transcript_name "NRSN2-202"; exon_number 1; exon_id "ENSE00001831391.1"; level 2; protein_id "ENSP00000371728.3"; transcript_support_level "2"; tag "basic"; tag "appris_principal_1"; tag "CCDS"; ccdsid "CCDS12996.1"; havana_gene "OTTHUMG00000031628.5"; havana_transcript "OTTHUMT00000077446.1";
diff --git a/src/qualimap/qualimap_rnaseq/test_data/script.sh b/src/qualimap/qualimap_rnaseq/test_data/script.sh
new file mode 100755
index 00000000..801fe405
--- /dev/null
+++ b/src/qualimap/qualimap_rnaseq/test_data/script.sh
@@ -0,0 +1,15 @@
+#!/bin/bash
+
+# qualimap test data
+
+# Test data was obtained from https://github.com/snakemake/snakemake-wrappers/raw/master/bio/qualimap/rnaseq/test
+
+set -eo pipefail
+
+# The destination paths are relative, so run this from the directory that contains src/.
+if [ ! -d /tmp/snakemake-wrappers ]; then
+  git clone --depth 1 --single-branch --branch master https://github.com/snakemake/snakemake-wrappers /tmp/snakemake-wrappers
+fi
+
+cp /tmp/snakemake-wrappers/bio/qualimap/rnaseq/test/mapped/a.bam src/qualimap/qualimap_rnaseq/test_data
+cp /tmp/snakemake-wrappers/bio/qualimap/rnaseq/test/annotation.gtf src/qualimap/qualimap_rnaseq/test_data