diff --git a/CHANGELOG.md b/CHANGELOG.md index 45a2a111..bcf6aa71 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -31,8 +31,21 @@ * `bedtools`: - `bedtools/bedtools_intersect`: Allows one to screen for overlaps between two sets of genomic features (PR #94). - `bedtools/bedtools_sort`: Sorts a feature file (bed/gff/vcf) by chromosome and other criteria (PR #98). + - `bedtools/bedtools_groupby`: Summarizes a dataset column based upon common column groupings. Akin to the SQL "group by" command (PR #123). + - `bedtools/bedtools_merge`: Merges overlapping BED/GFF/VCF entries into a single interval (PR #118). - `bedtools/bedtools_bamtofastq`: Convert BAM alignments to FASTQ files (PR #101). - `bedtools/bedtools_bedtobam`: Converts genomic feature records (bed/gff/vcf) to BAM format (PR #111). + - `bedtools/bedtools_bed12tobed6`: Converts BED12 files to BED6 files (PR #140). + - `bedtools/bedtools_links`: Creates an HTML file with links to an instance of the UCSC Genome Browser for all features / intervals in a (bed/gff/vcf) file (PR #137). + +* `qualimap/qualimap_rnaseq`: RNA-seq QC analysis using qualimap (PR #74). + +* `rsem/rsem_prepare_reference`: Prepare transcript references for RSEM (PR #89). + +* `bcftools`: + - `bcftools/bcftools_sort`: Sorts BCF/VCF files by position and other criteria (PR #141). + +* `fastqc`: High throughput sequence quality control analysis tool (PR #92). ## MINOR CHANGES diff --git a/src/bcftools/bcftools_sort/config.vsh.yaml b/src/bcftools/bcftools_sort/config.vsh.yaml new file mode 100644 index 00000000..71a15309 --- /dev/null +++ b/src/bcftools/bcftools_sort/config.vsh.yaml @@ -0,0 +1,73 @@ +name: bcftools_sort +namespace: bcftools +description: | + Sorts VCF/BCF files. +keywords: [Sort, VCF, BCF] +links: + homepage: https://samtools.github.io/bcftools/ + documentation: https://samtools.github.io/bcftools/bcftools.html#sort + repository: https://github.com/samtools/bcftools + issue_tracker: https://github.com/samtools/bcftools/issues +references: + doi: https://doi.org/10.1093/gigascience/giab008 +license: MIT/Expat, GNU +requirements: + commands: [bcftools] +authors: + - __merge__: /src/_authors/theodoro_gasperin.yaml + roles: [ author, maintainer ] + +argument_groups: + - name: Inputs + arguments: + - name: --input + alternatives: -i + type: file + description: Input VCF/BCF file. + required: true + + - name: Outputs + arguments: + - name: --output + alternatives: -o + direction: output + type: file + description: Output sorted VCF/BCF file. + required: true + + - name: Options + arguments: + - name: --output_type + alternatives: -O + type: string + choices: [b, u, z, v] + description: | + Compresses or uncompresses the output. + The options are: + b: compressed BCF, + u: uncompressed BCF, + z: compressed VCF, + v: uncompressed VCF. + +resources: + - type: bash_script + path: script.sh + +test_resources: + - type: bash_script + path: test.sh + - path: test_data + +engines: + - type: docker + image: debian:stable-slim + setup: + - type: apt + packages: [bcftools, procps] + - type: docker + run: | + echo "bcftools: \"$(bcftools --version | grep 'bcftools' | sed -n 's/^bcftools //p')\"" > /var/software_versions.txt + +runners: + - type: executable + - type: nextflow diff --git a/src/bcftools/bcftools_sort/help.txt b/src/bcftools/bcftools_sort/help.txt new file mode 100644 index 00000000..3b5fa80b --- /dev/null +++ b/src/bcftools/bcftools_sort/help.txt @@ -0,0 +1,14 @@ +``` +bcftools sort +``` + +About: Sort VCF/BCF file. 
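As a quick illustration of the options documented below (a sketch only; the input and output file names are hypothetical):

```bash
# Sort a VCF by position, capping memory at 1G, and write compressed BCF:
bcftools sort -m 1G -O b -o sorted.bcf unsorted.vcf
```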
+Usage: bcftools sort [OPTIONS] + +Options: + -m, --max-mem FLOAT[kMG] maximum memory to use [768M] + -o, --output FILE output file name [stdout] + -O, --output-type b|u|z|v b: compressed BCF, u: uncompressed BCF, z: compressed VCF, v: uncompressed VCF [v] + -O, --output-type u|b|v|z[0-9] u/b: un/compressed BCF, v/z: un/compressed VCF, 0-9: compression level [v] + -T, --temp-dir DIR temporary files [/tmp/bcftools.XXXXXX] + diff --git a/src/bcftools/bcftools_sort/script.sh b/src/bcftools/bcftools_sort/script.sh new file mode 100644 index 00000000..e9afb223 --- /dev/null +++ b/src/bcftools/bcftools_sort/script.sh @@ -0,0 +1,16 @@ +#!/bin/bash + +## VIASH START +## VIASH END + +# Exit on error +set -eo pipefail + +# Execute bedtools bamtofastq with the provided arguments +bcftools sort \ + -o "$par_output" \ + ${par_output_type:+-O "$par_output_type"} \ + ${meta_memory_mb:+-m "${meta_memory_mb}M"} \ + ${meta_temp_dir:+-T "$meta_temp_dir"} \ + $par_input \ + diff --git a/src/bcftools/bcftools_sort/test.sh b/src/bcftools/bcftools_sort/test.sh new file mode 100644 index 00000000..f406b8e2 --- /dev/null +++ b/src/bcftools/bcftools_sort/test.sh @@ -0,0 +1,185 @@ +#!/bin/bash + +## VIASH START +## VIASH END + +# Exit on error +set -eo pipefail + +test_data="$meta_resources_dir/test_data" + +############################################# +# helper functions +assert_file_exists() { + [ -f "$1" ] || { echo "File '$1' does not exist" && exit 1; } +} +assert_file_not_empty() { + [ -s "$1" ] || { echo "File '$1' is empty but shouldn't be" && exit 1; } +} +assert_file_contains() { + grep -q "$2" "$1" || { echo "File '$1' does not contain '$2'" && exit 1; } +} +assert_identical_content() { + diff -a "$2" "$1" \ + || (echo "Files are not identical!" && exit 1) +} +############################################# + +# Create directories for tests +echo "Creating Test Data..." +TMPDIR=$(mktemp -d "$meta_temp_dir/XXXXXX") +function clean_up { + [[ -d "$TMPDIR" ]] && rm -r "$TMPDIR" +} +trap clean_up EXIT + +# Create test data +cat < "$TMPDIR/example.vcf" +##fileformat=VCFv4.0 +##fileDate=20090805 +##source=myImputationProgramV3.1 +##reference=1000GenomesPilot-NCBI36 +##contig= +##contig= +##phasing=partial +##INFO= +##INFO= +##INFO= +##INFO= +##INFO= +##INFO= +##INFO= +##INFO= +##FILTER= +##FILTER= +##FORMAT= +##FORMAT= +##FORMAT= +##FORMAT= +##ALT= +##ALT= +#CHROM POS ID REF ALT QUAL FILTER INFO FORMAT NA00001 NA00002 NA00003 +19 112 . A G 10 . . GT:HQ 0|0:10,10 0|0:10,10 0/1:3,3 +19 111 . A C 9.6 . . GT:HQ 0|0:10,10 0|0:10,10 0/1:3,3 +20 1235237 . T . . . . GT 0/0 0|0 ./. +20 14370 rs6054257 G A 29 PASS NS=3;DP=14;AF=0.5;DB;H2 GT:GQ:DP:HQ 0|0:48:1:51,51 1|0:48:8:51,51 1/1:43:5:.,. +20 17330 . T A 3 q10 NS=3;DP=11;AF=0.017 GT:GQ:DP:HQ 0|0:49:3:58,50 0|1:3:5:65,3 0/0:41:3:.,. +20 1110696 rs6040355 A G,T 67 PASS NS=2;DP=10;AF=0.333,0.667;AA=T;DB GT:GQ:DP:HQ 1|2:21:6:23,27 2|1:2:0:18,2 2/2:35:4:.,. +20 1230237 . T . 47 PASS NS=3;DP=13;AA=T GT:GQ:DP:HQ 0|0:54:.:56,60 0|0:48:4:51,51 0/0:61:2:.,. 
+20 1234567 microsat1 G GA,GAC 50 PASS NS=3;DP=9;AA=G;AN=6;AC=3,1 GT:GQ:DP 0/1:.:4 0/2:17:2 1/1:40:3 +EOF + +# Create expected output +cat < "$TMPDIR/expected_output.vcf" +##fileformat=VCFv4.0 +##FILTER= +##fileDate=20090805 +##source=myImputationProgramV3.1 +##reference=1000GenomesPilot-NCBI36 +##contig= +##contig= +##phasing=partial +##INFO= +##INFO= +##INFO= +##INFO= +##INFO= +##INFO= +##INFO= +##INFO= +##FILTER= +##FILTER= +##FORMAT= +##FORMAT= +##FORMAT= +##FORMAT= +##ALT= +##ALT= +#CHROM POS ID REF ALT QUAL FILTER INFO FORMAT NA00001 NA00002 NA00003 +19 111 . A C 9.6 . . GT:HQ 0|0:10,10 0|0:10,10 0/1:3,3 +19 112 . A G 10 . . GT:HQ 0|0:10,10 0|0:10,10 0/1:3,3 +20 14370 rs6054257 G A 29 PASS NS=3;DP=14;AF=0.5;DB;H2 GT:GQ:DP:HQ 0|0:48:1:51,51 1|0:48:8:51,51 1/1:43:5:.,. +20 17330 . T A 3 q10 NS=3;DP=11;AF=0.017 GT:GQ:DP:HQ 0|0:49:3:58,50 0|1:3:5:65,3 0/0:41:3:.,. +20 1110696 rs6040355 A G,T 67 PASS NS=2;DP=10;AF=0.333,0.667;AA=T;DB GT:GQ:DP:HQ 1|2:21:6:23,27 2|1:2:0:18,2 2/2:35:4:.,. +20 1230237 . T . 47 PASS NS=3;DP=13;AA=T GT:GQ:DP:HQ 0|0:54:.:56,60 0|0:48:4:51,51 0/0:61:2:.,. +20 1234567 microsat1 G GA,GAC 50 PASS NS=3;DP=9;AA=G;AN=6;AC=3,1 GT:GQ:DP 0/1:.:4 0/2:17:2 1/1:40:3 +20 1235237 . T . . . . GT 0/0 0|0 ./. +EOF + +cat < "$TMPDIR/expected_bcf.vcf" +##fileformat=VCFv4.0 +##FILTER= +##fileDate=20090805 +##source=myImputationProgramV3.1 +##reference=1000GenomesPilot-NCBI36 +##contig= +##contig= +##phasing=partial +##INFO= +##INFO= +##INFO= +##INFO= +##INFO= +##INFO= +##INFO= +##INFO= +##FILTER= +##FILTER= +##FORMAT= +##FORMAT= +##FORMAT= +##FORMAT= +##ALT= +##ALT= +##bcftools_viewVersion=1.16+htslib-1.16 +##bcftools_viewCommand=view -O b -o example.bcf example.vcf.gz; Date=Mon Aug 26 13:00:22 2024 +#CHROM POS ID REF ALT QUAL FILTER INFO FORMAT NA00001 NA00002 NA00003 +19 111 . A C 9.6 . . GT:HQ 0|0:10,10 0|0:10,10 0/1:3,3 +19 112 . A G 10 . . GT:HQ 0|0:10,10 0|0:10,10 0/1:3,3 +20 14370 rs6054257 G A 29 PASS NS=3;DP=14;AF=0.5;DB;H2 GT:GQ:DP:HQ 0|0:48:1:51,51 1|0:48:8:51,51 1/1:43:5:.,. +20 17330 . T A 3 q10 NS=3;DP=11;AF=0.017 GT:GQ:DP:HQ 0|0:49:3:58,50 0|1:3:5:65,3 0/0:41:3:.,. +20 1110696 rs6040355 A G,T 67 PASS NS=2;DP=10;AF=0.333,0.667;AA=T;DB GT:GQ:DP:HQ 1|2:21:6:23,27 2|1:2:0:18,2 2/2:35:4:.,. +20 1230237 . T . 47 PASS NS=3;DP=13;AA=T GT:GQ:DP:HQ 0|0:54:.:56,60 0|0:48:4:51,51 0/0:61:2:.,. +20 1234567 microsat1 G GA,GAC 50 PASS NS=3;DP=9;AA=G;AN=6;AC=3,1 GT:GQ:DP 0/1:.:4 0/2:17:2 1/1:40:3 +20 1235237 . T . . . . GT 0/0 0|0 ./. +EOF + + +# Test 1: Default Use +mkdir "$TMPDIR/test1" && pushd "$TMPDIR/test1" > /dev/null + +echo "> Run bcftools_sort on VCF file" +"$meta_executable" \ + --input "../example.vcf" \ + --output "output.vcf" \ + --output_type "v" \ + &> /dev/null + +# checks +assert_file_exists "output.vcf" +assert_file_not_empty "output.vcf" +assert_identical_content "output.vcf" "../expected_output.vcf" +echo "- test1 succeeded -" + +popd > /dev/null + +# Test 2: BCF file input +mkdir "$TMPDIR/test2" && pushd "$TMPDIR/test2" > /dev/null + +echo "> Run bcftools_sort on BCF file" +"$meta_executable" \ + --input "${test_data}/example.bcf" \ + --output "output.vcf" \ + --output_type "v" \ + &> /dev/null + +# checks +assert_file_exists "output.vcf" +assert_file_not_empty "output.vcf" +assert_identical_content "output.vcf" "../expected_bcf.vcf" +echo "- test2 succeeded -" + +popd > /dev/null + +echo "---- All tests succeeded! 
----" +exit 0 diff --git a/src/bcftools/bcftools_sort/test_data/example.bcf b/src/bcftools/bcftools_sort/test_data/example.bcf new file mode 100644 index 00000000..d78ae010 Binary files /dev/null and b/src/bcftools/bcftools_sort/test_data/example.bcf differ diff --git a/src/bedtools/bedtools_bed12tobed6/config.vsh.yaml b/src/bedtools/bedtools_bed12tobed6/config.vsh.yaml new file mode 100644 index 00000000..8dd6328c --- /dev/null +++ b/src/bedtools/bedtools_bed12tobed6/config.vsh.yaml @@ -0,0 +1,67 @@ +name: bedtools_bed12tobed6 +namespace: bedtools +description: | + Converts BED features in BED12 (a.k.a. “blocked” BED features such as genes) to discrete BED6 features. + For example, in the case of a gene with six exons, bed12ToBed6 would create six separate BED6 features (i.e., one for each exon). +keywords: [Converts, BED12, BED6] +links: + documentation: https://bedtools.readthedocs.io/en/latest/content/tools/bed12tobed6.html + repository: https://github.com/arq5x/bedtools2 + homepage: https://bedtools.readthedocs.io/en/latest/# + issue_tracker: https://github.com/arq5x/bedtools2/issues +references: + doi: 10.1093/bioinformatics/btq033 +license: MIT +requirements: + commands: [bedtools] +authors: + - __merge__: /src/_authors/theodoro_gasperin.yaml + roles: [ author, maintainer ] + +argument_groups: + + - name: Inputs + arguments: + - name: --input + alternatives: -i + type: file + description: Input BED12 file. + required: true + + - name: Outputs + arguments: + - name: --output + alternatives: -o + type: file + direction: output + description: Output BED6 file to be written. + + - name: Options + arguments: + - name: --n_score + alternatives: -n + type: boolean_true + description: | + Force the score to be the (1-based) block number from the BED12. + +resources: + - type: bash_script + path: script.sh + +test_resources: + - type: bash_script + path: test.sh + +engines: + - type: docker + image: debian:stable-slim + setup: + - type: apt + packages: [bedtools, procps] + - type: docker + run: | + echo "bedtools: \"$(bedtools --version | sed -n 's/^bedtools //p')\"" > /var/software_versions.txt + +runners: + - type: executable + - type: nextflow diff --git a/src/bedtools/bedtools_bed12tobed6/help.txt b/src/bedtools/bedtools_bed12tobed6/help.txt new file mode 100644 index 00000000..17af6983 --- /dev/null +++ b/src/bedtools/bedtools_bed12tobed6/help.txt @@ -0,0 +1,13 @@ +``` +bedtools bed12tobed6 -h +``` + +Tool: bedtools bed12tobed6 (aka bed12ToBed6) +Version: v2.30.0 +Summary: Splits BED12 features into discrete BED6 features. + +Usage: bedtools bed12tobed6 [OPTIONS] -i + +Options: + -n Force the score to be the (1-based) block number from the BED12. 
+ diff --git a/src/bedtools/bedtools_bed12tobed6/script.sh b/src/bedtools/bedtools_bed12tobed6/script.sh new file mode 100644 index 00000000..bbfaddc6 --- /dev/null +++ b/src/bedtools/bedtools_bed12tobed6/script.sh @@ -0,0 +1,15 @@ +#!/bin/bash + +## VIASH START +## VIASH END + +set -eo pipefail + +# Unset parameters +[[ "$par_n_score" == "false" ]] && unset par_n_score + +# Execute bedtools bed12tobed6 conversion +bedtools bed12tobed6 \ + ${par_n_score:+-n} \ + -i "$par_input" \ + > "$par_output" diff --git a/src/bedtools/bedtools_bed12tobed6/test.sh b/src/bedtools/bedtools_bed12tobed6/test.sh new file mode 100644 index 00000000..2ef596d9 --- /dev/null +++ b/src/bedtools/bedtools_bed12tobed6/test.sh @@ -0,0 +1,85 @@ +#!/bin/bash + +# exit on error +set -eo pipefail + +############################################# +# helper functions +assert_file_exists() { + [ -f "$1" ] || { echo "File '$1' does not exist" && exit 1; } +} +assert_file_not_empty() { + [ -s "$1" ] || { echo "File '$1' is empty but shouldn't be" && exit 1; } +} +assert_file_contains() { + grep -q "$2" "$1" || { echo "File '$1' does not contain '$2'" && exit 1; } +} +assert_identical_content() { + diff -a "$2" "$1" \ + || (echo "Files are not identical!" && exit 1) +} +############################################# + +# Create directories for tests +echo "Creating Test Data..." +TMPDIR=$(mktemp -d "$meta_temp_dir/XXXXXX") +function clean_up { + [[ -d "$TMPDIR" ]] && rm -r "$TMPDIR" +} +trap clean_up EXIT + +# Create example BED12 file +cat < "$TMPDIR/example.bed12" +chr21 10079666 10120808 uc002yiv.1 0 - 10081686 1 0 1 2 0 6 0 8 0 4 528,91,101,215, 0,1930,39750,40927, +chr21 10080031 10081687 uc002yiw.1 0 - 10080031 1 0 0 8 0 0 3 1 0 2 200,91, 0,1565, +chr21 10081660 10120796 uc002yix.2 0 - 10081660 1 0 0 8 1 6 6 0 0 3 27,101,223, 0,37756,38913, +EOF + +# Expected output bed6 file +cat < "$TMPDIR/expected.bed6" +chr21 10079666 10120808 uc002yiv.1 0 - +chr21 10080031 10081687 uc002yiw.1 0 - +chr21 10081660 10120796 uc002yix.2 0 - +EOF +# Expected output bed6 file with -n option +cat < "$TMPDIR/expected_n.bed6" +chr21 10079666 10120808 uc002yiv.1 1 - +chr21 10080031 10081687 uc002yiw.1 1 - +chr21 10081660 10120796 uc002yix.2 1 - +EOF + +# Test 1: Default conversion BED12 to BED6 +mkdir "$TMPDIR/test1" && pushd "$TMPDIR/test1" > /dev/null + +echo "> Run bedtools_bed12tobed6 on BED12 file" +"$meta_executable" \ + --input "../example.bed12" \ + --output "output.bed6" + +# checks +assert_file_exists "output.bed6" +assert_file_not_empty "output.bed6" +assert_identical_content "output.bed6" "../expected.bed6" +echo "- test1 succeeded -" + +popd > /dev/null + +# Test 2: Conversion BED12 to BED6 with -n option +mkdir "$TMPDIR/test2" && pushd "$TMPDIR/test2" > /dev/null + +echo "> Run bedtools_bed12tobed6 on BED12 file with -n option" +"$meta_executable" \ + --input "../example.bed12" \ + --output "output.bed6" \ + --n_score + +# checks +assert_file_exists "output.bed6" +assert_file_not_empty "output.bed6" +assert_identical_content "output.bed6" "../expected_n.bed6" +echo "- test2 succeeded -" + +popd > /dev/null + +echo "---- All tests succeeded! ----" +exit 0 diff --git a/src/bedtools/bedtools_groupby/config.vsh.yaml b/src/bedtools/bedtools_groupby/config.vsh.yaml new file mode 100644 index 00000000..89c4845b --- /dev/null +++ b/src/bedtools/bedtools_groupby/config.vsh.yaml @@ -0,0 +1,155 @@ +name: bedtools_groupby +namespace: bedtools +description: | + Summarizes a dataset column based upon common column groupings. 
+ Akin to the SQL "group by" command. +keywords: [groupby, BED] +links: + documentation: https://bedtools.readthedocs.io/en/latest/content/tools/groupby.html + repository: https://github.com/arq5x/bedtools2 + homepage: https://bedtools.readthedocs.io/en/latest/# + issue_tracker: https://github.com/arq5x/bedtools2/issues +references: + doi: 10.1093/bioinformatics/btq033 +license: MIT +requirements: + commands: [bedtools] +authors: + - __merge__: /src/_authors/theodoro_gasperin.yaml + roles: [ author, maintainer ] + +argument_groups: + - name: Inputs + arguments: + - name: --input + alternatives: -i + type: file + direction: input + description: | + The input BED file to be used. + required: true + example: input_a.bed + + - name: Outputs + arguments: + - name: --output + type: file + direction: output + description: | + The output groupby BED file. + required: true + example: output.bed + + - name: Options + arguments: + - name: --groupby + alternatives: [-g, -grp] + type: string + description: | + Specify the columns (1-based) for the grouping. + The columns must be comma separated. + - Default: 1,2,3 + required: true + + - name: --column + alternatives: [-c, -opCols] + type: integer + description: | + Specify the column (1-based) that should be summarized. + required: true + + - name: --operation + alternatives: [-o, -ops] + type: string + description: | + Specify the operation that should be applied to opCol. + Valid operations: + sum, count, count_distinct, min, max, + mean, median, mode, antimode, + stdev, sstdev (sample standard dev.), + collapse (i.e., print a comma separated list (duplicates allowed)), + distinct (i.e., print a comma separated list (NO duplicates allowed)), + distinct_sort_num (as distinct, but sorted numerically, ascending), + distinct_sort_num_desc (as distinct, but sorted numerically, descending), + concat (i.e., merge values into a single, non-delimited string), + freqdesc (i.e., print desc. list of values:freq) + freqasc (i.e., print asc. list of values:freq) + first (i.e., print first value) + last (i.e., print last value) + + Default value: sum + + If there is only column, but multiple operations, all operations will be + applied on that column. Likewise, if there is only one operation, but + multiple columns, that operation will be applied to all columns. + Otherwise, the number of columns must match the the number of operations, + and will be applied in respective order. + E.g., "-c 5,4,6 -o sum,mean,count" will give the sum of column 5, + the mean of column 4, and the count of column 6. + The order of output columns will match the ordering given in the command. + + - name: --full + type: boolean_true + description: | + Print all columns from input file. The first line in the group is used. + Default: print only grouped columns. + + - name: --inheader + type: boolean_true + description: | + Input file has a header line - the first line will be ignored. + + - name: --outheader + type: boolean_true + description: | + Print header line in the output, detailing the column names. + If the input file has headers (-inheader), the output file + will use the input's column names. + If the input file has no headers, the output file + will use "col_1", "col_2", etc. as the column names. + + - name: --header + type: boolean_true + description: same as '-inheader -outheader'. + + - name: --ignorecase + type: boolean_true + description: | + Group values regardless of upper/lower case. 
+ + - name: --precision + alternatives: -prec + type: integer + description: | + Sets the decimal precision for output. + default: 5 + + - name: --delimiter + alternatives: -delim + type: string + description: | + Specify a custom delimiter for the collapse operations. + example: "|" + default: "," + +resources: + - type: bash_script + path: script.sh + +test_resources: + - type: bash_script + path: test.sh + +engines: + - type: docker + image: debian:stable-slim + setup: + - type: apt + packages: [bedtools, procps] + - type: docker + run: | + echo "bedtools: \"$(bedtools --version | sed -n 's/^bedtools //p')\"" > /var/software_versions.txt + +runners: + - type: executable + - type: nextflow diff --git a/src/bedtools/bedtools_groupby/help.txt b/src/bedtools/bedtools_groupby/help.txt new file mode 100644 index 00000000..a631b4b1 --- /dev/null +++ b/src/bedtools/bedtools_groupby/help.txt @@ -0,0 +1,93 @@ +```bash +bedtools groupby +``` + +Tool: bedtools groupby +Version: v2.30.0 +Summary: Summarizes a dataset column based upon + common column groupings. Akin to the SQL "group by" command. + +Usage: bedtools groupby -g [group_column(s)] -c [op_column(s)] -o [ops] + cat [FILE] | bedtools groupby -g [group_column(s)] -c [op_column(s)] -o [ops] + +Options: + -i Input file. Assumes "stdin" if omitted. + + -g -grp Specify the columns (1-based) for the grouping. + The columns must be comma separated. + - Default: 1,2,3 + + -c -opCols Specify the column (1-based) that should be summarized. + - Required. + + -o -ops Specify the operation that should be applied to opCol. + Valid operations: + sum, count, count_distinct, min, max, + mean, median, mode, antimode, + stdev, sstdev (sample standard dev.), + collapse (i.e., print a comma separated list (duplicates allowed)), + distinct (i.e., print a comma separated list (NO duplicates allowed)), + distinct_sort_num (as distinct, but sorted numerically, ascending), + distinct_sort_num_desc (as distinct, but sorted numerically, descending), + concat (i.e., merge values into a single, non-delimited string), + freqdesc (i.e., print desc. list of values:freq) + freqasc (i.e., print asc. list of values:freq) + first (i.e., print first value) + last (i.e., print last value) + - Default: sum + + If there is only column, but multiple operations, all operations will be + applied on that column. Likewise, if there is only one operation, but + multiple columns, that operation will be applied to all columns. + Otherwise, the number of columns must match the the number of operations, + and will be applied in respective order. + E.g., "-c 5,4,6 -o sum,mean,count" will give the sum of column 5, + the mean of column 4, and the count of column 6. + The order of output columns will match the ordering given in the command. + + + -full Print all columns from input file. The first line in the group is used. + Default: print only grouped columns. + + -inheader Input file has a header line - the first line will be ignored. + + -outheader Print header line in the output, detailing the column names. + If the input file has headers (-inheader), the output file + will use the input's column names. + If the input file has no headers, the output file + will use "col_1", "col_2", etc. as the column names. + + -header same as '-inheader -outheader' + + -ignorecase Group values regardless of upper/lower case. + + -prec Sets the decimal precision for output (Default: 5) + + -delim Specify a custom delimiter for the collapse operations. + - Example: -delim "|" + - Default: ",". 
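For comparison, an invocation of the Viash component defined above might look like this (a sketch; the flag names come from the config, the file names are hypothetical):

```bash
# Group ex1.bed on columns 1-4 and report the maximum of column 9 per group:
bedtools_groupby \
  --input ex1.bed \
  --groupby 1,2,3,4 \
  --column 9 \
  --operation max \
  --output grouped.bed
```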
+ +Examples: + $ cat ex1.out + chr1 10 20 A chr1 15 25 B.1 1000 ATAT + chr1 10 20 A chr1 25 35 B.2 10000 CGCG + + $ groupBy -i ex1.out -g 1,2,3,4 -c 9 -o sum + chr1 10 20 A 11000 + + $ groupBy -i ex1.out -grp 1,2,3,4 -opCols 9,9 -ops sum,max + chr1 10 20 A 11000 10000 + + $ groupBy -i ex1.out -g 1,2,3,4 -c 8,9 -o collapse,mean + chr1 10 20 A B.1,B.2, 5500 + + $ cat ex1.out | groupBy -g 1,2,3,4 -c 8,9 -o collapse,mean + chr1 10 20 A B.1,B.2, 5500 + + $ cat ex1.out | groupBy -g 1,2,3,4 -c 10 -o concat + chr1 10 20 A ATATCGCG + +Notes: + (1) The input file/stream should be sorted/grouped by the -grp. columns + (2) If -i is unspecified, input is assumed to come from stdin. + diff --git a/src/bedtools/bedtools_groupby/script.sh b/src/bedtools/bedtools_groupby/script.sh new file mode 100644 index 00000000..b8a40cdc --- /dev/null +++ b/src/bedtools/bedtools_groupby/script.sh @@ -0,0 +1,36 @@ +#!/bin/bash + +## VIASH START +## VIASH END + +# Exit on error +set -eo pipefail + +# Unset parameters +unset_if_false=( + par_full + par_inheader + par_outheader + par_header + par_ignorecase +) + +for par in ${unset_if_false[@]}; do + test_val="${!par}" + [[ "$test_val" == "false" ]] && unset $par +done + +bedtools groupby \ + ${par_full:+-full} \ + ${par_inheader:+-inheader} \ + ${par_outheader:+-outheader} \ + ${par_header:+-header} \ + ${par_ignorecase:+-ignorecase} \ + ${par_precision:+-prec "$par_precision"} \ + ${par_delimiter:+-delim "$par_delimiter"} \ + -i "$par_input" \ + -g "$par_groupby" \ + -c "$par_column" \ + ${par_operation:+-o "$par_operation"} \ + > "$par_output" + \ No newline at end of file diff --git a/src/bedtools/bedtools_groupby/test.sh b/src/bedtools/bedtools_groupby/test.sh new file mode 100644 index 00000000..ce99a1ec --- /dev/null +++ b/src/bedtools/bedtools_groupby/test.sh @@ -0,0 +1,198 @@ +#!/bin/bash + +# exit on error +set -eo pipefail + +## VIASH START +meta_executable="target/executable/bedtools/bedtools_groupby/bedtools_groupby" +meta_resources_dir="src/bedtools/bedtools_groupby" +## VIASH END + +############################################# +# helper functions +assert_file_exists() { + [ -f "$1" ] || { echo "File '$1' does not exist" && exit 1; } +} +assert_file_not_empty() { + [ -s "$1" ] || { echo "File '$1' is empty but shouldn't be" && exit 1; } +} +assert_file_contains() { + grep -q "$2" "$1" || { echo "File '$1' does not contain '$2'" && exit 1; } +} +assert_identical_content() { + diff -a "$2" "$1" \ + || (echo "Files are not identical!" && exit 1) +} +############################################# + +# Create directories for tests +echo "Creating Test Data..." 
+TMPDIR=$(mktemp -d "$meta_temp_dir/XXXXXX") +function clean_up { + [[ -d "$TMPDIR" ]] && rm -r "$TMPDIR" +} +trap clean_up EXIT + +# Create and populate example.bed +cat << EOF > $TMPDIR/example.bed +# Header +chr21 9719758 9729320 variant1 chr21 9719768 9721892 ALR/Alpha 1004 + +chr21 9719758 9729320 variant1 chr21 9721905 9725582 ALR/Alpha 1010 + +chr21 9719758 9729320 variant1 chr21 9725582 9725977 L1PA3 3288 + +chr21 9719758 9729320 variant1 chr21 9726021 9729309 ALR/Alpha 1051 + +chr21 9729310 9757478 variant2 chr21 9729320 9729809 L1PA3 3897 - +chr21 9729310 9757478 variant2 chr21 9729809 9730866 L1P1 8367 + +chr21 9729310 9757478 variant2 chr21 9730866 9734026 ALR/Alpha 1036 - +chr21 9729310 9757478 variant2 chr21 9734037 9757471 ALR/Alpha 1182 - +chr21 9795588 9796685 variant3 chr21 9795589 9795713 (GAATG)n 308 + +chr21 9795588 9796685 variant3 chr21 9795736 9795894 (GAATG)n 683 + +chr21 9795588 9796685 variant3 chr21 9795911 9796007 (GAATG)n 345 + +chr21 9795588 9796685 variant3 chr21 9796028 9796187 (GAATG)n 756 + +chr21 9795588 9796685 variant3 chr21 9796202 9796615 (GAATG)n 891 + +chr21 9795588 9796685 variant3 chr21 9796637 9796824 (GAATG)n 621 + +EOF + +# Create and populate expected output files for different tests +cat << EOF > $TMPDIR/expected.bed +chr21 9719758 9729320 6353 +chr21 9729310 9757478 14482 +chr21 9795588 9796685 3604 +EOF +cat << EOF > $TMPDIR/expected_max.bed +chr21 9719758 9729320 variant1 3288 +chr21 9729310 9757478 variant2 8367 +chr21 9795588 9796685 variant3 891 +EOF +cat << EOF > $TMPDIR/expected_full.bed +chr21 9719758 9729320 variant1 chr21 9719768 9721892 ALR/Alpha 1004 + 6353 +chr21 9729310 9757478 variant2 chr21 9729320 9729809 L1PA3 3897 - 14482 +chr21 9795588 9796685 variant3 chr21 9795589 9795713 (GAATG)n 308 + 3604 +EOF +cat << EOF > $TMPDIR/expected_delimited.bed +chr21 9719758 9729320 variant1 1004;1010;3288;1051 +chr21 9729310 9757478 variant2 3897;8367;1036;1182 +chr21 9795588 9796685 variant3 308;683;345;756;891;621 +EOF +cat << EOF > $TMPDIR/expected_precision.bed +chr21 9719758 9729320 variant1 1.6e+03 +chr21 9729310 9757478 variant2 3.6e+03 +chr21 9795588 9796685 variant3 6e+02 +EOF + +# Test 1: without operation option, default operation is sum +mkdir "$TMPDIR/test1" && pushd "$TMPDIR/test1" > /dev/null + +echo "> Run bedtools groupby on BED file" +"$meta_executable" \ + --input "../example.bed" \ + --groupby "1,2,3" \ + --column "9" \ + --output "output.bed" + +# checks +assert_file_exists "output.bed" +assert_file_not_empty "output.bed" +assert_identical_content "output.bed" "../expected.bed" +echo "- test1 succeeded -" + +popd > /dev/null + +# Test 2: with operation max option +mkdir "$TMPDIR/test2" && pushd "$TMPDIR/test2" > /dev/null + +echo "> Run bedtools groupby on BED file with max operation" +"$meta_executable" \ + --input "../example.bed" \ + --groupby "1-4" \ + --column "9" \ + --operation "max" \ + --output "output.bed" + +# checks +assert_file_exists "output.bed" +assert_file_not_empty "output.bed" +assert_identical_content "output.bed" "../expected_max.bed" +echo "- test2 succeeded -" + +popd > /dev/null + +# Test 3: full option +mkdir "$TMPDIR/test3" && pushd "$TMPDIR/test3" > /dev/null + +echo "> Run bedtools groupby on BED file with full option" +"$meta_executable" \ + --input "../example.bed" \ + --groupby "1-4" \ + --column "9" \ + --full \ + --output "output.bed" + +# checks +assert_file_exists "output.bed" +assert_file_not_empty "output.bed" +assert_identical_content "output.bed" "../expected_full.bed" +echo "- 
test3 succeeded -" + +popd > /dev/null + +# Test 4: header option +mkdir "$TMPDIR/test4" && pushd "$TMPDIR/test4" > /dev/null + +echo "> Run bedtools groupby on BED file with header option" +"$meta_executable" \ + --input "../example.bed" \ + --groupby "1-4" \ + --column "9" \ + --header \ + --output "output.bed" + +# checks +assert_file_exists "output.bed" +assert_file_not_empty "output.bed" +assert_file_contains "output.bed" "# Header" +echo "- test4 succeeded -" + +popd > /dev/null + +# Test 5: Delimiter and collapse +mkdir "$TMPDIR/test5" && pushd "$TMPDIR/test5" > /dev/null + +echo "> Run bedtools groupby on BED file with delimiter and collapse options" +"$meta_executable" \ + --input "../example.bed" \ + --groupby "1-4" \ + --column "9" \ + --operation "collapse" \ + --delimiter ";" \ + --output "output.bed" + +# checks +assert_file_exists "output.bed" +assert_file_not_empty "output.bed" +assert_identical_content "output.bed" "../expected_delimited.bed" +echo "- test5 succeeded -" + +popd > /dev/null + +# Test 6: precision option +mkdir "$TMPDIR/test6" && pushd "$TMPDIR/test6" > /dev/null + +echo "> Run bedtools groupby on BED file with precision option" +"$meta_executable" \ + --input "../example.bed" \ + --groupby "1-4" \ + --column "9" \ + --operation "mean" \ + --precision 2 \ + --output "output.bed" + +# checks +assert_file_exists "output.bed" +assert_file_not_empty "output.bed" +assert_identical_content "output.bed" "../expected_precision.bed" +echo "- test6 succeeded -" + +popd > /dev/null + +echo "---- All tests succeeded! ----" +exit 0 diff --git a/src/bedtools/bedtools_links/config.vsh.yaml b/src/bedtools/bedtools_links/config.vsh.yaml new file mode 100644 index 00000000..b4e43cd3 --- /dev/null +++ b/src/bedtools/bedtools_links/config.vsh.yaml @@ -0,0 +1,91 @@ +name: bedtools_links +namespace: bedtools +description: | + Creates an HTML file with links to an instance of the UCSC Genome Browser for all features / intervals in a file. + This is useful for cases when one wants to manually inspect through a large set of annotations or features. +keywords: [Links, BED, GFF, VCF] +links: + documentation: https://bedtools.readthedocs.io/en/latest/content/tools/links.html + repository: https://github.com/arq5x/bedtools2 + homepage: https://bedtools.readthedocs.io/en/latest/# + issue_tracker: https://github.com/arq5x/bedtools2/issues +references: + doi: 10.1093/bioinformatics/btq033 +license: MIT +requirements: + commands: [bedtools] +authors: + - __merge__: /src/_authors/theodoro_gasperin.yaml + roles: [ author, maintainer ] + +argument_groups: + - name: Inputs + arguments: + - name: --input + alternatives: -i + type: file + description: Input file (bed/gff/vcf). + required: true + + - name: Outputs + arguments: + - name: --output + alternatives: -o + type: file + direction: output + description: Output HTML file to be written. + + - name: Options + description: | + By default, the links created will point to human (hg18) UCSC browser. + If you have a local mirror, you can override this behavior by supplying + the -base, -org, and -db options. + + For example, if the URL of your local mirror for mouse MM9 is called: + http://mymirror.myuniversity.edu, then you would use the following: + --base_url http://mymirror.myuniversity.edu + --organism mouse + --database mm9 + arguments: + - name: --base_url + alternatives: -base + type: string + description: | + The “basename” for the UCSC browser. 
+ default: http://genome.ucsc.edu + + - name: --organism + alternatives: -org + type: string + description: | + The organism (e.g. mouse, human). + default: human + + - name: --database + alternatives: -db + type: string + description: | + The genome build. + default: hg18 + +resources: + - type: bash_script + path: script.sh + +test_resources: + - type: bash_script + path: test.sh + +engines: + - type: docker + image: debian:stable-slim + setup: + - type: apt + packages: [bedtools, procps] + - type: docker + run: | + echo "bedtools: \"$(bedtools --version | sed -n 's/^bedtools //p')\"" > /var/software_versions.txt + +runners: + - type: executable + - type: nextflow diff --git a/src/bedtools/bedtools_links/help.txt b/src/bedtools/bedtools_links/help.txt new file mode 100644 index 00000000..d848d989 --- /dev/null +++ b/src/bedtools/bedtools_links/help.txt @@ -0,0 +1,25 @@ +``` +bedtools links -h +``` + +Tool: bedtools links (aka linksBed) +Version: v2.30.0 +Summary: Creates HTML links to an UCSC Genome Browser from a feature file. + +Usage: bedtools links [OPTIONS] -i > out.html + +Options: + -base The browser basename. Default: http://genome.ucsc.edu + -org The organism. Default: human + -db The build. Default: hg18 + +Example: + By default, the links created will point to human (hg18) UCSC browser. + If you have a local mirror, you can override this behavior by supplying + the -base, -org, and -db options. + + For example, if the URL of your local mirror for mouse MM9 is called: + http://mymirror.myuniversity.edu, then you would use the following: + -base http://mymirror.myuniversity.edu + -org mouse + -db mm9 diff --git a/src/bedtools/bedtools_links/script.sh b/src/bedtools/bedtools_links/script.sh new file mode 100644 index 00000000..b8ee9a56 --- /dev/null +++ b/src/bedtools/bedtools_links/script.sh @@ -0,0 +1,14 @@ +#!/bin/bash + +## VIASH START +## VIASH END + +set -eo pipefail + +# Execute bedtools links +bedtools links \ + ${par_base_url:+-base "$par_base_url"} \ + ${par_organism:+-org "$par_organism"} \ + ${par_database:+-db "$par_database"} \ + -i "$par_input" \ + > "$par_output" diff --git a/src/bedtools/bedtools_links/test.sh b/src/bedtools/bedtools_links/test.sh new file mode 100644 index 00000000..d79cbd6c --- /dev/null +++ b/src/bedtools/bedtools_links/test.sh @@ -0,0 +1,98 @@ +#!/bin/bash + +# exit on error +set -eo pipefail + +############################################# +# helper functions +assert_file_exists() { + [ -f "$1" ] || { echo "File '$1' does not exist" && exit 1; } +} +assert_file_not_empty() { + [ -s "$1" ] || { echo "File '$1' is empty but shouldn't be" && exit 1; } +} +assert_file_contains() { + grep -q "$2" "$1" || { echo "File '$1' does not contain '$2'" && exit 1; } +} +assert_identical_content() { + diff -a "$2" "$1" \ + || (echo "Files are not identical!" && exit 1) +} +############################################# + +# Create directories for tests +echo "Creating Test Data..." 
+TMPDIR=$(mktemp -d "$meta_temp_dir/XXXXXX") +function clean_up { + [[ -d "$TMPDIR" ]] && rm -r "$TMPDIR" +} +trap clean_up EXIT + +# Create test data +cat < "$TMPDIR/genes.bed" +chr21 9928613 10012791 uc002yip.1 0 - +chr21 9928613 10012791 uc002yiq.1 0 - +chr21 9928613 10012791 uc002yir.1 0 - +chr21 9928613 10012791 uc010gkv.1 0 - +chr21 9928613 10061300 uc002yis.1 0 - +chr21 10042683 10120796 uc002yit.1 0 - +chr21 10042683 10120808 uc002yiu.1 0 - +chr21 10079666 10120808 uc002yiv.1 0 - +chr21 10080031 10081687 uc002yiw.1 0 - +chr21 10081660 10120796 uc002yix.2 0 - +EOF + +# Test 1: Default Use +mkdir "$TMPDIR/test1" && pushd "$TMPDIR/test1" > /dev/null + +echo "> Run bedtools_links on BED file" +"$meta_executable" \ + --input "../genes.bed" \ + --output "genes.html" + +# checks +assert_file_exists "genes.html" +assert_file_not_empty "genes.html" +assert_file_contains "genes.html" "uc002yip.1" +echo "- test1 succeeded -" + +popd > /dev/null + +# Test 2: Base URL +mkdir "$TMPDIR/test2" && pushd "$TMPDIR/test2" > /dev/null + +echo "> Run bedtools_links with base option" +"$meta_executable" \ + --input "../genes.bed" \ + --output "genes.html" \ + --base_url "http://genome.ucsc.edu" + +# checks +assert_file_exists "genes.html" +assert_file_not_empty "genes.html" +assert_file_contains "genes.html" "uc002yip.1" +echo "- test2 succeeded -" + +popd > /dev/null + +# Test 3: Organism and Genome Database Build +mkdir "$TMPDIR/test3" && pushd "$TMPDIR/test3" > /dev/null + +echo "> Run bedtools_links with organism option and genome database build" +"$meta_executable" \ + --input "../genes.bed" \ + --output "genes.html" \ + --base_url "http://genome.ucsc.edu" \ + --organism "mouse" \ + --database "mm9" + +# checks +assert_file_exists "genes.html" +assert_file_not_empty "genes.html" +assert_file_contains "genes.html" "uc002yip.1" +echo "- test3 succeeded -" + +popd > /dev/null + +echo "---- All tests succeeded! ----" +exit 0 diff --git a/src/bedtools/bedtools_merge/config.vsh.yaml b/src/bedtools/bedtools_merge/config.vsh.yaml new file mode 100644 index 00000000..45e4a01d --- /dev/null +++ b/src/bedtools/bedtools_merge/config.vsh.yaml @@ -0,0 +1,160 @@ +name: bedtools_merge +namespace: bedtools +description: | + Merges overlapping BED/GFF/VCF entries into a single interval. +links: + documentation: https://bedtools.readthedocs.io/en/latest/content/tools/merge.html + repository: https://github.com/arq5x/bedtools2 + homepage: https://bedtools.readthedocs.io/en/latest/# + issue_tracker: https://github.com/arq5x/bedtools2/issues +references: + doi: 10.1093/bioinformatics/btq033 +license: MIT +requirements: + commands: [bedtools] +authors: + - __merge__: /src/_authors/theodoro_gasperin.yaml + roles: [ author, maintainer ] + +argument_groups: + - name: Inputs + arguments: + - name: --input + alternatives: -i + type: file + description: Input file (BED/GFF/VCF) to be merged. + required: true + + - name: Outputs + arguments: + - name: --output + type: file + direction: output + description: Output merged file BED to be written. + required: true + + - name: Options + arguments: + - name: --strand + alternatives: -s + type: boolean_true + description: | + Force strandedness. That is, only merge features + that are on the same strand. + - By default, merging is done without respect to strand. + + - name: --specific_strand + alternatives: -S + type: string + choices: ["+", "-"] + description: | + Force merge for one specific strand only. 
+ Follow with + or - to force merge from only + the forward or reverse strand, respectively. + - By default, merging is done without respect to strand. + + - name: --distance + alternatives: -d + type: integer + description: | + Maximum distance between features allowed for features + to be merged. + - Def. 0. That is, overlapping & book-ended features are merged. + - (INTEGER) + - Note: negative values enforce the number of b.p. required for overlap. + + - name: --columns + alternatives: -c + type: integer + description: | + Specify columns from the B file to map onto intervals in A. + Default: 5. + Multiple columns can be specified in a comma-delimited list. + + - name: --operation + alternatives: -o + type: string + description: | + Specify the operation that should be applied to -c. + Valid operations: + sum, min, max, absmin, absmax, + mean, median, mode, antimode + stdev, sstdev + collapse (i.e., print a delimited list (duplicates allowed)), + distinct (i.e., print a delimited list (NO duplicates allowed)), + distinct_sort_num (as distinct, sorted numerically, ascending), + distinct_sort_num_desc (as distinct, sorted numerically, desscending), + distinct_only (delimited list of only unique values), + count + count_distinct (i.e., a count of the unique values in the column), + first (i.e., just the first value in the column), + last (i.e., just the last value in the column), + Default: sum + Multiple operations can be specified in a comma-delimited list. + + If there is only column, but multiple operations, all operations will be + applied on that column. Likewise, if there is only one operation, but + multiple columns, that operation will be applied to all columns. + Otherwise, the number of columns must match the the number of operations, + and will be applied in respective order. + E.g., "-c 5,4,6 -o sum,mean,count" will give the sum of column 5, + the mean of column 4, and the count of column 6. + The order of output columns will match the ordering given in the command. + + - name: --delimiter + alternatives: -delim + type: string + description: | + Specify a custom delimiter for the collapse operations. + example: "|" + default: "," + + - name: --precision + alternatives: -prec + type: integer + description: | + Sets the decimal precision for output (Default: 5). + + - name: --bed + type: boolean_true + description: | + If using BAM input, write output as BED. + + - name: --header + type: boolean_true + description: | + Print the header from the A file prior to results. + + - name: --no_buffer + alternatives: -nobuf + type: boolean_true + description: | + Disable buffered output. Using this option will cause each line + of output to be printed as it is generated, rather than saved + in a buffer. This will make printing large output files + noticeably slower, but can be useful in conjunction with + other software tools and scripts that need to process one + line of bedtools output at a time. 
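For illustration, a minimal call to the resulting component could look like the following (a sketch; flag names come from the arguments above, file names are hypothetical):

```bash
# Merge features that are within 100 bp of each other and collapse the
# name column (column 4) into a "|"-delimited list per merged interval:
bedtools_merge \
  --input sorted.bed \
  --distance 100 \
  --columns 4 \
  --operation collapse \
  --delimiter "|" \
  --output merged.bed
```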
+ +resources: + - type: bash_script + path: script.sh + +test_resources: + - type: bash_script + path: test.sh + - path: test_data + +engines: + - type: docker + image: debian:stable-slim + setup: + - type: apt + packages: [bedtools, procps] + - type: docker + run: | + echo "bedtools: \"$(bedtools --version | sed -n 's/^bedtools //p')\"" > /var/software_versions.txt + +runners: + - type: executable + - type: nextflow \ No newline at end of file diff --git a/src/bedtools/bedtools_merge/help.txt b/src/bedtools/bedtools_merge/help.txt new file mode 100644 index 00000000..bc78fc67 --- /dev/null +++ b/src/bedtools/bedtools_merge/help.txt @@ -0,0 +1,85 @@ +```bash +bedtools merge +``` + +Tool: bedtools merge (aka mergeBed) +Version: v2.30.0 +Summary: Merges overlapping BED/GFF/VCF entries into a single interval. + +Usage: bedtools merge [OPTIONS] -i + +Options: + -s Force strandedness. That is, only merge features + that are on the same strand. + - By default, merging is done without respect to strand. + + -S Force merge for one specific strand only. + Follow with + or - to force merge from only + the forward or reverse strand, respectively. + - By default, merging is done without respect to strand. + + -d Maximum distance between features allowed for features + to be merged. + - Def. 0. That is, overlapping & book-ended features are merged. + - (INTEGER) + - Note: negative values enforce the number of b.p. required for overlap. + + -c Specify columns from the B file to map onto intervals in A. + Default: 5. + Multiple columns can be specified in a comma-delimited list. + + -o Specify the operation that should be applied to -c. + Valid operations: + sum, min, max, absmin, absmax, + mean, median, mode, antimode + stdev, sstdev + collapse (i.e., print a delimited list (duplicates allowed)), + distinct (i.e., print a delimited list (NO duplicates allowed)), + distinct_sort_num (as distinct, sorted numerically, ascending), + distinct_sort_num_desc (as distinct, sorted numerically, desscending), + distinct_only (delimited list of only unique values), + count + count_distinct (i.e., a count of the unique values in the column), + first (i.e., just the first value in the column), + last (i.e., just the last value in the column), + Default: sum + Multiple operations can be specified in a comma-delimited list. + + If there is only column, but multiple operations, all operations will be + applied on that column. Likewise, if there is only one operation, but + multiple columns, that operation will be applied to all columns. + Otherwise, the number of columns must match the the number of operations, + and will be applied in respective order. + E.g., "-c 5,4,6 -o sum,mean,count" will give the sum of column 5, + the mean of column 4, and the count of column 6. + The order of output columns will match the ordering given in the command. + + + -delim Specify a custom delimiter for the collapse operations. + - Example: -delim "|" + - Default: ",". + + -prec Sets the decimal precision for output (Default: 5) + + -bed If using BAM input, write output as BED. + + -header Print the header from the A file prior to results. + + -nobuf Disable buffered output. Using this option will cause each line + of output to be printed as it is generated, rather than saved + in a buffer. This will make printing large output files + noticeably slower, but can be useful in conjunction with + other software tools and scripts that need to process one + line of bedtools output at a time. 
+ + -iobuf Specify amount of memory to use for input buffer. + Takes an integer argument. Optional suffixes K/M/G supported. + Note: currently has no effect with compressed files. + +Notes: + (1) The input file (-i) file must be sorted by chrom, then start. + + + + +***** ERROR: No input file given. Exiting. ***** diff --git a/src/bedtools/bedtools_merge/script.sh b/src/bedtools/bedtools_merge/script.sh new file mode 100644 index 00000000..db50dd83 --- /dev/null +++ b/src/bedtools/bedtools_merge/script.sh @@ -0,0 +1,35 @@ +#!/bin/bash + +## VIASH START +## VIASH END + +# Exit on error +set -eo pipefail + +# Unset parameters +unset_if_false=( + par_strand + par_bed + par_header + par_no_buffer +) + +for par in ${unset_if_false[@]}; do + test_val="${!par}" + [[ "$test_val" == "false" ]] && unset $par +done + +# Execute bedtools merge with the provided arguments +bedtools merge \ + ${par_strand:+-s} \ + ${par_specific_strand:+-S "$par_specific_strand"} \ + ${par_bed:+-bed} \ + ${par_header:+-header} \ + ${par_no_buffer:+-nobuf} \ + ${par_distance:+-d "$par_distance"} \ + ${par_columns:+-c "$par_columns"} \ + ${par_operation:+-o "$par_operation"} \ + ${par_delimiter:+-delim "$par_delimiter"} \ + ${par_precision:+-prec "$par_precision"} \ + -i "$par_input" \ + > "$par_output" diff --git a/src/bedtools/bedtools_merge/test.sh b/src/bedtools/bedtools_merge/test.sh new file mode 100644 index 00000000..e2b46c15 --- /dev/null +++ b/src/bedtools/bedtools_merge/test.sh @@ -0,0 +1,222 @@ +#!/bin/bash + +# exit on error +set -eo pipefail + +## VIASH START +meta_executable="target/executable/bedtools/bedtools_sort/bedtools_merge" +meta_resources_dir="src/bedtools/bedtools_merge" +## VIASH END + +# directory of the bam file +test_data="$meta_resources_dir/test_data" + +############################################# +# helper functions +assert_file_exists() { + [ -f "$1" ] || { echo "File '$1' does not exist" && exit 1; } +} +assert_file_not_empty() { + [ -s "$1" ] || { echo "File '$1' is empty but shouldn't be" && exit 1; } +} +assert_file_contains() { + grep -q "$2" "$1" || { echo "File '$1' does not contain '$2'" && exit 1; } +} +assert_identical_content() { + diff -a "$2" "$1" \ + || (echo "Files are not identical!" && exit 1) +} +############################################# + +# Create directories for tests +echo "Creating Test Data..." 
+TMPDIR=$(mktemp -d "$meta_temp_dir/XXXXXX") +function clean_up { + [[ -d "$TMPDIR" ]] && rm -r "$TMPDIR" +} +trap clean_up EXIT + +# Create and populate example files +printf "chr1\t100\t200\nchr1\t150\t250\nchr1\t300\t400\n" > "$TMPDIR/featureA.bed" +printf "chr1\t100\t200\ta1\t1\t+\nchr1\t180\t250\ta2\t2\t+\nchr1\t250\t500\ta3\t3\t-\nchr1\t501\t1000\ta4\t4\t+\n" > "$TMPDIR/featureB.bed" +printf "chr1\t100\t200\ta1\t1.9\t+\nchr1\t180\t250\ta2\t2.5\t+\nchr1\t250\t500\ta3\t3.3\t-\nchr1\t501\t1000\ta4\t4\t+\n" > "$TMPDIR/feature_precision.bed" + +# Create and populate feature.gff file +printf "##gff-version 3\n" > "$TMPDIR/feature.gff" +printf "chr1\t.\tgene\t1000\t2000\t.\t+\t.\tID=gene1;Name=Gene1\n" >> "$TMPDIR/feature.gff" +printf "chr1\t.\texon\t1000\t1200\t.\t+\t.\tID=exon1;Parent=transcript1\n" >> "$TMPDIR/feature.gff" +printf "chr1\t.\tCDS\t1000\t1200\t.\t+\t0\tID=cds1;Parent=transcript1\n" >> "$TMPDIR/feature.gff" +printf "chr1\t.\tCDS\t1500\t1700\t.\t+\t2\tID=cds2;Parent=transcript1\n" >> "$TMPDIR/feature.gff" +printf "chr2\t.\texon\t1500\t1700\t.\t+\t.\tID=exon2;Parent=transcript1\n" >> "$TMPDIR/feature.gff" +printf "chr3\t.\tmRNA\t1000\t2000\t.\t+\t.\tID=transcript1;Parent=gene1\n" >> "$TMPDIR/feature.gff" + +# Create expected output files +printf "chr1\t100\t250\nchr1\t300\t400\n" > "$TMPDIR/expected.bed" +printf "chr1\t100\t250\nchr1\t250\t500\nchr1\t501\t1000\n" > "$TMPDIR/expected_strand.bed" +printf "chr1\t100\t250\nchr1\t501\t1000\n" > "$TMPDIR/expected_specific_strand.bed" +printf "chr1\t128\t228\nchr1\t428\t528\n" > "$TMPDIR/expected_bam.bed" +printf "chr1\t100\t400\n" > "$TMPDIR/expected_distance.bed" +printf "chr1\t100\t500\t2\t1\t3\nchr1\t501\t1000\t4\t4\t4\n" > "$TMPDIR/expected_operation.bed" +printf "chr1\t100\t500\ta1|a2|a3\nchr1\t501\t1000\ta4\n" > "$TMPDIR/expected_delim.bed" +printf "chr1\t100\t500\t2.567\nchr1\t501\t1000\t4\n" > "$TMPDIR/expected_precision.bed" +printf "##gff-version 3\nchr1\t999\t2000\nchr2\t1499\t1700\nchr3\t999\t2000\n" > "$TMPDIR/expected_header.bed" + +# Test 1: Default sort on BED file +mkdir "$TMPDIR/test1" && pushd "$TMPDIR/test1" > /dev/null + +echo "> Run bedtools_merge on BED file" +"$meta_executable" \ + --input "../featureA.bed" \ + --output "output.bed" + +# # checks +assert_file_exists "output.bed" +assert_file_not_empty "output.bed" +assert_identical_content "output.bed" "../expected.bed" +echo "- test1 succeeded -" + +popd > /dev/null + +# Test 2: strand option +mkdir "$TMPDIR/test2" && pushd "$TMPDIR/test2" > /dev/null + +echo "> Run bedtools_merge on BED file with strand option" +"$meta_executable" \ + --input "../featureB.bed" \ + --output "output.bed" \ + --strand + +# checks +assert_file_exists "output.bed" +assert_file_not_empty "output.bed" +assert_identical_content "output.bed" "../expected_strand.bed" +echo "- test2 succeeded -" + +popd > /dev/null + +# Test 3: specific strand option +mkdir "$TMPDIR/test3" && pushd "$TMPDIR/test3" > /dev/null + +echo "> Run bedtools_merge on BED file with specific strand option" +"$meta_executable" \ + --input "../featureB.bed" \ + --output "output.bed" \ + --specific_strand "+" + +# checks +assert_file_exists "output.bed" +assert_file_not_empty "output.bed" +assert_identical_content "output.bed" "../expected_specific_strand.bed" +echo "- test3 succeeded -" + +popd > /dev/null + +# Test 4: BED option +mkdir "$TMPDIR/test4" && pushd "$TMPDIR/test4" > /dev/null + +echo "> Run bedtools_merge on BAM file with BED option" +"$meta_executable" \ + --input "$test_data/feature.bam" \ + 
--output "output.bed" \ + --bed + +# checks +assert_file_exists "output.bed" +assert_file_not_empty "output.bed" +assert_identical_content "output.bed" "../expected_bam.bed" +echo "- test4 succeeded -" + +popd > /dev/null + +# Test 5: distance option +mkdir "$TMPDIR/test5" && pushd "$TMPDIR/test5" > /dev/null + +echo "> Run bedtools_merge on BED file with distance option" +"$meta_executable" \ + --input "../featureA.bed" \ + --output "output.bed" \ + --distance -5 + +# checks +assert_file_exists "output.bed" +assert_file_not_empty "output.bed" +assert_identical_content "output.bed" "../expected.bed" +echo "- test5 succeeded -" + +popd > /dev/null + +# Test 6: columns option & operation option +mkdir "$TMPDIR/test6" && pushd "$TMPDIR/test6" > /dev/null + +echo "> Run bedtools_merge on BED file with columns & operation options" +"$meta_executable" \ + --input "../featureB.bed" \ + --output "output.bed" \ + --columns 5 \ + --operation "mean,min,max" + +# checks +assert_file_exists "output.bed" +assert_file_not_empty "output.bed" +assert_identical_content "output.bed" "../expected_operation.bed" +echo "- test6 succeeded -" + +popd > /dev/null + +# Test 7: delimeter option +mkdir "$TMPDIR/test7" && pushd "$TMPDIR/test7" > /dev/null + +echo "> Run bedtools_merge on BED file with delimeter option" +"$meta_executable" \ + --input "../featureB.bed" \ + --output "output.bed" \ + --columns 4 \ + --operation "collapse" \ + --delimiter "|" + +# checks +assert_file_exists "output.bed" +assert_file_not_empty "output.bed" +assert_identical_content "output.bed" "../expected_delim.bed" +echo "- test7 succeeded -" + +popd > /dev/null + +# Test 8: precision option +mkdir "$TMPDIR/test8" && pushd "$TMPDIR/test8" > /dev/null + +echo "> Run bedtools_merge on BED file with precision option" +"$meta_executable" \ + --input "../feature_precision.bed" \ + --output "output.bed" \ + --columns 5 \ + --operation "mean" \ + --precision 4 + +# checks +assert_file_exists "output.bed" +assert_file_not_empty "output.bed" +assert_identical_content "output.bed" "../expected_precision.bed" +echo "- test8 succeeded -" + +popd > /dev/null + +# Test 9: header option +mkdir "$TMPDIR/test9" && pushd "$TMPDIR/test9" > /dev/null + +echo "> Run bedtools_merge on GFF file with header option" +"$meta_executable" \ + --input "../feature.gff" \ + --output "output.gff" \ + --header + +# checks +assert_file_exists "output.gff" +assert_file_not_empty "output.gff" +assert_identical_content "output.gff" "../expected_header.bed" +echo "- test9 succeeded -" + +popd > /dev/null + +echo "---- All tests succeeded! ----" +exit 0 diff --git a/src/bedtools/bedtools_merge/test_data/feature.bam b/src/bedtools/bedtools_merge/test_data/feature.bam new file mode 100644 index 00000000..3d56a631 Binary files /dev/null and b/src/bedtools/bedtools_merge/test_data/feature.bam differ diff --git a/src/fastqc/config.vsh.yaml b/src/fastqc/config.vsh.yaml new file mode 100644 index 00000000..75b16f36 --- /dev/null +++ b/src/fastqc/config.vsh.yaml @@ -0,0 +1,209 @@ +name: fastqc +description: FastQC - A high throughput sequence QC analysis tool. 
+keywords: [Quality control, BAM, SAM, FASTQ] +links: + homepage: https://www.bioinformatics.babraham.ac.uk/projects/fastqc/ + documentation: https://www.bioinformatics.babraham.ac.uk/projects/fastqc/Help/ + repository: https://github.com/s-andrews/FastQC + issue_tracker: https://github.com/s-andrews/FastQC/issues +license: GPL-3.0, Apache-2.0 +authors: + - __merge__: /src/_authors/theodoro_gasperin.yaml + roles: [ author, maintainer ] + +argument_groups: + - name: Inputs + arguments: + - name: --input + type: file + direction: input + multiple: true + description: | + FASTQ file(s) to be analyzed. + required: true + example: input.fq + + - name: Outputs + description: | + At least one of the output options (--html, --zip, --summary, --data) must be used. + arguments: + + - name: --html + type: file + direction: output + multiple: true + description: | + Create the HTML report of the results. + '*' wild card must be provided in the output file name. + Wild card will be replaced by the input file basename. + e.g. + --input "sample_1.fq" + --html "*.html" + would create an output html file named sample_1.html + example: "*.html" + + - name: --zip + type: file + direction: output + multiple: true + description: | + Create the zip file(s) containing: html report, data, images, icons, summary, etc. + '*' wild card must be provided in the output file name. + Wild card will be replaced by the input basename. + e.g. + --input "sample_1.fq" + --html "*.zip" + would create an output zip file named sample_1.zip + example: "*.zip" + + - name: --summary + type: file + direction: output + multiple: true + description: | + Create the summary file(s). + '*' wild card must be provided in the output file name. + Wild card will be replaced by the input basename. + e.g. + --input "sample_1.fq" + --summary "*_summary.txt" + would create an output summary.txt file named sample_1_summary.txt + example: "*_summary.txt" + + - name: --data + type: file + direction: output + multiple: true + description: | + Create the data file(s). + '*' wild card must be provided in the output file name. + Wild card will be replaced by the input basename. + e.g. + --input "sample_1.fq" + --summary "*_data.txt" + would create an output data.txt file named sample_1_data.txt + example: "*_data.txt" + + - name: Options + arguments: + - name: --casava + type: boolean_true + description: | + Files come from raw casava output. Files in the same sample + group (differing only by the group number) will be analysed + as a set rather than individually. Sequences with the filter + flag set in the header will be excluded from the analysis. + Files must have the same names given to them by casava + (including being gzipped and ending with .gz) otherwise they + won't be grouped together correctly. + + - name: --nano + type: boolean_true + description: | + Files come from nanopore sequences and are in fast5 format. In + this mode you can pass in directories to process and the program + will take in all fast5 files within those directories and produce + a single output file from the sequences found in all files. + + - name: --nofilter + type: boolean_true + description: | + If running with --casava then don't remove read flagged by + casava as poor quality when performing the QC analysis. + + - name: --nogroup + type: boolean_true + description: | + Disable grouping of bases for reads >50bp. + All reports will show data for every base in the read. 
+ WARNING: Using this option will cause fastqc to crash + and burn if you use it on really long reads, and your + plots may end up a ridiculous size. You have been warned! + + - name: --min_length + type: integer + description: | + Sets an artificial lower limit on the length of the + sequence to be shown in the report. As long as you + set this to a value greater or equal to your longest + read length then this will be the sequence length used + to create your read groups. This can be useful for making + directly comparable statistics from datasets with somewhat + variable read lengths. + example: 0 + + - name: --format + alternatives: -f + type: string + description: | + Bypasses the normal sequence file format detection and + forces the program to use the specified format. + Valid formats are bam, sam, bam_mapped, sam_mapped, and fastq. + example: bam + + - name: --contaminants + alternatives: -c + type: file + description: | + Specifies a non-default file which contains the list + of contaminants to screen overrepresented sequences against. + The file must contain sets of named contaminants in the form + name[tab]sequence. Lines prefixed with a hash will be ignored. + example: contaminants.txt + + - name: --adapters + alternatives: -a + type: file + description: | + Specifies a non-default file which contains the list of + adapter sequences which will be explicitly searched against + the library. The file must contain sets of named adapters + in the form name[tab]sequence. Lines prefixed with a hash will be ignored. + example: adapters.txt + + - name: --limits + alternatives: -l + type: file + description: | + Specifies a non-default file which contains + a set of criteria which will be used to determine + the warn/error limits for the various modules. + This file can also be used to selectively remove + some modules from the output altogether. The format + needs to mirror the default limits.txt file found in + the Configuration folder. + example: limits.txt + + - name: --kmers + alternatives: -k + type: integer + description: | + Specifies the length of Kmer to look for in the Kmer + content module. Specified Kmer length must be between + 2 and 10. Default length is 7 if not specified. + example: 7 + + - name: --quiet + alternatives: -q + type: boolean_true + description: | + Suppress all progress messages on stdout and only report errors. + +resources: + - type: bash_script + path: script.sh +test_resources: + - type: bash_script + path: test.sh + +engines: + - type: docker + image: biocontainers/fastqc:v0.11.9_cv8 + setup: + - type: docker + run: | + echo "fastqc: $(fastqc --version | sed -n 's/^FastQC //p')" > /var/software_versions.txt + +runners: + - type: executable + - type: nextflow diff --git a/src/fastqc/help.txt b/src/fastqc/help.txt new file mode 100644 index 00000000..502aebc0 --- /dev/null +++ b/src/fastqc/help.txt @@ -0,0 +1,125 @@ +```bash +fastqc --help +``` + + FastQC - A high throughput sequence QC analysis tool + +SYNOPSIS + + fastqc seqfile1 seqfile2 .. seqfileN + + fastqc [-o output dir] [--(no)extract] [-f fastq|bam|sam] + [-c contaminant file] seqfile1 .. seqfileN + +DESCRIPTION + + FastQC reads a set of sequence files and produces from each one a quality + control report consisting of a number of different modules, each one of + which will help to identify a different potential type of problem in your + data. + + If no files to process are specified on the command line then the program + will start as an interactive graphical application. 
If files are provided + on the command line then the program will run with no user interaction + required. In this mode it is suitable for inclusion into a standardised + analysis pipeline. + + The options for the program as as follows: + + -h --help Print this help file and exit + + -v --version Print the version of the program and exit + + -o --outdir Create all output files in the specified output directory. + Please note that this directory must exist as the program + will not create it. If this option is not set then the + output file for each sequence file is created in the same + directory as the sequence file which was processed. + + --casava Files come from raw casava output. Files in the same sample + group (differing only by the group number) will be analysed + as a set rather than individually. Sequences with the filter + flag set in the header will be excluded from the analysis. + Files must have the same names given to them by casava + (including being gzipped and ending with .gz) otherwise they + won't be grouped together correctly. + + --nano Files come from nanopore sequences and are in fast5 format. In + this mode you can pass in directories to process and the program + will take in all fast5 files within those directories and produce + a single output file from the sequences found in all files. + + --nofilter If running with --casava then don't remove read flagged by + casava as poor quality when performing the QC analysis. + + --extract If set then the zipped output file will be uncompressed in + the same directory after it has been created. By default + this option will be set if fastqc is run in non-interactive + mode. + + -j --java Provides the full path to the java binary you want to use to + launch fastqc. If not supplied then java is assumed to be in + your path. + + --noextract Do not uncompress the output file after creating it. You + should set this option if you do not wish to uncompress + the output when running in non-interactive mode. + + --nogroup Disable grouping of bases for reads >50bp. All reports will + show data for every base in the read. WARNING: Using this + option will cause fastqc to crash and burn if you use it on + really long reads, and your plots may end up a ridiculous size. + You have been warned! + + --min_length Sets an artificial lower limit on the length of the sequence + to be shown in the report. As long as you set this to a value + greater or equal to your longest read length then this will be + the sequence length used to create your read groups. This can + be useful for making directly comaparable statistics from + datasets with somewhat variable read lengths. + + -f --format Bypasses the normal sequence file format detection and + forces the program to use the specified format. Valid + formats are bam,sam,bam_mapped,sam_mapped and fastq + + -t --threads Specifies the number of files which can be processed + simultaneously. Each thread will be allocated 250MB of + memory so you shouldn't run more threads than your + available memory will cope with, and not more than + 6 threads on a 32 bit machine + + -c Specifies a non-default file which contains the list of + --contaminants contaminants to screen overrepresented sequences against. + The file must contain sets of named contaminants in the + form name[tab]sequence. Lines prefixed with a hash will + be ignored. + + -a Specifies a non-default file which contains the list of + --adapters adapter sequences which will be explicity searched against + the library. 
The file must contain sets of named adapters + in the form name[tab]sequence. Lines prefixed with a hash + will be ignored. + + -l Specifies a non-default file which contains a set of criteria + --limits which will be used to determine the warn/error limits for the + various modules. This file can also be used to selectively + remove some modules from the output all together. The format + needs to mirror the default limits.txt file found in the + Configuration folder. + + -k --kmers Specifies the length of Kmer to look for in the Kmer content + module. Specified Kmer length must be between 2 and 10. Default + length is 7 if not specified. + + -q --quiet Supress all progress messages on stdout and only report errors. + + -d --dir Selects a directory to be used for temporary files written when + generating report images. Defaults to system temp directory if + not specified. + +BUGS + + Any bugs in fastqc should be reported either to simon.andrews@babraham.ac.uk + or in www.bioinformatics.babraham.ac.uk/bugzilla/ + + diff --git a/src/fastqc/script.sh b/src/fastqc/script.sh new file mode 100644 index 00000000..5cf55868 --- /dev/null +++ b/src/fastqc/script.sh @@ -0,0 +1,86 @@ +#!/bin/bash + +## VIASH START +## VIASH END + +# exit on error +set -eo pipefail + +# Check if both outputs are empty, at least one must be passed. +if [[ -z "$par_html" ]] && [[ -z "$par_zip" ]] && [[ -z "$par_summary" ]] && [[ -z "$par_data" ]]; then + echo "Error: At least one of the output arguments (--html, --zip, --summary, and --data) must be passed." + exit 1 +fi + +# unset flags +unset_if_false=( + par_casava + par_nano + par_nofilter + par_extract + par_noextract + par_nogroup + par_quiet +) + +for par in ${unset_if_false[@]}; do + test_val="${!par}" + [[ "$test_val" == "false" ]] && unset $par +done + +tmpdir=$(mktemp -d "${meta_temp_dir}/${meta_name}-XXXXXXXX") +function clean_up { + rm -rf "$tmpdir" +} +trap clean_up EXIT + +# Create input array +IFS=";" read -ra input <<< $par_input + +# Run fastqc +fastqc \ + --extract \ + ${par_casava:+--casava} \ + ${par_nano:+--nano} \ + ${par_nofilter:+--nofilter} \ + ${par_nogroup:+--nogroup} \ + ${par_min_length:+--min_length "$par_min_length"} \ + ${par_format:+--format "$par_format"} \ + ${par_contaminants:+--contaminants "$par_contaminants"} \ + ${par_adapters:+--adapters "$par_adapters"} \ + ${par_limits:+--limits "$par_limits"} \ + ${par_kmers:+--kmers "$par_kmers"} \ + ${par_quiet:+--quiet} \ + ${meta_cpus:+--threads "$meta_cpus"} \ + ${meta_temp_dir:+--dir "$meta_temp_dir"} \ + --outdir "${tmpdir}" \ + "${input[@]}" + +# Move output files +for file in "${input[@]}"; do + # Removes everthing after the first dot of the basename + sample_name=$(basename "${file}" | sed 's/\..*$//') + if [[ -n "$par_html" ]]; then + input_html="${tmpdir}/${sample_name}_fastqc.html" + html_file="${par_html//\*/$sample_name}" + mv "$input_html" "$html_file" + fi + if [[ -n "$par_zip" ]]; then + input_zip="${tmpdir}/${sample_name}_fastqc.zip" + zip_file="${par_zip//\*/$sample_name}" + mv "$input_zip" "$zip_file" + fi + if [[ -n "$par_summary" ]]; then + summary_file="${tmpdir}/${sample_name}_fastqc/summary.txt" + new_summary="${par_summary//\*/$sample_name}" + mv "$summary_file" "$new_summary" + fi + if [[ -n "$par_data" ]]; then + data_file="${tmpdir}/${sample_name}_fastqc/fastqc_data.txt" + new_data="${par_data//\*/$sample_name}" + mv "$data_file" "$new_data" + fi + # Remove the extracted directory + rm -r "${tmpdir}/${sample_name}_fastqc" +done + diff --git a/src/fastqc/test.sh 
b/src/fastqc/test.sh new file mode 100644 index 00000000..8c581ac8 --- /dev/null +++ b/src/fastqc/test.sh @@ -0,0 +1,235 @@ +#!/bin/bash + +# exit on error +set -eo pipefail + +## VIASH START +# meta_executable="target/executable/fastqc" +# meta_resources_dir="src/fastqc" +## VIASH END + +############################################# +# helper functions +assert_file_exists() { + [ -f "$1" ] || { echo "File '$1' does not exist" && exit 1; } +} +assert_file_not_empty() { + [ -s "$1" ] || { echo "File '$1' is empty but shouldn't be" && exit 1; } +} +assert_file_contains() { + grep -q "$2" "$1" || { echo "File '$1' does not contain '$2'" && exit 1; } +} +assert_identical_content() { + diff -a "$2" "$1" \ + || (echo "Files are not identical!" && exit 1) +} +############################################# + +# Create directories for tests +echo "Creating Test Data..." +TMPDIR=$(mktemp -d "$meta_temp_dir/XXXXXX") +function clean_up { + [[ -d "$TMPDIR" ]] && rm -r "$TMPDIR" +} +trap clean_up EXIT + +# Create and populate input.fasta +cat > "$TMPDIR/input_1.fq" < "$TMPDIR/input_2.fq" < "$TMPDIR/contaminants.txt" +printf "contaminant_sequence2\tGATCTTGG\n" >> "$TMPDIR/contaminants.txt" + +# Create and populate SAM file +printf "@HD\tVN:1.0\tSO:unsorted\n" > "$TMPDIR/example.sam" +printf "@SQ\tSN:chr1\tLN:248956422\n" >> "$TMPDIR/example.sam" +printf "@SQ\tSN:chr2\tLN:242193529\n" >> "$TMPDIR/example.sam" +printf "@PG\tID:bowtie2\tPN:bowtie2\tVN:2.3.4.1\tCL:\"/usr/bin/bowtie2-align-s --wrapper basic-0 -x genome -U reads.fq -S output.sam\"\n" >> "$TMPDIR/example.sam" +printf "read1\t0\tchr1\t100\t255\t50M\t*\t0\t0\tACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGT\tIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII\tAS:i:-10\tXN:i:0\tXM:i:0\tXO:i:0\tXG:i:0\tNM:i:0\tMD:Z:50\tYT:Z:UU\n" >> "$TMPDIR/example.sam" +printf "read2\t0\tchr2\t150\t255\t50M\t*\t0\t0\tTGCATGCATGCATGCATGCATGCATGCATGCATGCATGCATGCATGCATGC\tIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII\tAS:i:-8\tXN:i:0\tXM:i:0\tXO:i:0\tXG:i:0\tNM:i:0\tMD:Z:50\tYT:Z:UU\n" >> "$TMPDIR/example.sam" +printf "read3\t16\tchr1\t200\t255\t50M\t*\t0\t0\tGCTAGCTAGCTAGCTAGCTAGCTAGCTAGCTAGCTAGCTAGCTAGCTAGCTA\tIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII\tAS:i:-12\tXN:i:0\tXM:i:0\tXO:i:0\tXG:i:0\tNM:i:0\tMD:Z:50\tYT:Z:UU" >> "$TMPDIR/example.sam" + +cat > "$TMPDIR/expected_summary.txt" < "$TMPDIR/expected_summary2.txt" < "$TMPDIR/expected_summary_sam.txt" < /dev/null + +echo "-> Run Test1: one input" +"$meta_executable" \ + --input "../input_1.fq" \ + --html "*_fastqc.html" \ + --zip "*_fastqc.zip" \ + --summary "*_summary.txt" \ + --data "*_data.txt" \ + --quiet \ + +assert_file_exists "input_1_fastqc.html" +assert_file_exists "input_1_fastqc.zip" +assert_file_exists "input_1_summary.txt" +assert_file_not_empty "input_1_fastqc.html" +assert_file_not_empty "input_1_fastqc.zip" +assert_identical_content "input_1_summary.txt" "../expected_summary.txt" +echo "- test succeeded -" + +popd > /dev/null + + +# Test 2: Run fastqc with multiple inputs +mkdir "$TMPDIR/test2" && pushd "$TMPDIR/test2" > /dev/null + +echo "-> Run Test2: two inputs" +"$meta_executable" \ + --input "../input_1.fq" \ + --input "../input_2.fq" \ + --html "*_fastqc.html" \ + --zip "*_fastqc.zip" \ + --summary "*_summary.txt" \ + --data "*_data.txt" \ + --quiet \ + +# File 1 +assert_file_exists "input_1_fastqc.html" +assert_file_exists "input_1_fastqc.zip" +assert_file_exists "input_1_summary.txt" +assert_file_not_empty "input_1_fastqc.html" +assert_file_not_empty 
"input_1_fastqc.zip" +assert_identical_content "input_1_summary.txt" "../expected_summary.txt" +# File 2 +assert_file_exists "input_2_fastqc.html" +assert_file_exists "input_2_fastqc.zip" +assert_file_exists "input_2_summary.txt" +assert_file_not_empty "input_2_fastqc.html" +assert_file_not_empty "input_2_fastqc.zip" +assert_identical_content "input_2_summary.txt" "../expected_summary2.txt" +echo "- test succeeded -" + +popd > /dev/null + +# Test 3: Run fastqc with contaminants +mkdir "$TMPDIR/test3" && pushd "$TMPDIR/test3" > /dev/null + +echo "-> Run Test3: contaminants" +"$meta_executable" \ + --input "../input_1.fq" \ + --contaminants "../contaminants.txt" \ + --html "*_fastqc.html" \ + --zip "*_fastqc.zip" \ + --summary "*_summary.txt" \ + --data "*_data.txt" \ + --quiet \ + +assert_file_exists "input_1_fastqc.html" +assert_file_exists "input_1_fastqc.zip" +assert_file_exists "input_1_summary.txt" +assert_file_not_empty "input_1_fastqc.html" +assert_file_not_empty "input_1_fastqc.zip" +assert_identical_content "input_1_summary.txt" "../expected_summary.txt" +assert_file_contains "input_1_data.txt" "contaminant" +echo "- test succeeded -" + +popd > /dev/null + +# Test 4: Run fastqc with sam file +mkdir "$TMPDIR/test4" && pushd "$TMPDIR/test4" > /dev/null + +echo "-> Run Test4: sam file" +"$meta_executable" \ + --input "../example.sam" \ + --format "sam" \ + --html "*_fastqc.html" \ + --zip "*_fastqc.zip" \ + --summary "*_summary.txt" \ + --data "*_data.txt" \ + --quiet \ + +assert_file_exists "example_fastqc.html" +assert_file_exists "example_fastqc.zip" +assert_file_exists "example_summary.txt" +assert_file_not_empty "example_fastqc.html" +assert_file_not_empty "example_fastqc.zip" +assert_identical_content "example_summary.txt" "../expected_summary_sam.txt" +echo "- test succeeded -" + +popd > /dev/null + +# Test 5: Run fastqc with multiple options +mkdir "$TMPDIR/test5" && pushd "$TMPDIR/test5" > /dev/null + +echo "-> Run Test5: multiple options" +"$meta_executable" \ + --input "../input_1.fq" \ + --contaminants "../contaminants.txt" \ + --format "fastq" \ + --nofilter \ + --nogroup \ + --min_length 10 \ + --kmers 5 \ + --html "*_fastqc.html" \ + --zip "*_fastqc.zip" \ + --summary "*_summary.txt" \ + --data "*_data.txt" \ + --quiet \ +# --casava \ + +assert_file_exists "input_1_fastqc.html" +assert_file_exists "input_1_fastqc.zip" +assert_file_exists "input_1_summary.txt" +assert_file_not_empty "input_1_fastqc.html" +assert_file_not_empty "input_1_fastqc.zip" +assert_identical_content "input_1_summary.txt" "../expected_summary.txt" +assert_file_contains "input_1_data.txt" "contaminant" +echo "- test succeeded -" + +popd > /dev/null + +echo "All tests succeeded!" +exit 0 diff --git a/src/qualimap/qualimap_rnaseq/config.vsh.yaml b/src/qualimap/qualimap_rnaseq/config.vsh.yaml new file mode 100644 index 00000000..ffc807ab --- /dev/null +++ b/src/qualimap/qualimap_rnaseq/config.vsh.yaml @@ -0,0 +1,103 @@ +name: qualimap_rnaseq +namespace: qualimap +keywords: [RNA-seq, quality control, QC Report] +description: | + Qualimap RNA-seq QC reports quality control metrics and bias estimations + which are specific for whole transcriptome sequencing, including reads genomic + origin, junction analysis, transcript coverage and 5’-3’ bias computation. 
+links: + homepage: http://qualimap.conesalab.org/ + documentation: http://qualimap.conesalab.org/doc_html/analysis.html#rna-seq-qc + issue_tracker: https://bitbucket.org/kokonech/qualimap/issues?status=new&status=open + repository: https://bitbucket.org/kokonech/qualimap/commits/branch/master +references: + doi: 10.1093/bioinformatics/btv566 +license: GPL-2.0 +authors: + - __merge__: /src/_authors/dorien_roosen.yaml + roles: [ author, maintainer ] +argument_groups: + - name: "Input" + arguments: + - name: "--bam" + type: file + required: true + example: alignment.bam + description: Path to the sequence alignment file in BAM format, produced by a splicing-aware aligner. + - name: "--gtf" + type: file + required: true + example: annotations.gtf + description: Path to genomic annotations in Ensembl GTF format. + + - name: "Output" + arguments: + - name: "--qc_results" + direction: output + type: file + required: true + example: rnaseq_qc_results.txt + description: Text file containing the RNAseq QC results. + - name: "--counts" + type: file + required: false + direction: output + description: Output file for computed counts. + - name: "--report" + type: file + direction: output + required: false + example: report.html + description: Report output file. Supported formats are PDF or HTML. + + - name: "Optional" + arguments: + - name: "--num_pr_bases" + type: integer + required: false + min: 1 + description: Number of upstream/downstream nucleotide bases to compute 5'-3' bias (default = 100). + - name: "--num_tr_bias" + type: integer + required: false + min: 1 + description: Number of top highly expressed transcripts to compute 5'-3' bias (default = 1000). + - name: "--algorithm" + type: string + required: false + choices: ["uniquely-mapped-reads", "proportional"] + description: Counting algorithm (uniquely-mapped-reads (default) or proportional). + - name: "--sequencing_protocol" + type: string + required: false + choices: ["non-strand-specific", "strand-specific-reverse", "strand-specific-forward"] + description: Sequencing library protocol (strand-specific-forward, strand-specific-reverse or non-strand-specific (default)). + - name: "--paired" + type: boolean_true + description: Setting this flag for paired-end experiments will result in counting fragments instead of reads. + - name: "--sorted" + type: boolean_true + description: Setting this flag indicates that the input file is already sorted by name. If flag is not set, additional sorting by name will be performed. Only requiredfor paired-end analysis. + - name: "--java_memory_size" + type: string + required: false + description: maximum Java heap memory size, default = 4G. + +resources: + - type: bash_script + path: script.sh +test_resources: + - type: bash_script + path: test.sh + - path: test_data/ + +engines: + - type: docker + image: quay.io/biocontainers/qualimap:2.3--hdfd78af_0 + setup: + - type: docker + run: | + echo QualiMap: $(qualimap 2>&1 | grep QualiMap | sed 's/^.*QualiMap//') > /var/software_versions.txt +runners: + - type: executable + - type: nextflow diff --git a/src/qualimap/qualimap_rnaseq/help.txt b/src/qualimap/qualimap_rnaseq/help.txt new file mode 100644 index 00000000..c6493ed9 --- /dev/null +++ b/src/qualimap/qualimap_rnaseq/help.txt @@ -0,0 +1,52 @@ +QualiMap v.2.3 +Built on 2023-05-19 16:57 + +usage: qualimap [options] + +To launch GUI leave empty. 
+ +Available tools: + + bamqc Evaluate NGS mapping to a reference genome + rnaseq Evaluate RNA-seq alignment data + counts Counts data analysis (further RNA-seq data evaluation) + multi-bamqc Compare QC reports from multiple NGS mappings + clustering Cluster epigenomic signals + comp-counts Compute feature counts + +Special arguments: + + --java-mem-size Use this argument to set Java memory heap size. Example: + qualimap bamqc -bam very_large_alignment.bam --java-mem-size=4G + +usage: qualimap rnaseq [-a ] -bam -gtf [-npb ] [-ntb + ] [-oc ] [-outdir ] [-outfile ] [-outformat ] + [-p ] [-pe] [-s] + -a,--algorithm Counting algorithm: + uniquely-mapped-reads(default) or + proportional. + -bam Input mapping file in BAM format. + -gtf Annotations file in Ensembl GTF format. + -npb,--num-pr-bases Number of upstream/downstream nucleotide bases + to compute 5'-3' bias (default is 100). + -ntb,--num-tr-bias Number of top highly expressed transcripts to + compute 5'-3' bias (default is 1000). + -oc Output file for computed counts. If only name + of the file is provided, then the file will be + saved in the output folder. + -outdir Output folder for HTML report and raw data. + -outfile Output file for PDF report (default value is + report.pdf). + -outformat Format of the output report (PDF, HTML or both + PDF:HTML, default is HTML). + -p,--sequencing-protocol Sequencing library protocol: + strand-specific-forward, + strand-specific-reverse or non-strand-specific + (default) + -pe,--paired Setting this flag for paired-end experiments + will result in counting fragments instead of + reads + -s,--sorted This flag indicates that the input file is + already sorted by name. If not set, additional + sorting by name will be performed. Only + required for paired-end analysis. 
\ No newline at end of file diff --git a/src/qualimap/qualimap_rnaseq/script.sh b/src/qualimap/qualimap_rnaseq/script.sh new file mode 100644 index 00000000..351e5159 --- /dev/null +++ b/src/qualimap/qualimap_rnaseq/script.sh @@ -0,0 +1,50 @@ +#!/bin/bash + +set -eo pipefail + +tmp_dir=$(mktemp -d -p "$meta_temp_dir" qualimap_XXXXXXXXX) + +# Handle output parameters +if [ -n "$par_report" ]; then + outfile=$(basename "$par_report") + report_extension="${outfile##*.}" +fi + +if [ -n "$par_counts" ]; then + counts=$(basename "$par_counts") +fi + +# disable flags +[[ "$par_paired" == "false" ]] && unset par_paired +[[ "$par_sorted" == "false" ]] && unset par_sorted + +# Run qualimap +qualimap rnaseq \ + ${meta_memory_mb:+--java-mem-size=${meta_memory_mb}M} \ + ${par_algorithm:+--algorithm $par_algorithm} \ + ${par_sequencing_protocol:+--sequencing-protocol $par_sequencing_protocol} \ + -bam $par_bam \ + -gtf $par_gtf \ + -outdir "$tmp_dir" \ + ${par_num_pr_bases:+--num-pr-bases $par_num_pr_bases} \ + ${par_num_tr_bias:+--num-tr-bias $par_num_tr_bias} \ + ${par_report:+-outformat $report_extension} \ + ${par_paired:+--paired} \ + ${par_sorted:+--sorted} \ + ${par_report:+-outfile "$outfile"} \ + ${par_counts:+-oc "$counts"} + +# Move output files +mv "$tmp_dir/rnaseq_qc_results.txt" "$par_qc_results" + +if [ -n "$par_report" ] && [ $report_extension = "html" ]; then + mv "$tmp_dir/qualimapReport.html" "$par_report" +fi + +if [ -n "$par_report" ] && [ $report_extension = "pdf" ]; then + mv "$tmp_dir/$outfile" "$par_report" +fi + +if [ -n "$par_counts" ]; then + mv "$tmp_dir/$counts" "$par_counts" +fi diff --git a/src/qualimap/qualimap_rnaseq/test.sh b/src/qualimap/qualimap_rnaseq/test.sh new file mode 100755 index 00000000..2e1b647b --- /dev/null +++ b/src/qualimap/qualimap_rnaseq/test.sh @@ -0,0 +1,112 @@ +set -e + +############################################# +# helper functions +assert_file_exists() { + [ -f "$1" ] || { echo "File '$1' does not exist" && exit 1; } +} +assert_file_doesnt_exist() { + [ ! 
-f "$1" ] || { echo "File '$1' exists but shouldn't" && exit 1; } +} +assert_file_not_empty() { + [ -s "$1" ] || { echo "File '$1' is empty but shouldn't be" && exit 1; } +} +assert_file_contains() { + grep -q "$2" "$1" || { echo "File '$1' does not contain '$2'" && exit 1; } +} +############################################# + + +test_dir="$meta_resources_dir/test_data" + +mkdir "run_qualimap_rnaseq_html" +cd "run_qualimap_rnaseq_html" + +echo "> Running qualimap with html output report" + +"$meta_executable" \ + --bam $test_dir/a.bam \ + --gtf $test_dir/annotation.gtf \ + --report report.html \ + --counts counts.txt \ + --qc_results output.txt + +echo ">> Checking output" +assert_file_exists "report.html" +assert_file_exists "counts.txt" +assert_file_exists "output.txt" +assert_file_doesnt_exist "report.pdf" + +echo ">> Checking if output is empty" +assert_file_not_empty "report.html" +assert_file_not_empty "counts.txt" +assert_file_not_empty "output.txt" + +echo ">> Checking output contents" +assert_file_contains "output.txt" ">>>>>>> Input" +assert_file_contains "output.txt" ">>>>>>> Reads alignment" +assert_file_contains "output.txt" ">>>>>>> Reads genomic origin" +assert_file_contains "output.txt" ">>>>>>> Transcript coverage profile" +assert_file_contains "output.txt" ">>>>>>> Junction analysis" +assert_file_contains "output.txt" ">>>>>>> Transcript coverage profile" + +assert_file_contains "counts.txt" "ENSG00000125841.12" + +assert_file_contains "report.html" "Qualimap report: RNA Seq QC" +assert_file_contains "report.html" "

Input" +assert_file_contains "report.html" "Reads alignment" +assert_file_contains "report.html" "Reads genomic origin" +assert_file_contains "report.html" "Transcript coverage profile" +assert_file_contains "report.html" "Junction analysis
" + + +cd .. +rm -r run_qualimap_rnaseq_html + +mkdir "run_qualimap_rnaseq_pdf" +cd "run_qualimap_rnaseq_pdf" + +echo "> Running qualimap with pdf output report" + +"$meta_executable" \ + --bam $test_dir/a.bam \ + --gtf $test_dir/annotation.gtf \ + --report report.pdf \ + --counts counts.txt \ + --qc_results output.txt + +echo ">> Checking output" +assert_file_exists "report.pdf" +assert_file_exists "counts.txt" +assert_file_exists "output.txt" +assert_file_doesnt_exist "report.html" + +echo ">> Checking if output is empty" +assert_file_not_empty "report.pdf" +assert_file_not_empty "counts.txt" +assert_file_not_empty "output.txt" + +cd .. +rm -r run_qualimap_rnaseq_pdf + +mkdir "run_qualimap_rnaseq" +cd "run_qualimap_rnaseq" + +echo "> Running qualimap without report and counts output" + +"$meta_executable" \ + --bam $test_dir/a.bam \ + --gtf $test_dir/annotation.gtf \ + --qc_results output.txt + +echo ">> Checking output" +assert_file_doesnt_exist "report.pdf" +assert_file_doesnt_exist "report.html" +assert_file_doesnt_exist "counts.txt" +assert_file_exists "output.txt" + +echo ">> Checking if output is empty" +assert_file_not_empty "output.txt" + +cd .. +rm -r run_qualimap_rnaseq \ No newline at end of file diff --git a/src/qualimap/qualimap_rnaseq/test_data/a.bam b/src/qualimap/qualimap_rnaseq/test_data/a.bam new file mode 100644 index 00000000..c8ea1065 Binary files /dev/null and b/src/qualimap/qualimap_rnaseq/test_data/a.bam differ diff --git a/src/qualimap/qualimap_rnaseq/test_data/annotation.gtf b/src/qualimap/qualimap_rnaseq/test_data/annotation.gtf new file mode 100644 index 00000000..976de753 --- /dev/null +++ b/src/qualimap/qualimap_rnaseq/test_data/annotation.gtf @@ -0,0 +1,10 @@ +chr20 HAVANA transcript 347024 354868 . + . gene_id "ENSG00000125841.12"; transcript_id "ENST00000382291.7"; gene_type "protein_coding"; gene_name "NRSN2"; transcript_type "protein_coding"; transcript_name "NRSN2-202"; level 2; protein_id "ENSP00000371728.3"; transcript_support_level "2"; tag "basic"; tag "appris_principal_1"; tag "CCDS"; ccdsid "CCDS12996.1"; havana_gene "OTTHUMG00000031628.5"; havana_transcript "OTTHUMT00000077446.1"; +chr20 HAVANA exon 347024 347142 . + . gene_id "ENSG00000125841.12"; transcript_id "ENST00000382291.7"; gene_type "protein_coding"; gene_name "NRSN2"; transcript_type "protein_coding"; transcript_name "NRSN2-202"; exon_number 1; exon_id "ENSE00001831391.1"; level 2; protein_id "ENSP00000371728.3"; transcript_support_level "2"; tag "basic"; tag "appris_principal_1"; tag "CCDS"; ccdsid "CCDS12996.1"; havana_gene "OTTHUMG00000031628.5"; havana_transcript "OTTHUMT00000077446.1"; +chr20 HAVANA exon 349249 349363 . + . gene_id "ENSG00000125841.12"; transcript_id "ENST00000382291.7"; gene_type "protein_coding"; gene_name "NRSN2"; transcript_type "protein_coding"; transcript_name "NRSN2-202"; exon_number 2; exon_id "ENSE00001491647.1"; level 2; protein_id "ENSP00000371728.3"; transcript_support_level "2"; tag "basic"; tag "appris_principal_1"; tag "CCDS"; ccdsid "CCDS12996.1"; havana_gene "OTTHUMG00000031628.5"; havana_transcript "OTTHUMT00000077446.1"; +chr20 HAVANA exon 349638 349832 . + . 
gene_id "ENSG00000125841.12"; transcript_id "ENST00000382291.7"; gene_type "protein_coding"; gene_name "NRSN2"; transcript_type "protein_coding"; transcript_name "NRSN2-202"; exon_number 3; exon_id "ENSE00003710328.1"; level 2; protein_id "ENSP00000371728.3"; transcript_support_level "2"; tag "basic"; tag "appris_principal_1"; tag "CCDS"; ccdsid "CCDS12996.1"; havana_gene "OTTHUMG00000031628.5"; havana_transcript "OTTHUMT00000077446.1"; +chr20 HAVANA CDS 349644 349832 . + 0 gene_id "ENSG00000125841.12"; transcript_id "ENST00000382291.7"; gene_type "protein_coding"; gene_name "NRSN2"; transcript_type "protein_coding"; transcript_name "NRSN2-202"; exon_number 3; exon_id "ENSE00003710328.1"; level 2; protein_id "ENSP00000371728.3"; transcript_support_level "2"; tag "basic"; tag "appris_principal_1"; tag "CCDS"; ccdsid "CCDS12996.1"; havana_gene "OTTHUMG00000031628.5"; havana_transcript "OTTHUMT00000077446.1"; +chr20 HAVANA start_codon 349644 349646 . + 0 gene_id "ENSG00000125841.12"; transcript_id "ENST00000382291.7"; gene_type "protein_coding"; gene_name "NRSN2"; transcript_type "protein_coding"; transcript_name "NRSN2-202"; exon_number 3; exon_id "ENSE00003710328.1"; level 2; protein_id "ENSP00000371728.3"; transcript_support_level "2"; tag "basic"; tag "appris_principal_1"; tag "CCDS"; ccdsid "CCDS12996.1"; havana_gene "OTTHUMG00000031628.5"; havana_transcript "OTTHUMT00000077446.1"; +chr20 HAVANA exon 353210 354868 . + . gene_id "ENSG00000125841.12"; transcript_id "ENST00000382291.7"; gene_type "protein_coding"; gene_name "NRSN2"; transcript_type "protein_coding"; transcript_name "NRSN2-202"; exon_number 4; exon_id "ENSE00001822456.1"; level 2; protein_id "ENSP00000371728.3"; transcript_support_level "2"; tag "basic"; tag "appris_principal_1"; tag "CCDS"; ccdsid "CCDS12996.1"; havana_gene "OTTHUMG00000031628.5"; havana_transcript "OTTHUMT00000077446.1"; +chr20 HAVANA CDS 353210 353632 . + 0 gene_id "ENSG00000125841.12"; transcript_id "ENST00000382291.7"; gene_type "protein_coding"; gene_name "NRSN2"; transcript_type "protein_coding"; transcript_name "NRSN2-202"; exon_number 4; exon_id "ENSE00001822456.1"; level 2; protein_id "ENSP00000371728.3"; transcript_support_level "2"; tag "basic"; tag "appris_principal_1"; tag "CCDS"; ccdsid "CCDS12996.1"; havana_gene "OTTHUMG00000031628.5"; havana_transcript "OTTHUMT00000077446.1"; +chr20 HAVANA stop_codon 353633 353635 . + 0 gene_id "ENSG00000125841.12"; transcript_id "ENST00000382291.7"; gene_type "protein_coding"; gene_name "NRSN2"; transcript_type "protein_coding"; transcript_name "NRSN2-202"; exon_number 4; exon_id "ENSE00001822456.1"; level 2; protein_id "ENSP00000371728.3"; transcript_support_level "2"; tag "basic"; tag "appris_principal_1"; tag "CCDS"; ccdsid "CCDS12996.1"; havana_gene "OTTHUMG00000031628.5"; havana_transcript "OTTHUMT00000077446.1"; +chr20 HAVANA UTR 347024 347142 . + . 
gene_id "ENSG00000125841.12"; transcript_id "ENST00000382291.7"; gene_type "protein_coding"; gene_name "NRSN2"; transcript_type "protein_coding"; transcript_name "NRSN2-202"; exon_number 1; exon_id "ENSE00001831391.1"; level 2; protein_id "ENSP00000371728.3"; transcript_support_level "2"; tag "basic"; tag "appris_principal_1"; tag "CCDS"; ccdsid "CCDS12996.1"; havana_gene "OTTHUMG00000031628.5"; havana_transcript "OTTHUMT00000077446.1"; diff --git a/src/qualimap/qualimap_rnaseq/test_data/script.sh b/src/qualimap/qualimap_rnaseq/test_data/script.sh new file mode 100755 index 00000000..801fe405 --- /dev/null +++ b/src/qualimap/qualimap_rnaseq/test_data/script.sh @@ -0,0 +1,10 @@ +# qualimap test data + +# Test data was obtained from https://github.com/snakemake/snakemake-wrappers/raw/master/bio/qualimap/rnaseq/test + +if [ ! -d /tmp/snakemake-wrappers ]; then + git clone --depth 1 --single-branch --branch master https://github.com/snakemake/snakemake-wrappers /tmp/snakemake-wrappers +fi + +cp -r /tmp/snakemake-wrappers/bio/qualimap/rnaseq/test/mapped/a.bam src/qualimap/qualimap_rnaseq/test_data +cp -r /tmp/snakemake-wrappers/bio/qualimap/rnaseq/test/annotation.gtf src/qualimap/qualimap_rnaseq/test_data diff --git a/src/rsem/rsem_prepare_reference/config.vsh.yaml b/src/rsem/rsem_prepare_reference/config.vsh.yaml new file mode 100644 index 00000000..44915a2f --- /dev/null +++ b/src/rsem/rsem_prepare_reference/config.vsh.yaml @@ -0,0 +1,196 @@ +name: rsem_prepare_reference +namespace: rsem +description: | + RSEM is a software package for estimating gene and isoform expression levels from RNA-Seq data. This component prepares transcript references for RSEM. +keywords: ["Transcriptome", "Index"] +links: + homepage: http://deweylab.github.io/RSEM + documentation: https://deweylab.github.io/RSEM/rsem-prepare-reference.html + repository: https://github.com/deweylab/RSEM +references: + doi: 10.1186/1471-2105-12-323 +license: GPL-3.0 +requirements: + commands: [ rsem-prepare-reference ] +authors: + - __merge__: /src/_authors/sai_nirmayi_yasa.yaml + roles: [ author, maintainer ] + +argument_groups: + - name: Inputs + arguments: + - name: --reference_fasta_files + type: file + description: | + Semi-colon separated list of Multi-FASTA formatted files OR a directory name. If a directory name is specified, RSEM will read all files with suffix ".fa" or ".fasta" in this directory. The files should contain either the sequences of transcripts or an entire genome, depending on whether the '--gtf' option is used. + required: true + multiple: true + example: read1.fasta + - name: --reference_name + type: string + description: | + The name of the reference used. RSEM will generate several reference-related files that are prefixed by this name. This name can contain path information (e.g. '/ref/mm9'). + required: true + example: /ref/mm9 + + - name: Outputs + arguments: + - name: --output + type: file + description: Directory containing reference files generated by RSEM. + required: true + direction: output + + - name: Other options + arguments: + - name: --gtf + type: file + description: Assume that 'reference_fasta_files' contains the sequence of a genome, and extract transcript reference sequences using the gene annotations specified in the GTF file. If this and '--gff3' options are not provided, RSEM will assume 'reference_fasta_files' contains the reference transcripts. In this case, RSEM assumes that name of each sequence in the Multi-FASTA files is its transcript_id. 
+ example: annotations.gtf + - name: --gff3 + type: file + description: GFF3 annotation file. Converted to GTF format with the file name 'reference_name.gtf'. Please make sure that 'reference_name.gtf' does not exist. + example: annotations.gff + - name: --gff3_rna_patterns + type: string + description: List of transcript categories (separated by semi-colon). Only transcripts that match the string will be extracted. + multiple: true + example: mRNA;rRNA + - name: --gff3_genes_as_transcripts + type: boolean_true + description: This option is designed for untypical organisms, such as viruses, whose GFF3 files only contain genes. RSEM will assume each gene as a unique transcript when it converts the GFF3 file into GTF format. + - name: --trusted_sources + type: string + description: List of trusted sources (separated by semi-colon). Only transcripts coming from these sources will be extracted. If this option is off, all sources are accepted. + multiple: true + example: ENSEMBL;HAVANA + - name: --transcript_to_gene_map + type: file + description: | + Use information from this file to map from transcript (isoform) ids to gene ids. Each line of this file should be of the form: + gene_id transcript_id + with the two fields separated by a tab character. + If you are using a GTF file for the "UCSC Genes" gene set from the UCSC Genome Browser, then the "knownIsoforms.txt" file (obtained from the "Downloads" section of the UCSC Genome Browser site) is of this format. + If this option is off, then the mapping of isoforms to genes depends on whether the '--gtf' option is specified. If '--gtf' is specified, then RSEM uses the "gene_id" and "transcript_id" attributes in the GTF file. Otherwise, RSEM assumes that each sequence in the reference sequence files is a separate gene. + example: isoforms.txt + - name: --allele_to_gene_map + type: file + description: | + Use information from to provide gene_id and transcript_id information for each allele-specific transcript. Each line of should be of the form: + gene_id transcript_id allele_id + with the fields separated by a tab character. + This option is designed for quantifying allele-specific expression. It is only valid if '--gtf' option is not specified. allele_id should be the sequence names presented in the Multi-FASTA-formatted files. + - name: --polyA + type: boolean_true + description: Add poly(A) tails to the end of all reference isoforms. The length of poly(A) tail added is specified by '--polyA-length' option. STAR aligner users may not want to use this option. + - name: --polyA_length + type: integer + description: The length of the poly(A) tails to be added. + example: 125 + - name: --no_polyA_subset + type: file + description: Only meaningful if '--polyA' is specified. Do not add poly(A) tails to those transcripts listed in this file containing a list of transcript_ids. + example: transcript_ids.txt + - name: --bowtie + type: boolean_true + description: Build Bowtie indices. + - name: --bowtie2 + type: boolean_true + description: Build Bowtie 2 indices. + - name: --star + type: boolean_true + description: Build STAR indices. + - name: --star_sjdboverhang + type: integer + description: Length of the genomic sequence around annotated junction. It is only used for STAR to build splice junctions database and not needed for Bowtie or Bowtie2. It will be passed as the --sjdbOverhang option to STAR. According to STAR's manual, its ideal value is max(ReadLength)-1, e.g. for 2x101 paired-end reads, the ideal value is 101-1=100. 
In most cases, the default value of 100 will work as well as the ideal value. (Default is 100) + example: 100 + - name: --hisat2_hca + type: boolean_true + description: Build HISAT2 indices on the transcriptome according to Human Cell Atlas (HCA) SMART-Seq2 pipeline. + - name: --quiet + alternatives: -q + type: boolean_true + description: Suppress the output of logging information. + + - name: Prior-enhanced RSEM options + arguments: + - name: --prep_pRSEM + type: boolean_true + description: A Boolean indicating whether to prepare reference files for pRSEM, including building Bowtie indices for a genome and selecting training set isoforms. The index files will be used for aligning ChIP-seq reads in prior-enhanced RSEM and the training set isoforms will be used for learning prior. A path to Bowtie executables and a mappability file in bigWig format are required when this option is on. Currently, Bowtie2 is not supported for prior-enhanced RSEM. + - name: --mappability_bigwig_file + type: file + description: Full path to a whole-genome mappability file in bigWig format. This file is required for running prior-enhanced RSEM. It is used for selecting a training set of isoforms for prior-learning. This file can be either downloaded from UCSC Genome Browser or generated by GEM (Derrien et al., 2012, PLoS One). + +resources: + - type: bash_script + path: script.sh + +test_resources: + - type: bash_script + path: test.sh + +engines: +- type: docker + image: ubuntu:22.04 + setup: + - type: apt + packages: + - build-essential + - gcc + - g++ + - make + - wget + - zlib1g-dev + - unzip xxd + - perl + - r-base + - bowtie2 + - pip + - git + - type: python + packages: bowtie + - type: docker + env: + - STAR_VERSION=2.7.11b + - RSEM_VERSION=1.3.3 + - BOWTIE_VERSION=1.3.1 + - TZ=Europe/Brussels + run: | + ln -snf /usr/share/zoneinfo/$TZ /etc/localtime && echo $TZ > /etc/timezone && \ + cd /tmp && \ + wget --no-check-certificate https://github.com/alexdobin/STAR/archive/refs/tags/${STAR_VERSION}.zip && \ + unzip ${STAR_VERSION}.zip && \ + cd STAR-${STAR_VERSION}/source && \ + make STARstatic CXXFLAGS_SIMD=-std=c++11 && \ + cp STAR /usr/local/bin && \ + cd /tmp && \ + wget --no-check-certificate https://github.com/deweylab/RSEM/archive/refs/tags/v${RSEM_VERSION}.zip && \ + unzip v${RSEM_VERSION}.zip && \ + cd RSEM-${RSEM_VERSION} && \ + make && \ + make install && \ + cd /tmp && \ + wget --no-check-certificate -O bowtie-${BOWTIE_VERSION}-linux-x86_64.zip https://sourceforge.net/projects/bowtie-bio/files/bowtie/${BOWTIE_VERSION}/bowtie-${BOWTIE_VERSION}-linux-x86_64.zip/download && \ + unzip bowtie-${BOWTIE_VERSION}-linux-x86_64.zip && \ + cp bowtie-${BOWTIE_VERSION}-linux-x86_64/bowtie* /usr/local/bin && \ + cd /tmp && \ + git clone https://github.com/DaehwanKimLab/hisat2.git /tmp/hisat2 && \ + cd /tmp/hisat2 && \ + make && \ + cp -r hisat2* /usr/local/bin && \ + cd && \ + rm -rf /tmp/STAR-${STAR_VERSION} /tmp/${STAR_VERSION}.zip /tmp/bowtie-${BOWTIE_VERSION}-linux-x86_64 /tmp/hisat2 && \ + apt-get --purge autoremove -y ${PACKAGES} && \ + apt-get clean + + - type: docker + run: | + echo "RSEM: `rsem-calculate-expression --version | sed -e 's/Current version: RSEM v//g'`" > /var/software_versions.txt && \ + echo "STAR: `STAR --version`" >> /var/software_versions.txt && \ + echo "bowtie2: `bowtie2 --version | grep -oP '\d+\.\d+\.\d+'`" >> /var/software_versions.txt && \ + echo "bowtie: `bowtie --version | grep -oP 'bowtie-align-s version \K\d+\.\d+\.\d+'`" >> /var/software_versions.txt && \ + echo "HISAT2: 
`hisat2 --version | grep -oP 'hisat2-align-s version \K\d+\.\d+\.\d+'`" >> /var/software_versions.txt + +runners: + - type: executable + - type: nextflow \ No newline at end of file diff --git a/src/rsem/rsem_prepare_reference/help.txt b/src/rsem/rsem_prepare_reference/help.txt new file mode 100644 index 00000000..c69899ec --- /dev/null +++ b/src/rsem/rsem_prepare_reference/help.txt @@ -0,0 +1,207 @@ +```bash +rsem-prepare-reference --help +``` + +NAME +rsem-prepare-reference - Prepare transcript references for RSEM and optionally build BOWTIE/BOWTIE2/STAR/HISAT2(transcriptome) indices. + +SYNOPSIS + rsem-prepare-reference [options] reference_fasta_file(s) reference_name +ARGUMENTS +reference_fasta_file(s) +Either a comma-separated list of Multi-FASTA formatted files OR a directory name. If a directory name is specified, RSEM will read all files with suffix ".fa" or ".fasta" in this directory. The files should contain either the sequences of transcripts or an entire genome, depending on whether the '--gtf' option is used. + +reference name +The name of the reference used. RSEM will generate several reference-related files that are prefixed by this name. This name can contain path information (e.g. '/ref/mm9'). + +OPTIONS +--gtf +If this option is on, RSEM assumes that 'reference_fasta_file(s)' contains the sequence of a genome, and will extract transcript reference sequences using the gene annotations specified in , which should be in GTF format. + +If this and '--gff3' options are off, RSEM will assume 'reference_fasta_file(s)' contains the reference transcripts. In this case, RSEM assumes that name of each sequence in the Multi-FASTA files is its transcript_id. + +(Default: off) + +--gff3 +The annotation file is in GFF3 format instead of GTF format. RSEM will first convert it to GTF format with the file name 'reference_name.gtf'. Please make sure that 'reference_name.gtf' does not exist. (Default: off) + +--gff3-RNA-patterns + is a comma-separated list of transcript categories, e.g. "mRNA,rRNA". Only transcripts that match the will be extracted. (Default: "mRNA") + +--gff3-genes-as-transcripts +This option is designed for untypical organisms, such as viruses, whose GFF3 files only contain genes. RSEM will assume each gene as a unique transcript when it converts the GFF3 file into GTF format. + +--trusted-sources + is a comma-separated list of trusted sources, e.g. "ENSEMBL,HAVANA". Only transcripts coming from these sources will be extracted. If this option is off, all sources are accepted. (Default: off) + +--transcript-to-gene-map +Use information from to map from transcript (isoform) ids to gene ids. Each line of should be of the form: + +gene_id transcript_id + +with the two fields separated by a tab character. + +If you are using a GTF file for the "UCSC Genes" gene set from the UCSC Genome Browser, then the "knownIsoforms.txt" file (obtained from the "Downloads" section of the UCSC Genome Browser site) is of this format. + +If this option is off, then the mapping of isoforms to genes depends on whether the '--gtf' option is specified. If '--gtf' is specified, then RSEM uses the "gene_id" and "transcript_id" attributes in the GTF file. Otherwise, RSEM assumes that each sequence in the reference sequence files is a separate gene. + +(Default: off) + +--allele-to-gene-map +Use information from to provide gene_id and transcript_id information for each allele-specific transcript. 
Each line of should be of the form: + +gene_id transcript_id allele_id + +with the fields separated by a tab character. + +This option is designed for quantifying allele-specific expression. It is only valid if '--gtf' option is not specified. allele_id should be the sequence names presented in the Multi-FASTA-formatted files. + +(Default: off) + +--polyA +Add poly(A) tails to the end of all reference isoforms. The length of poly(A) tail added is specified by '--polyA-length' option. STAR aligner users may not want to use this option. (Default: do not add poly(A) tail to any of the isoforms) + +--polyA-length +The length of the poly(A) tails to be added. (Default: 125) + +--no-polyA-subset +Only meaningful if '--polyA' is specified. Do not add poly(A) tails to those transcripts listed in . is a file containing a list of transcript_ids. (Default: off) + +--bowtie +Build Bowtie indices. (Default: off) + +--bowtie-path +The path to the Bowtie executables. (Default: the path to Bowtie executables is assumed to be in the user's PATH environment variable) + +--bowtie2 +Build Bowtie 2 indices. (Default: off) + +--bowtie2-path +The path to the Bowtie 2 executables. (Default: the path to Bowtie 2 executables is assumed to be in the user's PATH environment variable) + +--star +Build STAR indices. (Default: off) + +--star-path +The path to STAR's executable. (Default: the path to STAR executable is assumed to be in user's PATH environment variable) + +--star-sjdboverhang +Length of the genomic sequence around annotated junction. It is only used for STAR to build splice junctions database and not needed for Bowtie or Bowtie2. It will be passed as the --sjdbOverhang option to STAR. According to STAR's manual, its ideal value is max(ReadLength)-1, e.g. for 2x101 paired-end reads, the ideal value is 101-1=100. In most cases, the default value of 100 will work as well as the ideal value. (Default: 100) + +--hisat2-hca +Build HISAT2 indices on the transcriptome according to Human Cell Atlas (HCA) SMART-Seq2 pipeline. (Default: off) + +--hisat2-path +The path to the HISAT2 executables. (Default: the path to HISAT2 executables is assumed to be in the user's PATH environment variable) + +-p/--num-threads +Number of threads to use for building STAR's genome indices. (Default: 1) + +-q/--quiet +Suppress the output of logging information. (Default: off) + +-h/--help +Show help information. + +PRIOR-ENHANCED RSEM OPTIONS +--prep-pRSEM +A Boolean indicating whether to prepare reference files for pRSEM, including building Bowtie indices for a genome and selecting training set isoforms. The index files will be used for aligning ChIP-seq reads in prior-enhanced RSEM and the training set isoforms will be used for learning prior. A path to Bowtie executables and a mappability file in bigWig format are required when this option is on. Currently, Bowtie2 is not supported for prior-enhanced RSEM. (Default: off) + +--mappability-bigwig-file +Full path to a whole-genome mappability file in bigWig format. This file is required for running prior-enhanced RSEM. It is used for selecting a training set of isoforms for prior-learning. This file can be either downloaded from UCSC Genome Browser or generated by GEM (Derrien et al., 2012, PLoS One). (Default: "") + +DESCRIPTION +This program extracts/preprocesses the reference sequences for RSEM and prior-enhanced RSEM. It can optionally build Bowtie indices (with '--bowtie' option) and/or Bowtie 2 indices (with '--bowtie2' option) using their default parameters. 
It can also optionally build STAR indices (with '--star' option) using parameters from ENCODE3's STAR-RSEM pipeline. For prior-enhanced RSEM, it can build Bowtie genomic indices and select training set isoforms (with options '--prep-pRSEM' and '--mappability-bigwig-file '). If an alternative aligner is to be used, indices for that particular aligner can be built from either 'reference_name.idx.fa' or 'reference_name.n2g.idx.fa' (see OUTPUT for details). This program is used in conjunction with the 'rsem-calculate-expression' program. + +OUTPUT +This program will generate 'reference_name.grp', 'reference_name.ti', 'reference_name.transcripts.fa', 'reference_name.seq', 'reference_name.chrlist' (if '--gtf' is on), 'reference_name.idx.fa', 'reference_name.n2g.idx.fa', optional Bowtie/Bowtie 2 index files, and optional STAR index files. + +'reference_name.grp', 'reference_name.ti', 'reference_name.seq', and 'reference_name.chrlist' are used by RSEM internally. + +'reference_name.transcripts.fa' contains the extracted reference transcripts in Multi-FASTA format. Poly(A) tails are not added and it may contain lower case bases in its sequences if the corresponding genomic regions are soft-masked. + +'reference_name.idx.fa' and 'reference_name.n2g.idx.fa' are used by aligners to build their own indices. In these two files, all sequence bases are converted into upper case. In addition, poly(A) tails are added if '--polyA' option is set. The only difference between 'reference_name.idx.fa' and 'reference_name.n2g.idx.fa' is that 'reference_name.n2g.idx.fa' in addition converts all 'N' characters to 'G' characters. This conversion is in particular desired for aligners (e.g. Bowtie) that do not allow reads to overlap with 'N' characters in the reference sequences. Otherwise, 'reference_name.idx.fa' should be used to build the aligner's index files. RSEM uses 'reference_name.idx.fa' to build Bowtie 2 indices and 'reference_name.n2g.idx.fa' to build Bowtie indices. For visualizing the transcript-coordinate-based BAM files generated by RSEM in IGV, 'reference_name.idx.fa' should be imported as a "genome" (see Visualization section in README.md for details). + +If the whole genome is indexed for prior-enhanced RSEM, all the index files will be generated with prefix as 'reference_name_prsem'. Selected isoforms for training set are listed in the file 'reference_name_prsem.training_tr_crd' + +EXAMPLES +1) Suppose we have mouse RNA-Seq data and want to use the UCSC mm9 version of the mouse genome. We have downloaded the UCSC Genes transcript annotations in GTF format (as mm9.gtf) using the Table Browser and the knownIsoforms.txt file for mm9 from the UCSC Downloads. We also have all chromosome files for mm9 in the directory '/data/mm9'. We want to put the generated reference files under '/ref' with name 'mouse_0'. We do not add any poly(A) tails. Please note that GTF files generated from UCSC's Table Browser do not contain isoform-gene relationship information. For the UCSC Genes annotation, this information can be obtained from the knownIsoforms.txt file. Suppose we want to build Bowtie indices and Bowtie executables are found in '/sw/bowtie'. 
+ +There are two ways to write the command: + + rsem-prepare-reference --gtf mm9.gtf \ + --transcript-to-gene-map knownIsoforms.txt \ + --bowtie \ + --bowtie-path /sw/bowtie \ + /data/mm9/chr1.fa,/data/mm9/chr2.fa,...,/data/mm9/chrM.fa \ + /ref/mouse_0 +OR + + rsem-prepare-reference --gtf mm9.gtf \ + --transcript-to-gene-map knownIsoforms.txt \ + --bowtie \ + --bowtie-path /sw/bowtie \ + /data/mm9 \ + /ref/mouse_0 +2) Suppose we also want to build Bowtie 2 indices in the above example and Bowtie 2 executables are found in '/sw/bowtie2', the command will be: + + rsem-prepare-reference --gtf mm9.gtf \ + --transcript-to-gene-map knownIsoforms.txt \ + --bowtie \ + --bowtie-path /sw/bowtie \ + --bowtie2 \ + --bowtie2-path /sw/bowtie2 \ + /data/mm9 \ + /ref/mouse_0 +3) Suppose we want to build STAR indices in the above example and save index files under '/ref' with name 'mouse_0'. Assuming STAR executable is '/sw/STAR', the command will be: + + rsem-prepare-reference --gtf mm9.gtf \ + --transcript-to-gene-map knownIsoforms.txt \ + --star \ + --star-path /sw/STAR \ + -p 8 \ + /data/mm9/chr1.fa,/data/mm9/chr2.fa,...,/data/mm9/chrM.fa \ + /ref/mouse_0 +OR + + rsem-prepare-reference --gtf mm9.gtf \ + --transcript-to-gene-map knownIsoforms.txt \ + --star \ + --star-path /sw/STAR \ + -p 8 \ + /data/mm9 + /ref/mouse_0 +STAR genome index files will be saved under '/ref/'. + +4) Suppose we want to prepare references for prior-enhanced RSEM in the above example. In this scenario, both STAR and Bowtie are required to build genomic indices - STAR for RNA-seq reads and Bowtie for ChIP-seq reads. Assuming their executables are under '/sw/STAR' and '/sw/Bowtie', respectively. Also, assuming the mappability file for mouse genome is '/data/mm9.bigWig'. The command will be: + + rsem-prepare-reference --gtf mm9.gtf \ + --transcript-to-gene-map knownIsoforms.txt \ + --star \ + --star-path /sw/STAR \ + -p 8 \ + --prep-pRSEM \ + --bowtie-path /sw/Bowtie \ + --mappability-bigwig-file /data/mm9.bigWig \ + /data/mm9/chr1.fa,/data/mm9/chr2.fa,...,/data/mm9/chrM.fa \ + /ref/mouse_0 +OR + + rsem-prepare-reference --gtf mm9.gtf \ + --transcript-to-gene-map knownIsoforms.txt \ + --star \ + --star-path /sw/STAR \ + -p 8 \ + --prep-pRSEM \ + --bowtie-path /sw/Bowtie \ + --mappability-bigwig-file /data/mm9.bigWig \ + /data/mm9 + /ref/mouse_0 +Both STAR and Bowtie's index files will be saved under '/ref/'. Bowtie files will have name prefix 'mouse_0_prsem' + +5) Suppose we only have transcripts from EST tags stored in 'mm9.fasta' and isoform-gene information stored in 'mapping.txt'. We want to add 125bp long poly(A) tails to all transcripts. The reference_name is set as 'mouse_125'. 
In addition, we do not want to build Bowtie/Bowtie 2 indices, and will use an alternative aligner to align reads against either 'mouse_125.idx.fa' or 'mouse_125.idx.n2g.fa': + + rsem-prepare-reference --transcript-to-gene-map mapping.txt \ + --polyA + mm9.fasta \ + mouse_125 \ No newline at end of file diff --git a/src/rsem/rsem_prepare_reference/script.sh b/src/rsem/rsem_prepare_reference/script.sh new file mode 100644 index 00000000..806804d8 --- /dev/null +++ b/src/rsem/rsem_prepare_reference/script.sh @@ -0,0 +1,42 @@ +#!/bin/bash + +set -eo pipefail + +unset_if_false=( par_gff3_genes_as_transcripts par_polyA par_bowtie par_bowtie2 par_star par_hisat2_hca par_quiet par_prep_pRSEM ) + +for par in ${unset_if_false[@]}; do + test_val="${!par}" + [[ "$test_val" == "false" ]] && unset $par +done + +# replace ';' with ',' +par_reference_fasta_files=$(echo $par_reference_fasta_files | tr ';' ',') +par_gff3_rna_patterns=$(echo $par_gff3_rna_patterns | tr ';' ',') +par_trusted_sources=$(echo $par_trusted_sources | tr ';' ',') + +echo "$par_reference_fasta_files" +rsem-prepare-reference \ + ${par_gtf:+--gtf "${par_gtf}"} \ + ${par_gff3:+--gff3 "${par_gff3}"} \ + ${par_gff3_rna_patterns:+--gff3-RNA-patterns "${par_gff3_rna_patterns}"} \ + ${par_gff3_genes_as_transcripts:+--gff3-genes-as-transcripts "${par_gff3_genes_as_transcripts}"} \ + ${par_trusted_sources:+--trusted-sources "${par_trusted_sources}"} \ + ${par_transcript_to_gene_map:+--transcript-to-gene-map "${par_transcript_to_gene_map}"} \ + ${par_allele_to_gene_map:+--allele-to-gene-map "${par_allele_to_gene_map}"} \ + ${par_polyA:+--polyA} \ + ${par_polyA_length:+--polyA-length "${par_polyA_length}"} \ + ${par_no_polyA_subset:+--no-polyA-subset "${par_no_polyA_subset}"} \ + ${par_bowtie:+--bowtie} \ + ${par_bowtie2:+--bowtie2} \ + ${par_star:+--star} \ + ${par_star_sjdboverhang:+--star-sjdboverhang "${par_star_sjdboverhang}"} \ + ${par_hisat2_hca:+--hisat2-hca} \ + ${par_quiet:+--quiet} \ + ${par_prep_pRSEM:+--prep-pRSEM} \ + ${par_mappability_bigwig_file:+--mappability-bigwig-file "${par_mappability_bigwig_file}"} \ + ${meta_cpus:+--num-threads "${meta_cpus}"} \ + "${par_reference_fasta_files}" \ + "${par_reference_name}" + +mkdir -p "${par_output}" +mv ${par_reference_name}.* "${par_output}/" diff --git a/src/rsem/rsem_prepare_reference/test.sh b/src/rsem/rsem_prepare_reference/test.sh new file mode 100644 index 00000000..b38dd0a9 --- /dev/null +++ b/src/rsem/rsem_prepare_reference/test.sh @@ -0,0 +1,37 @@ + +#!/bin/bash + +set -e pipefail + +echo ">>> Testing $meta_functionality_name" + +cat > genome.fasta <<'EOF' +>Sheila +GCTAGCTCAGAAAAaaaNNN +EOF + +echo ">>> Prepare RSEM reference without gene annotations" +"$meta_executable" \ + --reference_fasta_files genome.fasta \ + --reference_name test \ + --output RSEM_index + +echo ">>> Checking whether output files exist" +[ ! -d "RSEM_index" ] && echo "RSEM index does not exist!" && exit 1 +[ ! -f "RSEM_index/test.grp" ] && echo "test.grp does not exist!" && exit 1 +[ ! -f "RSEM_index/test.n2g.idx.fa" ] && echo "test.n2g.idx.fa does not exist!" && exit 1 +[ ! -f "RSEM_index/test.ti" ] && echo "test.ti does not exist!" && exit 1 +[ ! -f "RSEM_index/test.idx.fa" ] && echo "test.idx.fa does not exist!" && exit 1 +[ ! -f "RSEM_index/test.seq" ] && echo "test.seq does not exist!" && exit 1 +[ ! -f "RSEM_index/test.transcripts.fa" ] && echo "test.transcripts.fa does not exist!" && exit 1 + +echo ">>> Checking whether output is correct" +[ ! 
-s "RSEM_index/test.grp" ] && echo "test.grp is empty!" && exit 1 +[ ! -s "RSEM_index/test.ti" ] && echo "test.ti is empty!" && exit 1 +[ ! -s "RSEM_index/test.seq" ] && echo "test.seq is empty!" && exit 1 +grep -q "GCTAGCTCAGAAAAaaaNNN" "RSEM_index/test.transcripts.fa" || { echo "The content of file 'test.transcripts.fa' seems to be incorrect." && exit 1; } +grep -q "GCTAGCTCAGAAAAAAANNN" "RSEM_index/test.idx.fa" || { echo "The content of file 'test.idx.fa' seems to be incorrect." && exit 1; } +grep -q "GCTAGCTCAGAAAAAAAGGG" "RSEM_index/test.n2g.idx.fa" || { echo "The content of file 'test.n2g.idx.fa' seems to be incorrect." && exit 1; } + +echo "All tests succeeded!" +exit 0