diff --git a/CHANGELOG.md b/CHANGELOG.md index 3fc134fd..0e32edb1 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -17,6 +17,7 @@ * `rsem/rsem_calculate_expression`: Calculate expression levels (PR #93). * `rseqc`: + - `rseqc/rseqc_inner_distance`: Calculate inner distance between read pairs (PR #159). - `rseqc/rseqc_inferexperiment`: Infer strandedness from sequencing reads (PR #158). - `rseqc/bam_stat`: Generate statistics from a bam file (PR #155). diff --git a/src/rseqc/rseqc_inner_distance/config.vsh.yaml b/src/rseqc/rseqc_inner_distance/config.vsh.yaml new file mode 100644 index 00000000..e050bb24 --- /dev/null +++ b/src/rseqc/rseqc_inner_distance/config.vsh.yaml @@ -0,0 +1,116 @@ +name: "rseqc_inner_distance" +namespace: "rseqc" +description: | + Calculate inner distance between read pairs. +links: + homepage: https://rseqc.sourceforge.net/ + documentation: https://rseqc.sourceforge.net/#inner-distance-py + issue_tracker: https://github.com/MonashBioinformaticsPlatform/RSeQC/issues + repository: https://github.com/MonashBioinformaticsPlatform/RSeQC +references: + doi: 10.1093/bioinformatics/bts356 +license: GPL-3.0 +authors: + - __merge__: /src/_authors/emma_rousseau.yaml + roles: [ author, maintainer ] + +argument_groups: +- name: "Input" + arguments: + - name: "--input_file" + alternatives: ["-i"] + type: file + required: true + description: input alignment file in BAM or SAM format + + - name: "--refgene" + alternatives: ["-r"] + type: file + required: true + description: Reference gene model in bed format + + - name: "--sample_size" + alternatives: ["-k"] + type: integer + example: 1000000 + description: Numer of reads sampled from SAM/BAM file, default = 1000000. + + - name: "--mapq" + alternatives: ["-q"] + type: integer + example: 30 + description: Minimum mapping quality (phred scaled) to determine uniquely mapped reads, default=30. + + - name: "--lower_bound" + alternatives: ["-l"] + type: integer + example: -250 + description: Lower bound of inner distance (bp). This option is used for ploting histograme, default=-250. + + - name: "--upper_bound" + alternatives: ["-u"] + type: integer + example: 250 + description: Upper bound of inner distance (bp). This option is used for ploting histograme, default=250. + + - name: "--step" + alternatives: ["-s"] + type: integer + example: 5 + description: Step size (bp) of histograme. This option is used for plotting histogram, default=5. + +- name: "Output" + arguments: + - name: "--output_prefix" + alternatives: ["-o"] + type: string + required: true + description: Rrefix of output files. + + - name: "--output_stats" + type: file + direction: output + description: output file (txt) with summary statistics of inner distances of paired reads + + - name: "--output_dist" + type: file + direction: output + description: output file (txt) with inner distances of all paired reads + + - name: "--output_freq" + type: file + direction: output + description: output file (txt) with frequencies of inner distances of all paired reads + + - name: "--output_plot" + type: file + direction: output + description: output file (pdf) with histogram plot of of inner distances of all paired reads + + - name: "--output_plot_r" + type: file + direction: output + description: output file (R) with script of histogram plot of of inner distances of all paired reads + +resources: + - type: bash_script + path: script.sh +test_resources: + - type: bash_script + path: test.sh + - path: test_data + +engines: +- type: docker + image: python:3.10 + setup: + - type: apt + packages: [r-base] + - type: python + packages: [ RSeQC ] + - type: docker + run: | + echo "RSeQC - inner_distance.py: $(inner_distance.py --version | cut -d' ' -f2)" > /var/software_versions.txt +runners: +- type: executable +- type: nextflow \ No newline at end of file diff --git a/src/rseqc/rseqc_inner_distance/help.txt b/src/rseqc/rseqc_inner_distance/help.txt new file mode 100644 index 00000000..18f97bb6 --- /dev/null +++ b/src/rseqc/rseqc_inner_distance/help.txt @@ -0,0 +1,43 @@ +``` +inner_distance.py --help +``` + +Usage: inner_distance.py [options] + +Calculate the inner distance (insert size) of RNA-seq fragments. + + RNA fragment + _________________||_________________ +| | +| | +||||||||||------------------|||||||||| + read_1 insert_size read_2 + +fragment size = read_1 + insert_size + read_2 + + + +Options: + --version show program's version number and exit + -h, --help show this help message and exit + -i INPUT_FILE, --input-file=INPUT_FILE + Alignment file in BAM or SAM format. + -o OUTPUT_PREFIX, --out-prefix=OUTPUT_PREFIX + Prefix of output files(s) + -r REF_GENE, --refgene=REF_GENE + Reference gene model in BED format. + -k SAMPLESIZE, --sample-size=SAMPLESIZE + Number of read-pairs used to estimate inner distance. + default=1000000 + -l LOWER_BOUND_SIZE, --lower-bound=LOWER_BOUND_SIZE + Lower bound of inner distance (bp). This option is + used for ploting histograme. default=-250 + -u UPPER_BOUND_SIZE, --upper-bound=UPPER_BOUND_SIZE + Upper bound of inner distance (bp). This option is + used for plotting histogram. default=250 + -s STEP_SIZE, --step=STEP_SIZE + Step size (bp) of histograme. This option is used for + plotting histogram. default=5 + -q MAP_QUAL, --mapq=MAP_QUAL + Minimum mapping quality (phred scaled) for an + alignment to be called "uniquely mapped". default=30 \ No newline at end of file diff --git a/src/rseqc/rseqc_inner_distance/script.sh b/src/rseqc/rseqc_inner_distance/script.sh new file mode 100644 index 00000000..fe00c590 --- /dev/null +++ b/src/rseqc/rseqc_inner_distance/script.sh @@ -0,0 +1,25 @@ +#!/bin/bash + +set -exo pipefail + + +inner_distance.py \ + -i $par_input_file \ + -r $par_refgene \ + -o $par_output_prefix \ + ${par_sample_size:+-k "${par_sample_size}"} \ + ${par_lower_bound:+-l "${par_lower_bound}"} \ + ${par_upper_bound:+-u "${par_upper_bound}"} \ + ${par_step:+-s "${par_step}"} \ + ${par_mapq:+-q "${par_mapq}"} \ +> stdout.txt + +if [[ -n $par_output_stats ]]; then head -n 2 stdout.txt > $par_output_stats; fi + + +[[ -n "$par_output_dist" && -f "$par_output_prefix.inner_distance.txt" ]] && mv $par_output_prefix.inner_distance.txt $par_output_dist +[[ -n "$par_output_plot" && -f "$par_output_prefix.inner_distance_plot.pdf" ]] && mv $par_output_prefix.inner_distance_plot.pdf $par_output_plot +[[ -n "$par_output_plot_r" && -f "$par_output_prefix.inner_distance_plot.r" ]] && mv $par_output_prefix.inner_distance_plot.r $par_output_plot_r +[[ -n "$par_output_freq" && -f "$par_output_prefix.inner_distance_freq.txt" ]] && mv $par_output_prefix.inner_distance_freq.txt $par_output_freq + +exit 0 \ No newline at end of file diff --git a/src/rseqc/rseqc_inner_distance/test.sh b/src/rseqc/rseqc_inner_distance/test.sh new file mode 100644 index 00000000..927a69a9 --- /dev/null +++ b/src/rseqc/rseqc_inner_distance/test.sh @@ -0,0 +1,77 @@ +#!/bin/bash + + +# define input and output for script +input_bam="$meta_resources_dir/test_data/test.paired_end.sorted.bam" +input_bed="$meta_resources_dir/test_data/test.bed12" + +output_stats="inner_distance_stats.txt" +output_dist="inner_distance.txt" +output_plot="inner_distance_plot.pdf" +output_plot_r="inner_distance_plot.r" +output_freq="inner_distance_freq.txt" + +# Run executable +echo "> Running $meta_functionality_name" + +"$meta_executable" \ + --input_file $input_bam \ + --refgene $input_bed \ + --output_prefix "test" \ + --output_stats $output_stats \ + --output_dist $output_dist \ + --output_plot $output_plot \ + --output_plot_r $output_plot_r \ + --output_freq $output_freq + +exit_code=$? +[[ $exit_code != 0 ]] && echo "Non zero exit code: $exit_code" && exit 1 + +echo ">> Check whether output is present and not empty" + +[[ -f "$output_stats" ]] || { echo "$output_stats was not created"; exit 1; } +[[ -s "$output_stats" ]] || { echo "$output_stats is empty"; exit 1; } +[[ -f "$output_dist" ]] || { echo "$output_dist was not created"; exit 1; } +[[ -s "$output_dist" ]] || { echo "$output_dist is empty"; exit 1; } +[[ -f "$output_plot" ]] || { echo "$output_plot was not created"; exit 1; } +[[ -s "$output_plot" ]] || { echo "$output_plot is empty"; exit 1; } +[[ -f "$output_plot_r" ]] || { echo "$output_plot_r was not created"; exit 1; } +[[ -s "$output_plot_r" ]] || { echo "$output_plot_r is empty"; exit 1; } +[[ -f "$output_freq" ]] || { echo "$output_freq was created"; exit 1; } +[[ -s "$output_freq" ]] || { echo "$output_freq is empty"; exit 1; } + +echo ">> Check whether output is correct" +diff "$output_freq" "$meta_resources_dir/test_data/test1.inner_distance_freq.txt" || { echo "Output is not correct"; exit 1; } +diff "$output_dist" "$meta_resources_dir/test_data/test1.inner_distance.txt" || { echo "Output is not correct"; exit 1; } + +# clean up +rm "$output_stats" "$output_dist" "$output_plot" "$output_plot_r" "$output_freq" +################################################################################ + +echo "> Running $meta_functionality_name with non-default parameters and default output file names" +"$meta_executable" \ + --input_file $input_bam \ + --refgene $input_bed \ + --output_prefix "test" \ + --sample_size 4 \ + --mapq 10 + +exit_code=$? +[[ $exit_code != 0 ]] && echo "Non zero exit code: $exit_code" && exit 1 + +echo ">> Check whether output is present and not empty" + +[[ -f "test.inner_distance.txt" ]] || { echo "test.inner_distance.txt was not created"; exit 1; } +[[ -s "test.inner_distance.txt" ]] || { echo "test.inner_distance.txt is empty"; exit 1; } +[[ -f "test.inner_distance_plot.pdf" ]] || { echo "test.inner_distance_plot.pdf was not created"; exit 1; } +[[ -s "test.inner_distance_plot.pdf" ]] || { echo "test.inner_distance_plot.pdf is empty"; exit 1; } +[[ -f "test.inner_distance_plot.r" ]] || { echo "test.inner_distance_plot.r was not created"; exit 1; } +[[ -s "test.inner_distance_plot.r" ]] || { echo "test.inner_distance_plot.r is empty"; exit 1; } +[[ -f "test.inner_distance_freq.txt" ]] || { echo "test.inner_distance_freq.txt was created"; exit 1; } +[[ -s "test.inner_distance_freq.txt" ]] || { echo "test.inner_distance_freq.txt is empty"; exit 1; } + +echo ">> Check whether output is correct" +diff "test.inner_distance_freq.txt" "$meta_resources_dir/test_data/test2.inner_distance_freq.txt" || { echo "Output is not correct"; exit 1; } +diff "test.inner_distance.txt" "$meta_resources_dir/test_data/test2.inner_distance.txt" || { echo "Output is not correct"; exit 1; } + +exit 0 \ No newline at end of file diff --git a/src/rseqc/rseqc_inner_distance/test_data/test.bed12 b/src/rseqc/rseqc_inner_distance/test_data/test.bed12 new file mode 100644 index 00000000..33a46951 --- /dev/null +++ b/src/rseqc/rseqc_inner_distance/test_data/test.bed12 @@ -0,0 +1,4 @@ +MT192765.1 1242 1264 nCoV-2019_5_LEFT 1 + 1242 1264 0 2 10,12, 0,10, +MT192765.1 1573 1595 nCoV-2019_6_LEFT 2 + 1573 1595 0 2 7,15, 0,7, +MT192765.1 1623 1651 nCoV-2019_5_RIGHT 1 - 1623 1651 0 2 14,14, 0,14, +MT192765.1 1942 1964 nCoV-2019_6_RIGHT 2 - 1942 1964 0 2 11,11 0,11, diff --git a/src/rseqc/rseqc_inner_distance/test_data/test.paired_end.sorted.bam b/src/rseqc/rseqc_inner_distance/test_data/test.paired_end.sorted.bam new file mode 100644 index 00000000..8b215e12 Binary files /dev/null and b/src/rseqc/rseqc_inner_distance/test_data/test.paired_end.sorted.bam differ diff --git a/src/rseqc/rseqc_inner_distance/test_data/test1.inner_distance.txt b/src/rseqc/rseqc_inner_distance/test_data/test1.inner_distance.txt new file mode 100644 index 00000000..e5f09f8f --- /dev/null +++ b/src/rseqc/rseqc_inner_distance/test_data/test1.inner_distance.txt @@ -0,0 +1,49 @@ +ERR5069949.29668 -4 sameTranscript=No,dist=genomic +ERR5069949.114870 -45 sameTranscript=No,dist=genomic +ERR5069949.147998 94 sameTranscript=No,dist=genomic +ERR5069949.155944 -105 sameTranscript=No,dist=genomic +ERR5069949.184542 49 sameTranscript=No,dist=genomic +ERR5069949.169513 -92 sameTranscript=No,dist=genomic +ERR5069949.257821 -139 sameTranscript=No,dist=genomic +ERR5069949.309410 13 sameTranscript=No,dist=genomic +ERR5069949.376959 -66 sameTranscript=No,dist=genomic +ERR5069949.366975 -106 sameTranscript=No,dist=genomic +ERR5069949.465452 -19 sameTranscript=No,dist=genomic +ERR5069949.479807 5 sameTranscript=No,dist=genomic +ERR5069949.501486 -82 sameTranscript=No,dist=genomic +ERR5069949.532979 -96 sameTranscript=No,dist=genomic +ERR5069949.540529 -61 sameTranscript=No,dist=genomic +ERR5069949.573706 -63 sameTranscript=No,dist=genomic +ERR5069949.576388 -77 sameTranscript=No,dist=genomic +ERR5069949.611123 -125 sameTranscript=No,dist=genomic +ERR5069949.651338 -33 sameTranscript=No,dist=genomic +ERR5069949.686090 -29 sameTranscript=No,dist=genomic +ERR5069949.786562 42 sameTranscript=No,dist=genomic +ERR5069949.870926 -22 sameTranscript=No,dist=genomic +ERR5069949.856527 -69 sameTranscript=No,dist=genomic +ERR5069949.885966 -32 sameTranscript=No,dist=genomic +ERR5069949.937422 18 sameTranscript=No,dist=genomic +ERR5069949.919671 -116 sameTranscript=No,dist=genomic +ERR5069949.973930 -79 sameTranscript=No,dist=genomic +ERR5069949.986441 -22 sameTranscript=No,dist=genomic +ERR5069949.1014693 -150 sameTranscript=No,dist=genomic +ERR5069949.1020777 -122 sameTranscript=No,dist=genomic +ERR5069949.1066259 -4 sameTranscript=No,dist=genomic +ERR5069949.1062611 -124 sameTranscript=No,dist=genomic +ERR5069949.1067032 -103 sameTranscript=No,dist=genomic +ERR5069949.1088785 -101 sameTranscript=No,dist=genomic +ERR5069949.1132353 -142 sameTranscript=No,dist=genomic +ERR5069949.1151736 -55 sameTranscript=No,dist=genomic +ERR5069949.1258508 62 sameTranscript=No,dist=genomic +ERR5069949.1189252 -98 sameTranscript=No,dist=genomic +ERR5069949.1261808 -88 sameTranscript=No,dist=genomic +ERR5069949.1246538 -122 sameTranscript=No,dist=genomic +ERR5069949.1328186 -64 sameTranscript=No,dist=genomic +ERR5069949.1331889 -132 sameTranscript=No,dist=genomic +ERR5069949.1372331 -29 sameTranscript=No,dist=genomic +ERR5069949.1340552 -140 sameTranscript=No,dist=genomic +ERR5069949.1412839 -117 sameTranscript=No,dist=genomic +ERR5069949.1476386 -98 sameTranscript=No,dist=genomic +ERR5069949.1538968 -133 sameTranscript=No,dist=genomic +ERR5069949.1552198 -67 sameTranscript=No,dist=genomic +ERR5069949.1561137 -59 sameTranscript=No,dist=genomic diff --git a/src/rseqc/rseqc_inner_distance/test_data/test1.inner_distance_freq.txt b/src/rseqc/rseqc_inner_distance/test_data/test1.inner_distance_freq.txt new file mode 100644 index 00000000..908326ff --- /dev/null +++ b/src/rseqc/rseqc_inner_distance/test_data/test1.inner_distance_freq.txt @@ -0,0 +1,100 @@ +-250 -245 0 +-245 -240 0 +-240 -235 0 +-235 -230 0 +-230 -225 0 +-225 -220 0 +-220 -215 0 +-215 -210 0 +-210 -205 0 +-205 -200 0 +-200 -195 0 +-195 -190 0 +-190 -185 0 +-185 -180 0 +-180 -175 0 +-175 -170 0 +-170 -165 0 +-165 -160 0 +-160 -155 0 +-155 -150 1 +-150 -145 0 +-145 -140 2 +-140 -135 1 +-135 -130 2 +-130 -125 1 +-125 -120 3 +-120 -115 2 +-115 -110 0 +-110 -105 2 +-105 -100 2 +-100 -95 3 +-95 -90 1 +-90 -85 1 +-85 -80 1 +-80 -75 2 +-75 -70 0 +-70 -65 3 +-65 -60 3 +-60 -55 2 +-55 -50 0 +-50 -45 1 +-45 -40 0 +-40 -35 0 +-35 -30 2 +-30 -25 2 +-25 -20 2 +-20 -15 1 +-15 -10 0 +-10 -5 0 +-5 0 2 +0 5 1 +5 10 0 +10 15 1 +15 20 1 +20 25 0 +25 30 0 +30 35 0 +35 40 0 +40 45 1 +45 50 1 +50 55 0 +55 60 0 +60 65 1 +65 70 0 +70 75 0 +75 80 0 +80 85 0 +85 90 0 +90 95 1 +95 100 0 +100 105 0 +105 110 0 +110 115 0 +115 120 0 +120 125 0 +125 130 0 +130 135 0 +135 140 0 +140 145 0 +145 150 0 +150 155 0 +155 160 0 +160 165 0 +165 170 0 +170 175 0 +175 180 0 +180 185 0 +185 190 0 +190 195 0 +195 200 0 +200 205 0 +205 210 0 +210 215 0 +215 220 0 +220 225 0 +225 230 0 +230 235 0 +235 240 0 +240 245 0 +245 250 0 diff --git a/src/rseqc/rseqc_inner_distance/test_data/test2.inner_distance.txt b/src/rseqc/rseqc_inner_distance/test_data/test2.inner_distance.txt new file mode 100644 index 00000000..a1930c9e --- /dev/null +++ b/src/rseqc/rseqc_inner_distance/test_data/test2.inner_distance.txt @@ -0,0 +1,4 @@ +ERR5069949.29668 -4 sameTranscript=No,dist=genomic +ERR5069949.114870 -45 sameTranscript=No,dist=genomic +ERR5069949.147998 94 sameTranscript=No,dist=genomic +ERR5069949.155944 -105 sameTranscript=No,dist=genomic diff --git a/src/rseqc/rseqc_inner_distance/test_data/test2.inner_distance_freq.txt b/src/rseqc/rseqc_inner_distance/test_data/test2.inner_distance_freq.txt new file mode 100644 index 00000000..021311a2 --- /dev/null +++ b/src/rseqc/rseqc_inner_distance/test_data/test2.inner_distance_freq.txt @@ -0,0 +1,100 @@ +-250 -245 0 +-245 -240 0 +-240 -235 0 +-235 -230 0 +-230 -225 0 +-225 -220 0 +-220 -215 0 +-215 -210 0 +-210 -205 0 +-205 -200 0 +-200 -195 0 +-195 -190 0 +-190 -185 0 +-185 -180 0 +-180 -175 0 +-175 -170 0 +-170 -165 0 +-165 -160 0 +-160 -155 0 +-155 -150 0 +-150 -145 0 +-145 -140 0 +-140 -135 0 +-135 -130 0 +-130 -125 0 +-125 -120 0 +-120 -115 0 +-115 -110 0 +-110 -105 1 +-105 -100 0 +-100 -95 0 +-95 -90 0 +-90 -85 0 +-85 -80 0 +-80 -75 0 +-75 -70 0 +-70 -65 0 +-65 -60 0 +-60 -55 0 +-55 -50 0 +-50 -45 1 +-45 -40 0 +-40 -35 0 +-35 -30 0 +-30 -25 0 +-25 -20 0 +-20 -15 0 +-15 -10 0 +-10 -5 0 +-5 0 1 +0 5 0 +5 10 0 +10 15 0 +15 20 0 +20 25 0 +25 30 0 +30 35 0 +35 40 0 +40 45 0 +45 50 0 +50 55 0 +55 60 0 +60 65 0 +65 70 0 +70 75 0 +75 80 0 +80 85 0 +85 90 0 +90 95 1 +95 100 0 +100 105 0 +105 110 0 +110 115 0 +115 120 0 +120 125 0 +125 130 0 +130 135 0 +135 140 0 +140 145 0 +145 150 0 +150 155 0 +155 160 0 +160 165 0 +165 170 0 +170 175 0 +175 180 0 +180 185 0 +185 190 0 +190 195 0 +195 200 0 +200 205 0 +205 210 0 +210 215 0 +215 220 0 +220 225 0 +225 230 0 +230 235 0 +235 240 0 +240 245 0 +245 250 0