viash-hub · tverbeiren · Sep 6, 2024 · Aug 7, 2024 · Aug 7, 2024 · Aug 7, 2024
diff --git a/CHANGELOG.md b/CHANGELOG.md
@@ -29,6 +29,7 @@
 * `bedtools`:
   - `bedtools/bedtools_intersect`: Allows one to screen for overlaps between two sets of genomic features (PR #94).
   - `bedtools/bedtools_sort`: Sorts a feature file (bed/gff/vcf) by chromosome and other criteria (PR #98).
+  - `bedtools/bedtools_genomecov`: Compute the coverage of a feature file (bed/gff/vcf/bam) among a genome (PR #128).
   - `bedtools/bedtools_groupby`: Summarizes a dataset column based upon common column groupings. Akin to the SQL "group by" command (PR #123).
   - `bedtools/bedtools_merge`: Merges overlapping BED/GFF/VCF entries into a single interval (PR #118).
   - `bedtools/bedtools_bamtofastq`: Convert BAM alignments to FASTQ files (PR #101).
@@ -45,6 +46,7 @@
 
 * `fastqc`: High throughput sequence quality control analysis tool (PR #92).
 
+
 ## MINOR CHANGES
 
 * `busco` components: update BUSCO to `5.7.1` (PR #72).

diff --git a/src/bedtools/bedtools_genomecov/config.vsh.yaml b/src/bedtools/bedtools_genomecov/config.vsh.yaml
@@ -0,0 +1,208 @@
+name: bedtools_genomecov
+namespace: bedtools
+description: |
+  Compute the coverage of a feature file among a genome.
+keywords: [genome coverage, BED, GFF, VCF, BAM]
+links:
+  homepage: https://bedtools.readthedocs.io/en/latest/#
+  documentation: https://bedtools.readthedocs.io/en/latest/content/tools/genomecov.html
+  repository: https://github.com/arq5x/bedtools2
+  issue_tracker: https://github.com/arq5x/bedtools2/issues
+references:
+  doi: 10.1093/bioinformatics/btq033
+license: MIT
+requirements:
+  commands: [bedtools]
+authors:
+  - __merge__: /src/_authors/theodoro_gasperin.yaml
+    roles: [ author, maintainer ]
+
+argument_groups:
+  - name: Inputs
+    arguments:
+      - name: --input
+        alternatives: -i
+        type: file
+        direction: input
+        description: |
+          The input file (BED/GFF/VCF) to be used.
+        example: input.bed
+
+      - name: --input_bam
+        alternatives: -ibam
+        type: file
+        description: |
+          The input file is in BAM format.
+          Note: BAM _must_ be sorted by positions.
+          '--genome' option is ignored if you use '--input_bam' option!
+
+      - name: --genome
+        alternatives: -g
+        type: file
+        direction: input
+        description: |
+          The genome file to be used.
+        example: genome.txt
+
+  - name: Outputs
+    arguments:
+      - name: --output
+        type: file
+        direction: output
+        description: | 
+          The output BED file. 
+        required: true
+        example: output.bed
+
+  - name: Options
+    arguments:
+
+      - name: --depth
+        alternatives: -d
+        type: boolean_true
+        description: |
+          Report the depth at each genome position (with one-based coordinates).
+          Default behavior is to report a histogram.
+
+      - name: --depth_zero
+        alternatives: -dz
+        type: boolean_true
+        description: |
+          Report the depth at each genome position (with zero-based coordinates).
+          Reports only non-zero positions.
+          Default behavior is to report a histogram.
+
+      - name: --bed_graph
+        alternatives: -bg
+        type: boolean_true
+        description: |
+          Report depth in BedGraph format. For details, see:
+          genome.ucsc.edu/goldenPath/help/bedgraph.html
+
+      - name: --bed_graph_zero_coverage
+        alternatives: -bga
+        type: boolean_true
+        description: |
+          Report depth in BedGraph format, as above (-bg).
+          However with this option, regions with zero 
+          coverage are also reported. This allows one to
+          quickly extract all regions of a genome with 0 
+          coverage by applying: "grep -w 0$" to the output.
+
+      - name: --split
+        type: boolean_true
+        description: |
+          Treat "split" BAM or BED12 entries as distinct BED intervals.
+          when computing coverage.
+          For BAM files, this uses the CIGAR "N" and "D" operations 
+          to infer the blocks for computing coverage.
+          For BED12 files, this uses the BlockCount, BlockStarts, and BlockEnds
+          fields (i.e., columns 10,11,12).
+
+      - name: --ignore_deletion
+        alternatives: -ignoreD
+        type: boolean_true
+        description: |
+          Ignore local deletions (CIGAR "D" operations) in BAM entries
+          when computing coverage.
+
+      - name: --strand
+        type: string
+        choices: ["+", "-"]
+        description: |
+          Calculate coverage of intervals from a specific strand.
+          With BED files, requires at least 6 columns (strand is column 6). 
+
+      - name: --pair_end_coverage
+        alternatives: -pc
+        type: boolean_true
+        description: |
+          Calculate coverage of pair-end fragments.
+          Works for BAM files only
+
+      - name: --fragment_size
+        alternatives: -fs
+        type: boolean_true
+        description: |
+          Force to use provided fragment size instead of read length
+          Works for BAM files only
+
+      - name: --du
+        type: boolean_true
+        description: |
+          Change strand af the mate read (so both reads from the same strand) useful for strand specific
+          Works for BAM files only
+
+      - name: --five_prime
+        alternatives: -5
+        type: boolean_true
+        description: |
+          Calculate coverage of 5" positions (instead of entire interval).
+
+      - name: --three_prime
+        alternatives: -3
+        type: boolean_true
+        description: |
+          Calculate coverage of 3" positions (instead of entire interval).
+
+      - name: --max
+        type: integer
+        min: 0
+        description: |
+          Combine all positions with a depth >= max into
+          a single bin in the histogram. Irrelevant
+          for -d and -bedGraph
+          - (INTEGER)
+
+      - name: --scale
+        type: double
+        min: 0
+        description: |
+          Scale the coverage by a constant factor.
+          Each coverage value is multiplied by this factor before being reported.
+          Useful for normalizing coverage by, e.g., reads per million (RPM).
+          - Default is 1.0; i.e., unscaled.
+          - (FLOAT)
+
+      - name: --trackline
+        type: boolean_true
+        description: |
+          Adds a UCSC/Genome-Browser track line definition in the first line of the output.
+          - See here for more details about track line definition:
+                http://genome.ucsc.edu/goldenPath/help/bedgraph.html
+          - NOTE: When adding a trackline definition, the output BedGraph can be easily
+                uploaded to the Genome Browser as a custom track,
+                BUT CAN NOT be converted into a BigWig file (w/o removing the first line).
+
+      - name: --trackopts
+        type: string
+        description: |
+          Writes additional track line definition parameters in the first line.
+          - Example:
+            -trackopts 'name="My Track" visibility=2 color=255,30,30'
+            Note the use of single-quotes if you have spaces in your parameters.
+          - (TEXT)
+        multiple: true
+
+resources:
+  - type: bash_script
+    path: script.sh
+
+test_resources:
+  - type: bash_script
+    path: test.sh
+  - path: test_data
+
+engines:
+  - type: docker
+    image: debian:stable-slim
+    setup:
+      - type: apt
+        packages: [bedtools, procps]
+      - type: docker
+        run: |
+          echo "bedtools: \"$(bedtools --version | sed -n 's/^bedtools //p')\"" > /var/software_versions.txt
+
+runners:
+  - type: executable
+  - type: nextflow
diff --git a/src/bedtools/bedtools_genomecov/help.txt b/src/bedtools/bedtools_genomecov/help.txt
@@ -0,0 +1,101 @@
+```bash
+bedtools genomecov
+```
+
+Tool:    bedtools genomecov (aka genomeCoverageBed)
+Version: v2.30.0
+Summary: Compute the coverage of a feature file among a genome.
+
+Usage: bedtools genomecov [OPTIONS] -i <bed/gff/vcf> -g <genome>
+
+Options: 
+	-ibam		The input file is in BAM format.
+			Note: BAM _must_ be sorted by position
+
+	-d		Report the depth at each genome position (with one-based coordinates).
+			Default behavior is to report a histogram.
+
+	-dz		Report the depth at each genome position (with zero-based coordinates).
+			Reports only non-zero positions.
+			Default behavior is to report a histogram.
+
+	-bg		Report depth in BedGraph format. For details, see:
+			genome.ucsc.edu/goldenPath/help/bedgraph.html
+
+	-bga		Report depth in BedGraph format, as above (-bg).
+			However with this option, regions with zero 
+			coverage are also reported. This allows one to
+			quickly extract all regions of a genome with 0 
+			coverage by applying: "grep -w 0$" to the output.
+
+	-split		Treat "split" BAM or BED12 entries as distinct BED intervals.
+			when computing coverage.
+			For BAM files, this uses the CIGAR "N" and "D" operations 
+			to infer the blocks for computing coverage.
+			For BED12 files, this uses the BlockCount, BlockStarts, and BlockEnds
+			fields (i.e., columns 10,11,12).
+
+	-ignoreD	Ignore local deletions (CIGAR "D" operations) in BAM entries
+			when computing coverage.
+
+	-strand		Calculate coverage of intervals from a specific strand.
+			With BED files, requires at least 6 columns (strand is column 6). 
+			- (STRING): can be + or -
+
+	-pc		Calculate coverage of pair-end fragments.
+			Works for BAM files only
+	-fs		Force to use provided fragment size instead of read length
+			Works for BAM files only
+	-du		Change strand af the mate read (so both reads from the same strand) useful for strand specific
+			Works for BAM files only
+	-5		Calculate coverage of 5" positions (instead of entire interval).
+
+	-3		Calculate coverage of 3" positions (instead of entire interval).
+
+	-max		Combine all positions with a depth >= max into
+			a single bin in the histogram. Irrelevant
+			for -d and -bedGraph
+			- (INTEGER)
+
+	-scale		Scale the coverage by a constant factor.
+			Each coverage value is multiplied by this factor before being reported.
+			Useful for normalizing coverage by, e.g., reads per million (RPM).
+			- Default is 1.0; i.e., unscaled.
+			- (FLOAT)
+
+	-trackline	Adds a UCSC/Genome-Browser track line definition in the first line of the output.
+			- See here for more details about track line definition:
+			      http://genome.ucsc.edu/goldenPath/help/bedgraph.html
+			- NOTE: When adding a trackline definition, the output BedGraph can be easily
+			      uploaded to the Genome Browser as a custom track,
+			      BUT CAN NOT be converted into a BigWig file (w/o removing the first line).
+
+	-trackopts	Writes additional track line definition parameters in the first line.
+			- Example:
+			   -trackopts 'name="My Track" visibility=2 color=255,30,30'
+			   Note the use of single-quotes if you have spaces in your parameters.
+			- (TEXT)
+
+Notes: 
+	(1) The genome file should tab delimited and structured as follows:
+	 <chromName><TAB><chromSize>
+
+	For example, Human (hg19):
+	chr1	249250621
+	chr2	243199373
+	...
+	chr18_gl000207_random	4262
+
+	(2) The input BED (-i) file must be grouped by chromosome.
+	 A simple "sort -k 1,1 <BED> > <BED>.sorted" will suffice.
+
+	(3) The input BAM (-ibam) file must be sorted by position.
+	 A "samtools sort <BAM>" should suffice.
+
+Tips: 
+	One can use the UCSC Genome Browser's MySQL database to extract
+	chromosome sizes. For example, H. sapiens:
+
+	mysql --user=genome --host=genome-mysql.cse.ucsc.edu -A -e \
+	"select chrom, size from hg19.chromInfo" > hg19.genome
+
diff --git a/src/bedtools/bedtools_genomecov/script.sh b/src/bedtools/bedtools_genomecov/script.sh
@@ -0,0 +1,55 @@
+#!/bin/bash
+
+## VIASH START
+## VIASH END
+
+# Exit on error
+set -eo pipefail
+
+# Unset variables
+unset_if_false=(
+    par_input_bam
+    par_depth
+    par_depth_zero
+    par_bed_graph
+    par_bed_graph_zero_coverage
+    par_split
+    par_ignore_deletion
+    par_pair_end_coverage
+    par_fragment_size
+    par_du
+    par_five_prime
+    par_three_prime
+    par_trackline
+)
+
+for par in ${unset_if_false[@]}; do
+    test_val="${!par}"
+    [[ "$test_val" == "false" ]] && unset $par
+done
+
+# Create input array 
+IFS=";" read -ra trackopts <<< $par_trackopts
+
+bedtools genomecov \
+    ${par_depth:+-d} \
+    ${par_depth_zero:+-dz} \
+    ${par_bed_graph:+-bg} \
+    ${par_bed_graph_zero_coverage:+-bga} \
+    ${par_split:+-split} \
+    ${par_ignore_deletion:+-ignoreD} \
+    ${par_du:+-du} \
+    ${par_five_prime:+-5} \
+    ${par_three_prime:+-3} \
+    ${par_trackline:+-trackline} \
+    ${par_strand:+-strand "$par_strand"} \
+    ${par_max:+-max "$par_max"} \
+    ${par_scale:+-scale "$par_scale"} \
+    ${par_trackopts:+-trackopts "${trackopts[*]}"} \
+    ${par_input_bam:+-ibam "$par_input_bam"} \
+    ${par_input:+-i "$par_input"} \
+    ${par_genome:+-g "$par_genome"} \
+    ${par_pair_end_coverage:+-pc} \
+    ${par_fragment_size:+-fs} \
+    > "$par_output"
+