diff --git a/CHANGELOG.md b/CHANGELOG.md index 98e78c17..8f772450 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -29,6 +29,7 @@ * `bedtools`: - `bedtools/bedtools_intersect`: Allows one to screen for overlaps between two sets of genomic features (PR #94). - `bedtools/bedtools_sort`: Sorts a feature file (bed/gff/vcf) by chromosome and other criteria (PR #98). + - `bedtools/bedtools_genomecov`: Compute the coverage of a feature file (bed/gff/vcf/bam) among a genome (PR #128). - `bedtools/bedtools_groupby`: Summarizes a dataset column based upon common column groupings. Akin to the SQL "group by" command (PR #123). - `bedtools/bedtools_merge`: Merges overlapping BED/GFF/VCF entries into a single interval (PR #118). - `bedtools/bedtools_bamtofastq`: Convert BAM alignments to FASTQ files (PR #101). @@ -45,6 +46,7 @@ * `fastqc`: High throughput sequence quality control analysis tool (PR #92). + ## MINOR CHANGES * `busco` components: update BUSCO to `5.7.1` (PR #72). diff --git a/src/bedtools/bedtools_genomecov/config.vsh.yaml b/src/bedtools/bedtools_genomecov/config.vsh.yaml new file mode 100644 index 00000000..775587de --- /dev/null +++ b/src/bedtools/bedtools_genomecov/config.vsh.yaml @@ -0,0 +1,208 @@ +name: bedtools_genomecov +namespace: bedtools +description: | + Compute the coverage of a feature file among a genome. +keywords: [genome coverage, BED, GFF, VCF, BAM] +links: + homepage: https://bedtools.readthedocs.io/en/latest/# + documentation: https://bedtools.readthedocs.io/en/latest/content/tools/genomecov.html + repository: https://github.com/arq5x/bedtools2 + issue_tracker: https://github.com/arq5x/bedtools2/issues +references: + doi: 10.1093/bioinformatics/btq033 +license: MIT +requirements: + commands: [bedtools] +authors: + - __merge__: /src/_authors/theodoro_gasperin.yaml + roles: [ author, maintainer ] + +argument_groups: + - name: Inputs + arguments: + - name: --input + alternatives: -i + type: file + direction: input + description: | + The input file (BED/GFF/VCF) to be used. + example: input.bed + + - name: --input_bam + alternatives: -ibam + type: file + description: | + The input file is in BAM format. + Note: BAM _must_ be sorted by positions. + '--genome' option is ignored if you use '--input_bam' option! + + - name: --genome + alternatives: -g + type: file + direction: input + description: | + The genome file to be used. + example: genome.txt + + - name: Outputs + arguments: + - name: --output + type: file + direction: output + description: | + The output BED file. + required: true + example: output.bed + + - name: Options + arguments: + + - name: --depth + alternatives: -d + type: boolean_true + description: | + Report the depth at each genome position (with one-based coordinates). + Default behavior is to report a histogram. + + - name: --depth_zero + alternatives: -dz + type: boolean_true + description: | + Report the depth at each genome position (with zero-based coordinates). + Reports only non-zero positions. + Default behavior is to report a histogram. + + - name: --bed_graph + alternatives: -bg + type: boolean_true + description: | + Report depth in BedGraph format. For details, see: + genome.ucsc.edu/goldenPath/help/bedgraph.html + + - name: --bed_graph_zero_coverage + alternatives: -bga + type: boolean_true + description: | + Report depth in BedGraph format, as above (-bg). + However with this option, regions with zero + coverage are also reported. This allows one to + quickly extract all regions of a genome with 0 + coverage by applying: "grep -w 0$" to the output. + + - name: --split + type: boolean_true + description: | + Treat "split" BAM or BED12 entries as distinct BED intervals. + when computing coverage. + For BAM files, this uses the CIGAR "N" and "D" operations + to infer the blocks for computing coverage. + For BED12 files, this uses the BlockCount, BlockStarts, and BlockEnds + fields (i.e., columns 10,11,12). + + - name: --ignore_deletion + alternatives: -ignoreD + type: boolean_true + description: | + Ignore local deletions (CIGAR "D" operations) in BAM entries + when computing coverage. + + - name: --strand + type: string + choices: ["+", "-"] + description: | + Calculate coverage of intervals from a specific strand. + With BED files, requires at least 6 columns (strand is column 6). + + - name: --pair_end_coverage + alternatives: -pc + type: boolean_true + description: | + Calculate coverage of pair-end fragments. + Works for BAM files only + + - name: --fragment_size + alternatives: -fs + type: boolean_true + description: | + Force to use provided fragment size instead of read length + Works for BAM files only + + - name: --du + type: boolean_true + description: | + Change strand af the mate read (so both reads from the same strand) useful for strand specific + Works for BAM files only + + - name: --five_prime + alternatives: -5 + type: boolean_true + description: | + Calculate coverage of 5" positions (instead of entire interval). + + - name: --three_prime + alternatives: -3 + type: boolean_true + description: | + Calculate coverage of 3" positions (instead of entire interval). + + - name: --max + type: integer + min: 0 + description: | + Combine all positions with a depth >= max into + a single bin in the histogram. Irrelevant + for -d and -bedGraph + - (INTEGER) + + - name: --scale + type: double + min: 0 + description: | + Scale the coverage by a constant factor. + Each coverage value is multiplied by this factor before being reported. + Useful for normalizing coverage by, e.g., reads per million (RPM). + - Default is 1.0; i.e., unscaled. + - (FLOAT) + + - name: --trackline + type: boolean_true + description: | + Adds a UCSC/Genome-Browser track line definition in the first line of the output. + - See here for more details about track line definition: + http://genome.ucsc.edu/goldenPath/help/bedgraph.html + - NOTE: When adding a trackline definition, the output BedGraph can be easily + uploaded to the Genome Browser as a custom track, + BUT CAN NOT be converted into a BigWig file (w/o removing the first line). + + - name: --trackopts + type: string + description: | + Writes additional track line definition parameters in the first line. + - Example: + -trackopts 'name="My Track" visibility=2 color=255,30,30' + Note the use of single-quotes if you have spaces in your parameters. + - (TEXT) + multiple: true + +resources: + - type: bash_script + path: script.sh + +test_resources: + - type: bash_script + path: test.sh + - path: test_data + +engines: + - type: docker + image: debian:stable-slim + setup: + - type: apt + packages: [bedtools, procps] + - type: docker + run: | + echo "bedtools: \"$(bedtools --version | sed -n 's/^bedtools //p')\"" > /var/software_versions.txt + +runners: + - type: executable + - type: nextflow \ No newline at end of file diff --git a/src/bedtools/bedtools_genomecov/help.txt b/src/bedtools/bedtools_genomecov/help.txt new file mode 100644 index 00000000..f13a71d3 --- /dev/null +++ b/src/bedtools/bedtools_genomecov/help.txt @@ -0,0 +1,101 @@ +```bash +bedtools genomecov +``` + +Tool: bedtools genomecov (aka genomeCoverageBed) +Version: v2.30.0 +Summary: Compute the coverage of a feature file among a genome. + +Usage: bedtools genomecov [OPTIONS] -i -g + +Options: + -ibam The input file is in BAM format. + Note: BAM _must_ be sorted by position + + -d Report the depth at each genome position (with one-based coordinates). + Default behavior is to report a histogram. + + -dz Report the depth at each genome position (with zero-based coordinates). + Reports only non-zero positions. + Default behavior is to report a histogram. + + -bg Report depth in BedGraph format. For details, see: + genome.ucsc.edu/goldenPath/help/bedgraph.html + + -bga Report depth in BedGraph format, as above (-bg). + However with this option, regions with zero + coverage are also reported. This allows one to + quickly extract all regions of a genome with 0 + coverage by applying: "grep -w 0$" to the output. + + -split Treat "split" BAM or BED12 entries as distinct BED intervals. + when computing coverage. + For BAM files, this uses the CIGAR "N" and "D" operations + to infer the blocks for computing coverage. + For BED12 files, this uses the BlockCount, BlockStarts, and BlockEnds + fields (i.e., columns 10,11,12). + + -ignoreD Ignore local deletions (CIGAR "D" operations) in BAM entries + when computing coverage. + + -strand Calculate coverage of intervals from a specific strand. + With BED files, requires at least 6 columns (strand is column 6). + - (STRING): can be + or - + + -pc Calculate coverage of pair-end fragments. + Works for BAM files only + -fs Force to use provided fragment size instead of read length + Works for BAM files only + -du Change strand af the mate read (so both reads from the same strand) useful for strand specific + Works for BAM files only + -5 Calculate coverage of 5" positions (instead of entire interval). + + -3 Calculate coverage of 3" positions (instead of entire interval). + + -max Combine all positions with a depth >= max into + a single bin in the histogram. Irrelevant + for -d and -bedGraph + - (INTEGER) + + -scale Scale the coverage by a constant factor. + Each coverage value is multiplied by this factor before being reported. + Useful for normalizing coverage by, e.g., reads per million (RPM). + - Default is 1.0; i.e., unscaled. + - (FLOAT) + + -trackline Adds a UCSC/Genome-Browser track line definition in the first line of the output. + - See here for more details about track line definition: + http://genome.ucsc.edu/goldenPath/help/bedgraph.html + - NOTE: When adding a trackline definition, the output BedGraph can be easily + uploaded to the Genome Browser as a custom track, + BUT CAN NOT be converted into a BigWig file (w/o removing the first line). + + -trackopts Writes additional track line definition parameters in the first line. + - Example: + -trackopts 'name="My Track" visibility=2 color=255,30,30' + Note the use of single-quotes if you have spaces in your parameters. + - (TEXT) + +Notes: + (1) The genome file should tab delimited and structured as follows: + + + For example, Human (hg19): + chr1 249250621 + chr2 243199373 + ... + chr18_gl000207_random 4262 + + (2) The input BED (-i) file must be grouped by chromosome. + A simple "sort -k 1,1 > .sorted" will suffice. + + (3) The input BAM (-ibam) file must be sorted by position. + A "samtools sort " should suffice. + +Tips: + One can use the UCSC Genome Browser's MySQL database to extract + chromosome sizes. For example, H. sapiens: + + mysql --user=genome --host=genome-mysql.cse.ucsc.edu -A -e \ + "select chrom, size from hg19.chromInfo" > hg19.genome + diff --git a/src/bedtools/bedtools_genomecov/script.sh b/src/bedtools/bedtools_genomecov/script.sh new file mode 100644 index 00000000..20fbd968 --- /dev/null +++ b/src/bedtools/bedtools_genomecov/script.sh @@ -0,0 +1,55 @@ +#!/bin/bash + +## VIASH START +## VIASH END + +# Exit on error +set -eo pipefail + +# Unset variables +unset_if_false=( + par_input_bam + par_depth + par_depth_zero + par_bed_graph + par_bed_graph_zero_coverage + par_split + par_ignore_deletion + par_pair_end_coverage + par_fragment_size + par_du + par_five_prime + par_three_prime + par_trackline +) + +for par in ${unset_if_false[@]}; do + test_val="${!par}" + [[ "$test_val" == "false" ]] && unset $par +done + +# Create input array +IFS=";" read -ra trackopts <<< $par_trackopts + +bedtools genomecov \ + ${par_depth:+-d} \ + ${par_depth_zero:+-dz} \ + ${par_bed_graph:+-bg} \ + ${par_bed_graph_zero_coverage:+-bga} \ + ${par_split:+-split} \ + ${par_ignore_deletion:+-ignoreD} \ + ${par_du:+-du} \ + ${par_five_prime:+-5} \ + ${par_three_prime:+-3} \ + ${par_trackline:+-trackline} \ + ${par_strand:+-strand "$par_strand"} \ + ${par_max:+-max "$par_max"} \ + ${par_scale:+-scale "$par_scale"} \ + ${par_trackopts:+-trackopts "${trackopts[*]}"} \ + ${par_input_bam:+-ibam "$par_input_bam"} \ + ${par_input:+-i "$par_input"} \ + ${par_genome:+-g "$par_genome"} \ + ${par_pair_end_coverage:+-pc} \ + ${par_fragment_size:+-fs} \ + > "$par_output" + \ No newline at end of file diff --git a/src/bedtools/bedtools_genomecov/test.sh b/src/bedtools/bedtools_genomecov/test.sh new file mode 100644 index 00000000..7e4487da --- /dev/null +++ b/src/bedtools/bedtools_genomecov/test.sh @@ -0,0 +1,333 @@ +#!/bin/bash + +# exit on error +set -eo pipefail + +## VIASH START +meta_executable="target/executable/bedtools/bedtools_intersect/bedtools_intersect" +meta_resources_dir="src/bedtools/bedtools_intersect" +## VIASH END + +# directory of the bam file +test_data="$meta_resources_dir/test_data" + +############################################# +# helper functions +assert_file_exists() { + [ -f "$1" ] || { echo "File '$1' does not exist" && exit 1; } +} +assert_file_not_empty() { + [ -s "$1" ] || { echo "File '$1' is empty but shouldn't be" && exit 1; } +} +assert_file_contains() { + grep -q "$2" "$1" || { echo "File '$1' does not contain '$2'" && exit 1; } +} +assert_identical_content() { + diff -a "$2" "$1" \ + || (echo "Files are not identical!" && exit 1) +} +############################################# + +# Create directories for tests +echo "Creating Test Data..." +TMPDIR=$(mktemp -d "$meta_temp_dir/XXXXXX") +function clean_up { + [[ -d "$TMPDIR" ]] && rm -r "$TMPDIR" +} +trap clean_up EXIT + +# Create and populate input files +printf "chr1\t248956422\nchr2\t198295559\nchr3\t242193529\n" > "$TMPDIR/genome.txt" +printf "chr2\t128\t228\tmy_read/1\t37\t+\nchr2\t428\t528\tmy_read/2\t37\t-\n" > "$TMPDIR/example.bed" +printf "chr2\t128\t228\tmy_read/1\t60\t+\t128\t228\t255,0,0\t1\t100\t0\nchr2\t428\t528\tmy_read/2\t60\t-\t428\t528\t255,0,0\t1\t100\t0\n" > "$TMPDIR/example.bed12" +printf "chr2\t100\t103\n" > "$TMPDIR/example_dz.bed" + +# expected outputs +cat > "$TMPDIR/expected_default.bed" < "$TMPDIR/expected_ibam.bed" < "$TMPDIR/expected_ibam_pc.bed" < "$TMPDIR/expected_ibam_fs.bed" < "$TMPDIR/expected_dz.bed" < "$TMPDIR/expected_strand.bed" < "$TMPDIR/expected_5.bed" < "$TMPDIR/expected_bg_scale.bed" < "$TMPDIR/expected_trackopts.bed" < "$TMPDIR/expected_split.bed" < "$TMPDIR/expected_ignoreD_du.bed" < /dev/null + +echo "> Run bedtools_genomecov on BED file" +"$meta_executable" \ + --input "../example.bed" \ + --genome "../genome.txt" \ + --output "output.bed" + +# checks +assert_file_exists "output.bed" +assert_file_not_empty "output.bed" +assert_identical_content "output.bed" "../expected_default.bed" +echo "- test1 succeeded -" + +popd > /dev/null + +# Test 2: ibam option +mkdir "$TMPDIR/test2" && pushd "$TMPDIR/test2" > /dev/null + +echo "> Run bedtools_genomecov on BAM file with -ibam" +"$meta_executable" \ + --input_bam "$test_data/example.bam" \ + --output "output.bed" \ + +# checks +assert_file_exists "output.bed" +assert_file_not_empty "output.bed" +assert_identical_content "output.bed" "../expected_ibam.bed" +echo "- test2 succeeded -" + +popd > /dev/null + +# Test 3: depth option +mkdir "$TMPDIR/test3" && pushd "$TMPDIR/test3" > /dev/null + +echo "> Run bedtools_genomecov on BED file with -dz" +"$meta_executable" \ + --input "../example_dz.bed" \ + --genome "../genome.txt" \ + --output "output.bed" \ + --depth_zero + +# checks +assert_file_exists "output.bed" +assert_file_not_empty "output.bed" +assert_identical_content "output.bed" "../expected_dz.bed" +echo "- test3 succeeded -" + +popd > /dev/null + +# Test 4: strand option +mkdir "$TMPDIR/test4" && pushd "$TMPDIR/test4" > /dev/null + +echo "> Run bedtools_genomecov on BED file with -strand" +"$meta_executable" \ + --input "../example.bed" \ + --genome "../genome.txt" \ + --output "output.bed" \ + --strand "-" \ + +# checks +assert_file_exists "output.bed" +assert_file_not_empty "output.bed" +assert_identical_content "output.bed" "../expected_strand.bed" +echo "- test4 succeeded -" + +popd > /dev/null + +# Test 5: 5' end option +mkdir "$TMPDIR/test5" && pushd "$TMPDIR/test5" > /dev/null + +echo "> Run bedtools_genomecov on BED file with -5" +"$meta_executable" \ + --input "../example.bed" \ + --genome "../genome.txt" \ + --output "output.bed" \ + --five_prime \ + +# checks +assert_file_exists "output.bed" +assert_file_not_empty "output.bed" +assert_identical_content "output.bed" "../expected_5.bed" +echo "- test5 succeeded -" + +popd > /dev/null + +# Test 6: max option +mkdir "$TMPDIR/test6" && pushd "$TMPDIR/test6" > /dev/null + +echo "> Run bedtools_genomecov on BED file with -max" +"$meta_executable" \ + --input "../example.bed" \ + --genome "../genome.txt" \ + --output "output.bed" \ + --max 100 \ + +# checks +assert_file_exists "output.bed" +assert_file_not_empty "output.bed" +assert_identical_content "output.bed" "../expected_default.bed" +echo "- test6 succeeded -" + +popd > /dev/null + +# Test 7: bedgraph and scale option +mkdir "$TMPDIR/test7" && pushd "$TMPDIR/test7" > /dev/null + +echo "> Run bedtools_genomecov on BED file with -bg and -scale" +"$meta_executable" \ + --input "../example.bed" \ + --genome "../genome.txt" \ + --output "output.bed" \ + --bed_graph \ + --scale 100 \ + +# checks +assert_file_exists "output.bed" +assert_file_not_empty "output.bed" +assert_identical_content "output.bed" "../expected_bg_scale.bed" +echo "- test7 succeeded -" + +popd > /dev/null + +# Test 8: trackopts option +mkdir "$TMPDIR/test8" && pushd "$TMPDIR/test8" > /dev/null + +echo "> Run bedtools_genomecov on BED file with -bg and -trackopts" +"$meta_executable" \ + --input "../example.bed" \ + --genome "../genome.txt" \ + --output "output.bed" \ + --bed_graph \ + --trackopts "name=example" \ + --trackopts "llama=Alpaco" \ + +# checks +assert_file_exists "output.bed" +assert_file_not_empty "output.bed" +assert_identical_content "output.bed" "../expected_trackopts.bed" +echo "- test8 succeeded -" + +popd > /dev/null + +# Test 9: ibam pc options +mkdir "$TMPDIR/test9" && pushd "$TMPDIR/test9" > /dev/null + +echo "> Run bedtools_genomecov on BAM file with -ibam, -pc" +"$meta_executable" \ + --input_bam "$test_data/example.bam" \ + --output "output.bed" \ + --fragment_size \ + --pair_end_coverage \ + +# checks +assert_file_exists "output.bed" +assert_file_not_empty "output.bed" +assert_identical_content "output.bed" "../expected_ibam_pc.bed" +echo "- test9 succeeded -" + +popd > /dev/null + +# Test 10: ibam fs options +mkdir "$TMPDIR/test10" && pushd "$TMPDIR/test10" > /dev/null + +echo "> Run bedtools_genomecov on BAM file with -ibam, -fs" +"$meta_executable" \ + --input_bam "$test_data/example.bam" \ + --output "output.bed" \ + --fragment_size \ + +# checks +assert_file_exists "output.bed" +assert_file_not_empty "output.bed" +assert_identical_content "output.bed" "../expected_ibam_fs.bed" +echo "- test10 succeeded -" + +popd > /dev/null + +# Test 11: split +mkdir "$TMPDIR/test11" && pushd "$TMPDIR/test11" > /dev/null + +echo "> Run bedtools_genomecov on BED12 file with -split" +"$meta_executable" \ + --input "../example.bed12" \ + --genome "../genome.txt" \ + --output "output.bed" \ + --split \ + +# checks +assert_file_exists "output.bed" +assert_file_not_empty "output.bed" +assert_identical_content "output.bed" "../expected_split.bed" +echo "- test11 succeeded -" + +popd > /dev/null + +# Test 12: ignore deletion and du +mkdir "$TMPDIR/test12" && pushd "$TMPDIR/test12" > /dev/null + +echo "> Run bedtools_genomecov on BAM file with -ignoreD and -du" +"$meta_executable" \ + --input_bam "$test_data/example.bam" \ + --output "output.bed" \ + --ignore_deletion \ + --du \ + +# checks +assert_file_exists "output.bed" +assert_file_not_empty "output.bed" +assert_identical_content "output.bed" "../expected_ignoreD_du.bed" +echo "- test12 succeeded -" + +popd > /dev/null + +echo "---- All tests succeeded! ----" +exit 0 diff --git a/src/bedtools/bedtools_genomecov/test_data/example.bam b/src/bedtools/bedtools_genomecov/test_data/example.bam new file mode 100644 index 00000000..ffc075ab Binary files /dev/null and b/src/bedtools/bedtools_genomecov/test_data/example.bam differ