From 81c5e28300602e886cad9b5aa514562a53245de1 Mon Sep 17 00:00:00 2001 From: tgaspe Date: Sat, 10 Aug 2024 19:35:12 +0200 Subject: [PATCH] Working on suggested changes - still need to add content check using samtools view --- CHANGELOG.md | 2 +- .../bedtools_bedtobam/config.vsh.yaml | 8 +- src/bedtools/bedtools_bedtobam/test.sh | 73 +++++---- src/bedtools/bedtools_bedtobam/test_copy.sh | 143 ++++++++++++++++++ 4 files changed, 186 insertions(+), 40 deletions(-) create mode 100644 src/bedtools/bedtools_bedtobam/test_copy.sh diff --git a/CHANGELOG.md b/CHANGELOG.md index 2fb754b8..3332342a 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -26,7 +26,7 @@ * `bedtools`: - `bedtools/bedtools_intersect`: Allows one to screen for overlaps between two sets of genomic features (PR #94). - `bedtools/bedtools_sort`: Sorts a feature file (bed/gff/vcf) by chromosome and other criteria (PR #98). - - `bedtools/bedtools_bedtobam`: Converts feature records (bed/gff/vcf) to BAM format (PR #111). + - `bedtools/bedtools_bedtobam`: Converts genomic feature records (bed/gff/vcf) to BAM format (PR #111). ## MINOR CHANGES diff --git a/src/bedtools/bedtools_bedtobam/config.vsh.yaml b/src/bedtools/bedtools_bedtobam/config.vsh.yaml index a01f243b..c2c9667c 100644 --- a/src/bedtools/bedtools_bedtobam/config.vsh.yaml +++ b/src/bedtools/bedtools_bedtobam/config.vsh.yaml @@ -7,7 +7,7 @@ links: repository: https://github.com/arq5x/bedtools2 references: doi: 10.1093/bioinformatics/btq033 -license: GPL-2.0, MIT +license: MIT requirements: commands: [bedtools] authors: @@ -28,7 +28,7 @@ argument_groups: type: file description: | Input genome file. - NOTE: This is not a fasta file. This is a two-column file + NOTE: This is not a fasta file. This is a two-column tab-delimited file where the first column is the chromosome name and the second their sizes. required: true @@ -79,6 +79,10 @@ engines: - type: docker run: | echo "bedtools: \"$(bedtools --version | sed -n 's/^bedtools //p')\"" > /var/software_versions.txt + test_setup: + - type: apt + packages: + - samtools runners: - type: executable diff --git a/src/bedtools/bedtools_bedtobam/test.sh b/src/bedtools/bedtools_bedtobam/test.sh index 2d9b8d5c..fdf714f6 100644 --- a/src/bedtools/bedtools_bedtobam/test.sh +++ b/src/bedtools/bedtools_bedtobam/test.sh @@ -1,7 +1,7 @@ #!/bin/bash # exit on error -set -e +set -eo pipefail ############################################# # helper functions @@ -22,29 +22,32 @@ assert_identical_content() { # Create directories for tests echo "Creating Test Data..." -mkdir -p test_data +TMPDIR=$(mktemp -d "$meta_temp_dir/XXXXXX") +function clean_up { + [[ -d "$TMPDIR" ]] && rm -r "$TMPDIR" +} +trap clean_up EXIT # Create and populate input files -printf "chr1\t248956422\nchr3\t242193529\nchr2\t198295559\n" > "test_data/genome.txt" -printf "chr2:172936693-172938111\t128\t228\tmy_read/1\t37\t+\nchr2:172936693-172938111\t428\t528\tmy_read/2\t37\t-\n" > "test_data/example.bed" -printf "chr2:172936693-172938111\t128\t228\tmy_read/1\t60\t+\t128\t228\t255,0,0\t1\t100\t0\nchr2:172936693-172938111\t428\t528\tmy_read/2\t60\t-\t428\t528\t255,0,0\t1\t100\t0\n" > "test_data/example.bed12" +printf "chr1\t248956422\nchr3\t242193529\nchr2\t198295559\n" > "$TMPDIR/genome.txt" +printf "chr2:172936693-172938111\t128\t228\tmy_read/1\t37\t+\nchr2:172936693-172938111\t428\t528\tmy_read/2\t37\t-\n" > "$TMPDIR/example.bed" +printf "chr2:172936693-172938111\t128\t228\tmy_read/1\t60\t+\t128\t228\t255,0,0\t1\t100\t0\nchr2:172936693-172938111\t428\t528\tmy_read/2\t60\t-\t428\t528\t255,0,0\t1\t100\t0\n" > "$TMPDIR/example.bed12" # Create and populate example.gff file -printf "##gff-version 3\n" > "test_data/example.gff" -printf "chr1\t.\tgene\t1000\t2000\t.\t+\t.\tID=gene1;Name=Gene1\n" >> "test_data/example.gff" -printf "chr3\t.\tmRNA\t1000\t2000\t.\t+\t.\tID=transcript1;Parent=gene1\n" >> "test_data/example.gff" -printf "chr1\t.\texon\t1000\t1200\t.\t+\t.\tID=exon1;Parent=transcript1\n" >> "test_data/example.gff" -printf "chr2\t.\texon\t1500\t1700\t.\t+\t.\tID=exon2;Parent=transcript1\n" >> "test_data/example.gff" -printf "chr1\t.\tCDS\t1000\t1200\t.\t+\t0\tID=cds1;Parent=transcript1\n" >> "test_data/example.gff" -printf "chr1\t.\tCDS\t1500\t1700\t.\t+\t2\tID=cds2;Parent=transcript1\n" >> "test_data/example.gff" +printf "##gff-version 3\n" > "$TMPDIR/example.gff" +printf "chr1\t.\tgene\t1000\t2000\t.\t+\t.\tID=gene1;Name=Gene1\n" >> "$TMPDIR/example.gff" +printf "chr3\t.\tmRNA\t1000\t2000\t.\t+\t.\tID=transcript1;Parent=gene1\n" >> "$TMPDIR/example.gff" +printf "chr1\t.\texon\t1000\t1200\t.\t+\t.\tID=exon1;Parent=transcript1\n" >> "$TMPDIR/example.gff" +printf "chr2\t.\texon\t1500\t1700\t.\t+\t.\tID=exon2;Parent=transcript1\n" >> "$TMPDIR/example.gff" +printf "chr1\t.\tCDS\t1000\t1200\t.\t+\t0\tID=cds1;Parent=transcript1\n" >> "$TMPDIR/example.gff" +printf "chr1\t.\tCDS\t1500\t1700\t.\t+\t2\tID=cds2;Parent=transcript1\n" >> "$TMPDIR/example.gff" # Test 1: Default conversion BED to BAM -mkdir test1 -cd test1 +mkdir "$TMPDIR/test1" && pushd "$TMPDIR/test1" > /dev/null echo "> Run bedtools_bedtobam on BED file" "$meta_executable" \ - --input "../test_data/example.bed" \ - --genome "../test_data/genome.txt" \ + --input "../example.bed" \ + --genome "../genome.txt" \ --output "output.bam" # checks @@ -52,7 +55,7 @@ assert_file_exists "output.bam" assert_file_not_empty "output.bam" echo "- test1 succeeded -" -cd .. +popd > /dev/null # I could not assert_identical_content (bam file is compressed and -ubam option does not seem to work on this version). # could use samtools view to check the content of the bam file, would need to add samtools as a dependency. @@ -61,13 +64,12 @@ cd .. # However, its output is still somewhat compressed. # Test 2: BED12 file -mkdir test2 -cd test2 +mkdir "$TMPDIR/test2" && pushd "$TMPDIR/test2" > /dev/null echo "> Run bedtools_bedtobam on BED12 file" "$meta_executable" \ - --input "../test_data/example.bed12" \ - --genome "../test_data/genome.txt" \ + --input "../example.bed12" \ + --genome "../genome.txt" \ --output "output.bam" \ --bed12 \ # --uncompress_bam @@ -78,16 +80,15 @@ assert_file_not_empty "output.bam" # assert_identical_content "output.bam" "../test_data/expected12.sam" echo "- test2 succeeded -" -cd .. +popd > /dev/null # Test 3: Uncompressed BAM file -mkdir test3 -cd test3 +mkdir "$TMPDIR/test3" && pushd "$TMPDIR/test3" > /dev/null echo "> Run bedtools_bedtobam on BED file with uncompressed BAM output" "$meta_executable" \ - --input "../test_data/example.bed" \ - --genome "../test_data/genome.txt" \ + --input "../example.bed" \ + --genome "../genome.txt" \ --output "output.bam" \ --uncompress_bam @@ -98,16 +99,15 @@ assert_file_not_empty "output.bam" echo "- test3 succeeded -" -cd .. +popd > /dev/null # Test 4: Map quality -mkdir test4 -cd test4 +mkdir "$TMPDIR/test4" && pushd "$TMPDIR/test4" > /dev/null echo "> Run bedtools_bedtobam on BED file with map quality" "$meta_executable" \ - --input "../test_data/example.bed" \ - --genome "../test_data/genome.txt" \ + --input "../example.bed" \ + --genome "../genome.txt" \ --output "output.bam" \ --map_quality 10 \ # --uncompress_bam @@ -118,16 +118,15 @@ assert_file_not_empty "output.bam" #assert_identical_content "output.bam" "../test_data/expected_mapquality.sam" echo "- test4 succeeded -" -cd .. +popd > /dev/null # Test 5: gff to bam conversion -mkdir test5 -cd test5 +mkdir "$TMPDIR/test5" && pushd "$TMPDIR/test5" > /dev/null echo "> Run bedtools_bedtobam on GFF file" "$meta_executable" \ - --input "../test_data/example.gff" \ - --genome "../test_data/genome.txt" \ + --input "../example.gff" \ + --genome "../genome.txt" \ --output "output.bam" # --uncompress_bam @@ -137,7 +136,7 @@ assert_file_not_empty "output.bam" # assert_identical_content "output.bam" "../test_data/expected_gff.sam" echo "- test5 succeeded -" -cd .. +popd > /dev/null echo "---- All tests succeeded! ----" exit 0 diff --git a/src/bedtools/bedtools_bedtobam/test_copy.sh b/src/bedtools/bedtools_bedtobam/test_copy.sh new file mode 100644 index 00000000..68b7da0f --- /dev/null +++ b/src/bedtools/bedtools_bedtobam/test_copy.sh @@ -0,0 +1,143 @@ +#!/bin/bash + +# exit on error +set -eo pipefail + +############################################# +# helper functions +assert_file_exists() { + [ -f "$1" ] || { echo "File '$1' does not exist" && exit 1; } +} +assert_file_not_empty() { + [ -s "$1" ] || { echo "File '$1' is empty but shouldn't be" && exit 1; } +} +assert_file_contains() { + grep -q "$2" "$1" || { echo "File '$1' does not contain '$2'" && exit 1; } +} +assert_identical_content() { + diff -a "$2" "$1" \ + || (echo "Files are not identical!" && exit 1) +} +############################################# + +# Create directories for tests +echo "Creating Test Data..." +mkdir -p test_data + +# Create and populate input files +printf "chr1\t248956422\nchr3\t242193529\nchr2\t198295559\n" > "test_data/genome.txt" +printf "chr2:172936693-172938111\t128\t228\tmy_read/1\t37\t+\nchr2:172936693-172938111\t428\t528\tmy_read/2\t37\t-\n" > "test_data/example.bed" +printf "chr2:172936693-172938111\t128\t228\tmy_read/1\t60\t+\t128\t228\t255,0,0\t1\t100\t0\nchr2:172936693-172938111\t428\t528\tmy_read/2\t60\t-\t428\t528\t255,0,0\t1\t100\t0\n" > "test_data/example.bed12" +# Create and populate example.gff file +printf "##gff-version 3\n" > "test_data/example.gff" +printf "chr1\t.\tgene\t1000\t2000\t.\t+\t.\tID=gene1;Name=Gene1\n" >> "test_data/example.gff" +printf "chr3\t.\tmRNA\t1000\t2000\t.\t+\t.\tID=transcript1;Parent=gene1\n" >> "test_data/example.gff" +printf "chr1\t.\texon\t1000\t1200\t.\t+\t.\tID=exon1;Parent=transcript1\n" >> "test_data/example.gff" +printf "chr2\t.\texon\t1500\t1700\t.\t+\t.\tID=exon2;Parent=transcript1\n" >> "test_data/example.gff" +printf "chr1\t.\tCDS\t1000\t1200\t.\t+\t0\tID=cds1;Parent=transcript1\n" >> "test_data/example.gff" +printf "chr1\t.\tCDS\t1500\t1700\t.\t+\t2\tID=cds2;Parent=transcript1\n" >> "test_data/example.gff" + +# Test 1: Default conversion BED to BAM +mkdir test1 +cd test1 + +echo "> Run bedtools_bedtobam on BED file" +"$meta_executable" \ + --input "../test_data/example.bed" \ + --genome "../test_data/genome.txt" \ + --output "output.bam" + +# checks +assert_file_exists "output.bam" +assert_file_not_empty "output.bam" +echo "- test1 succeeded -" + +cd .. + +# I could not assert_identical_content (bam file is compressed and -ubam option does not seem to work on this version). +# could use samtools view to check the content of the bam file, would need to add samtools as a dependency. +# TODO: check if older versions works with -ubam option!!! +# Version 2.27.1 (bedtools) -ubam outputs better than version 2.30.0 (bedtools). +# However, its output is still somewhat compressed. + +# Test 2: BED12 file +mkdir test2 +cd test2 + +echo "> Run bedtools_bedtobam on BED12 file" +"$meta_executable" \ + --input "../test_data/example.bed12" \ + --genome "../test_data/genome.txt" \ + --output "output.bam" \ + --bed12 \ +# --uncompress_bam + +# checks +assert_file_exists "output.bam" +assert_file_not_empty "output.bam" +# assert_identical_content "output.bam" "../test_data/expected12.sam" +echo "- test2 succeeded -" + +cd .. + +# Test 3: Uncompressed BAM file +mkdir test3 +cd test3 + +echo "> Run bedtools_bedtobam on BED file with uncompressed BAM output" +"$meta_executable" \ + --input "../test_data/example.bed" \ + --genome "../test_data/genome.txt" \ + --output "output.bam" \ + --uncompress_bam + +# checks +assert_file_exists "output.bam" +assert_file_not_empty "output.bam" +# assert_identical_content "output.bam" "../test_data/expected.sam" + +echo "- test3 succeeded -" + +cd .. + +# Test 4: Map quality +mkdir test4 +cd test4 + +echo "> Run bedtools_bedtobam on BED file with map quality" +"$meta_executable" \ + --input "../test_data/example.bed" \ + --genome "../test_data/genome.txt" \ + --output "output.bam" \ + --map_quality 10 \ +# --uncompress_bam + +# checks +assert_file_exists "output.bam" +assert_file_not_empty "output.bam" +#assert_identical_content "output.bam" "../test_data/expected_mapquality.sam" +echo "- test4 succeeded -" + +cd .. + +# Test 5: gff to bam conversion +mkdir test5 +cd test5 + +echo "> Run bedtools_bedtobam on GFF file" +"$meta_executable" \ + --input "../test_data/example.gff" \ + --genome "../test_data/genome.txt" \ + --output "output.bam" +# --uncompress_bam + +# checks +assert_file_exists "output.bam" +assert_file_not_empty "output.bam" +# assert_identical_content "output.bam" "../test_data/expected_gff.sam" +echo "- test5 succeeded -" + +cd .. + +echo "---- All tests succeeded! ----" +exit 0