Skip to content

Commit

Permalink
Working on suggested changes
Browse files Browse the repository at this point in the history
- still need to add content check using samtools view
  • Loading branch information
tgaspe committed Aug 10, 2024
1 parent 53fe29a commit 81c5e28
Show file tree
Hide file tree
Showing 4 changed files with 186 additions and 40 deletions.
2 changes: 1 addition & 1 deletion CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -26,7 +26,7 @@
* `bedtools`:
- `bedtools/bedtools_intersect`: Allows one to screen for overlaps between two sets of genomic features (PR #94).
- `bedtools/bedtools_sort`: Sorts a feature file (bed/gff/vcf) by chromosome and other criteria (PR #98).
- `bedtools/bedtools_bedtobam`: Converts feature records (bed/gff/vcf) to BAM format (PR #111).
- `bedtools/bedtools_bedtobam`: Converts genomic feature records (bed/gff/vcf) to BAM format (PR #111).


## MINOR CHANGES
Expand Down
8 changes: 6 additions & 2 deletions src/bedtools/bedtools_bedtobam/config.vsh.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -7,7 +7,7 @@ links:
repository: https://github.com/arq5x/bedtools2
references:
doi: 10.1093/bioinformatics/btq033
license: GPL-2.0, MIT
license: MIT
requirements:
commands: [bedtools]
authors:
Expand All @@ -28,7 +28,7 @@ argument_groups:
type: file
description: |
Input genome file.
NOTE: This is not a fasta file. This is a two-column file
NOTE: This is not a fasta file. This is a two-column tab-delimited file
where the first column is the chromosome name and the second their sizes.
required: true

Expand Down Expand Up @@ -79,6 +79,10 @@ engines:
- type: docker
run: |
echo "bedtools: \"$(bedtools --version | sed -n 's/^bedtools //p')\"" > /var/software_versions.txt
test_setup:
- type: apt
packages:
- samtools

runners:
- type: executable
Expand Down
73 changes: 36 additions & 37 deletions src/bedtools/bedtools_bedtobam/test.sh
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
#!/bin/bash

# exit on error
set -e
set -eo pipefail

#############################################
# helper functions
Expand All @@ -22,37 +22,40 @@ assert_identical_content() {

# Create directories for tests
echo "Creating Test Data..."
mkdir -p test_data
TMPDIR=$(mktemp -d "$meta_temp_dir/XXXXXX")
function clean_up {
[[ -d "$TMPDIR" ]] && rm -r "$TMPDIR"
}
trap clean_up EXIT

# Create and populate input files
printf "chr1\t248956422\nchr3\t242193529\nchr2\t198295559\n" > "test_data/genome.txt"
printf "chr2:172936693-172938111\t128\t228\tmy_read/1\t37\t+\nchr2:172936693-172938111\t428\t528\tmy_read/2\t37\t-\n" > "test_data/example.bed"
printf "chr2:172936693-172938111\t128\t228\tmy_read/1\t60\t+\t128\t228\t255,0,0\t1\t100\t0\nchr2:172936693-172938111\t428\t528\tmy_read/2\t60\t-\t428\t528\t255,0,0\t1\t100\t0\n" > "test_data/example.bed12"
printf "chr1\t248956422\nchr3\t242193529\nchr2\t198295559\n" > "$TMPDIR/genome.txt"
printf "chr2:172936693-172938111\t128\t228\tmy_read/1\t37\t+\nchr2:172936693-172938111\t428\t528\tmy_read/2\t37\t-\n" > "$TMPDIR/example.bed"
printf "chr2:172936693-172938111\t128\t228\tmy_read/1\t60\t+\t128\t228\t255,0,0\t1\t100\t0\nchr2:172936693-172938111\t428\t528\tmy_read/2\t60\t-\t428\t528\t255,0,0\t1\t100\t0\n" > "$TMPDIR/example.bed12"
# Create and populate example.gff file
printf "##gff-version 3\n" > "test_data/example.gff"
printf "chr1\t.\tgene\t1000\t2000\t.\t+\t.\tID=gene1;Name=Gene1\n" >> "test_data/example.gff"
printf "chr3\t.\tmRNA\t1000\t2000\t.\t+\t.\tID=transcript1;Parent=gene1\n" >> "test_data/example.gff"
printf "chr1\t.\texon\t1000\t1200\t.\t+\t.\tID=exon1;Parent=transcript1\n" >> "test_data/example.gff"
printf "chr2\t.\texon\t1500\t1700\t.\t+\t.\tID=exon2;Parent=transcript1\n" >> "test_data/example.gff"
printf "chr1\t.\tCDS\t1000\t1200\t.\t+\t0\tID=cds1;Parent=transcript1\n" >> "test_data/example.gff"
printf "chr1\t.\tCDS\t1500\t1700\t.\t+\t2\tID=cds2;Parent=transcript1\n" >> "test_data/example.gff"
printf "##gff-version 3\n" > "$TMPDIR/example.gff"
printf "chr1\t.\tgene\t1000\t2000\t.\t+\t.\tID=gene1;Name=Gene1\n" >> "$TMPDIR/example.gff"
printf "chr3\t.\tmRNA\t1000\t2000\t.\t+\t.\tID=transcript1;Parent=gene1\n" >> "$TMPDIR/example.gff"
printf "chr1\t.\texon\t1000\t1200\t.\t+\t.\tID=exon1;Parent=transcript1\n" >> "$TMPDIR/example.gff"
printf "chr2\t.\texon\t1500\t1700\t.\t+\t.\tID=exon2;Parent=transcript1\n" >> "$TMPDIR/example.gff"
printf "chr1\t.\tCDS\t1000\t1200\t.\t+\t0\tID=cds1;Parent=transcript1\n" >> "$TMPDIR/example.gff"
printf "chr1\t.\tCDS\t1500\t1700\t.\t+\t2\tID=cds2;Parent=transcript1\n" >> "$TMPDIR/example.gff"

# Test 1: Default conversion BED to BAM
mkdir test1
cd test1
mkdir "$TMPDIR/test1" && pushd "$TMPDIR/test1" > /dev/null

echo "> Run bedtools_bedtobam on BED file"
"$meta_executable" \
--input "../test_data/example.bed" \
--genome "../test_data/genome.txt" \
--input "../example.bed" \
--genome "../genome.txt" \
--output "output.bam"

# checks
assert_file_exists "output.bam"
assert_file_not_empty "output.bam"
echo "- test1 succeeded -"

cd ..
popd > /dev/null

# I could not assert_identical_content (bam file is compressed and -ubam option does not seem to work on this version).
# could use samtools view to check the content of the bam file, would need to add samtools as a dependency.
Expand All @@ -61,13 +64,12 @@ cd ..
# However, its output is still somewhat compressed.

# Test 2: BED12 file
mkdir test2
cd test2
mkdir "$TMPDIR/test2" && pushd "$TMPDIR/test2" > /dev/null

echo "> Run bedtools_bedtobam on BED12 file"
"$meta_executable" \
--input "../test_data/example.bed12" \
--genome "../test_data/genome.txt" \
--input "../example.bed12" \
--genome "../genome.txt" \
--output "output.bam" \
--bed12 \
# --uncompress_bam
Expand All @@ -78,16 +80,15 @@ assert_file_not_empty "output.bam"
# assert_identical_content "output.bam" "../test_data/expected12.sam"
echo "- test2 succeeded -"

cd ..
popd > /dev/null

# Test 3: Uncompressed BAM file
mkdir test3
cd test3
mkdir "$TMPDIR/test3" && pushd "$TMPDIR/test3" > /dev/null

echo "> Run bedtools_bedtobam on BED file with uncompressed BAM output"
"$meta_executable" \
--input "../test_data/example.bed" \
--genome "../test_data/genome.txt" \
--input "../example.bed" \
--genome "../genome.txt" \
--output "output.bam" \
--uncompress_bam

Expand All @@ -98,16 +99,15 @@ assert_file_not_empty "output.bam"

echo "- test3 succeeded -"

cd ..
popd > /dev/null

# Test 4: Map quality
mkdir test4
cd test4
mkdir "$TMPDIR/test4" && pushd "$TMPDIR/test4" > /dev/null

echo "> Run bedtools_bedtobam on BED file with map quality"
"$meta_executable" \
--input "../test_data/example.bed" \
--genome "../test_data/genome.txt" \
--input "../example.bed" \
--genome "../genome.txt" \
--output "output.bam" \
--map_quality 10 \
# --uncompress_bam
Expand All @@ -118,16 +118,15 @@ assert_file_not_empty "output.bam"
#assert_identical_content "output.bam" "../test_data/expected_mapquality.sam"
echo "- test4 succeeded -"

cd ..
popd > /dev/null

# Test 5: gff to bam conversion
mkdir test5
cd test5
mkdir "$TMPDIR/test5" && pushd "$TMPDIR/test5" > /dev/null

echo "> Run bedtools_bedtobam on GFF file"
"$meta_executable" \
--input "../test_data/example.gff" \
--genome "../test_data/genome.txt" \
--input "../example.gff" \
--genome "../genome.txt" \
--output "output.bam"
# --uncompress_bam

Expand All @@ -137,7 +136,7 @@ assert_file_not_empty "output.bam"
# assert_identical_content "output.bam" "../test_data/expected_gff.sam"
echo "- test5 succeeded -"

cd ..
popd > /dev/null

echo "---- All tests succeeded! ----"
exit 0
143 changes: 143 additions & 0 deletions src/bedtools/bedtools_bedtobam/test_copy.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,143 @@
#!/bin/bash

# exit on error
set -eo pipefail

#############################################
# helper functions
assert_file_exists() {
[ -f "$1" ] || { echo "File '$1' does not exist" && exit 1; }
}
assert_file_not_empty() {
[ -s "$1" ] || { echo "File '$1' is empty but shouldn't be" && exit 1; }
}
assert_file_contains() {
grep -q "$2" "$1" || { echo "File '$1' does not contain '$2'" && exit 1; }
}
assert_identical_content() {
diff -a "$2" "$1" \
|| (echo "Files are not identical!" && exit 1)
}
#############################################

# Create directories for tests
echo "Creating Test Data..."
mkdir -p test_data

# Create and populate input files
printf "chr1\t248956422\nchr3\t242193529\nchr2\t198295559\n" > "test_data/genome.txt"
printf "chr2:172936693-172938111\t128\t228\tmy_read/1\t37\t+\nchr2:172936693-172938111\t428\t528\tmy_read/2\t37\t-\n" > "test_data/example.bed"
printf "chr2:172936693-172938111\t128\t228\tmy_read/1\t60\t+\t128\t228\t255,0,0\t1\t100\t0\nchr2:172936693-172938111\t428\t528\tmy_read/2\t60\t-\t428\t528\t255,0,0\t1\t100\t0\n" > "test_data/example.bed12"
# Create and populate example.gff file
printf "##gff-version 3\n" > "test_data/example.gff"
printf "chr1\t.\tgene\t1000\t2000\t.\t+\t.\tID=gene1;Name=Gene1\n" >> "test_data/example.gff"
printf "chr3\t.\tmRNA\t1000\t2000\t.\t+\t.\tID=transcript1;Parent=gene1\n" >> "test_data/example.gff"
printf "chr1\t.\texon\t1000\t1200\t.\t+\t.\tID=exon1;Parent=transcript1\n" >> "test_data/example.gff"
printf "chr2\t.\texon\t1500\t1700\t.\t+\t.\tID=exon2;Parent=transcript1\n" >> "test_data/example.gff"
printf "chr1\t.\tCDS\t1000\t1200\t.\t+\t0\tID=cds1;Parent=transcript1\n" >> "test_data/example.gff"
printf "chr1\t.\tCDS\t1500\t1700\t.\t+\t2\tID=cds2;Parent=transcript1\n" >> "test_data/example.gff"

# Test 1: Default conversion BED to BAM
mkdir test1
cd test1

echo "> Run bedtools_bedtobam on BED file"
"$meta_executable" \
--input "../test_data/example.bed" \
--genome "../test_data/genome.txt" \
--output "output.bam"

# checks
assert_file_exists "output.bam"
assert_file_not_empty "output.bam"
echo "- test1 succeeded -"

cd ..

# I could not assert_identical_content (bam file is compressed and -ubam option does not seem to work on this version).
# could use samtools view to check the content of the bam file, would need to add samtools as a dependency.
# TODO: check if older versions works with -ubam option!!!
# Version 2.27.1 (bedtools) -ubam outputs better than version 2.30.0 (bedtools).
# However, its output is still somewhat compressed.

# Test 2: BED12 file
mkdir test2
cd test2

echo "> Run bedtools_bedtobam on BED12 file"
"$meta_executable" \
--input "../test_data/example.bed12" \
--genome "../test_data/genome.txt" \
--output "output.bam" \
--bed12 \
# --uncompress_bam

# checks
assert_file_exists "output.bam"
assert_file_not_empty "output.bam"
# assert_identical_content "output.bam" "../test_data/expected12.sam"
echo "- test2 succeeded -"

cd ..

# Test 3: Uncompressed BAM file
mkdir test3
cd test3

echo "> Run bedtools_bedtobam on BED file with uncompressed BAM output"
"$meta_executable" \
--input "../test_data/example.bed" \
--genome "../test_data/genome.txt" \
--output "output.bam" \
--uncompress_bam

# checks
assert_file_exists "output.bam"
assert_file_not_empty "output.bam"
# assert_identical_content "output.bam" "../test_data/expected.sam"

echo "- test3 succeeded -"

cd ..

# Test 4: Map quality
mkdir test4
cd test4

echo "> Run bedtools_bedtobam on BED file with map quality"
"$meta_executable" \
--input "../test_data/example.bed" \
--genome "../test_data/genome.txt" \
--output "output.bam" \
--map_quality 10 \
# --uncompress_bam

# checks
assert_file_exists "output.bam"
assert_file_not_empty "output.bam"
#assert_identical_content "output.bam" "../test_data/expected_mapquality.sam"
echo "- test4 succeeded -"

cd ..

# Test 5: gff to bam conversion
mkdir test5
cd test5

echo "> Run bedtools_bedtobam on GFF file"
"$meta_executable" \
--input "../test_data/example.gff" \
--genome "../test_data/genome.txt" \
--output "output.bam"
# --uncompress_bam

# checks
assert_file_exists "output.bam"
assert_file_not_empty "output.bam"
# assert_identical_content "output.bam" "../test_data/expected_gff.sam"
echo "- test5 succeeded -"

cd ..

echo "---- All tests succeeded! ----"
exit 0

0 comments on commit 81c5e28

Please sign in to comment.