From 8f525f5e40301ad51bc1cd9587c0febbef84bd7d Mon Sep 17 00:00:00 2001
From: Theodoro Gasperin Terra Camargo <98555209+tgaspe@users.noreply.github.com>
Date: Wed, 31 Jul 2024 16:20:11 -0300
Subject: [PATCH] Bedtools_Intersect (#94)

* Initial Commit

* Update config.vsh.yaml

* creating templates

* Update config.vsh.yaml

* Update script.sh

* Added output

* Update config.vsh.yaml

* Update test.sh

* Update test.sh

* More tests

* small changes

* update

- change some var names
- debugged
- added more test

* Update CHANGELOG.md

* Update

* Update help.txt
---
 CHANGELOG.md                                  |   3 +
 .../bedtools_intersect/config.vsh.yaml        | 255 +++++++++++++
 src/bedtools/bedtools_intersect/help.txt      | 119 ++++++
 src/bedtools/bedtools_intersect/script.sh     |  61 ++++
 src/bedtools/bedtools_intersect/test.sh       | 340 ++++++++++++++++++
 5 files changed, 778 insertions(+)
 create mode 100644 src/bedtools/bedtools_intersect/config.vsh.yaml
 create mode 100644 src/bedtools/bedtools_intersect/help.txt
 create mode 100644 src/bedtools/bedtools_intersect/script.sh
 create mode 100644 src/bedtools/bedtools_intersect/test.sh

diff --git a/CHANGELOG.md b/CHANGELOG.md
index c4575cb9..36681056 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -21,6 +21,9 @@
 
 * `agat/agat_convert_sp_gff2gtf`: convert any GTF/GFF file into a proper GTF file (PR #76).
 
+* `bedtools`:
+  - `bedtools/bedtools_intersect`: Allows one to screen for overlaps between two sets of genomic features (PR #94).
+
 ## MINOR CHANGES
 
 * `busco` components: update BUSCO to `5.7.1` (PR #72).
diff --git a/src/bedtools/bedtools_intersect/config.vsh.yaml b/src/bedtools/bedtools_intersect/config.vsh.yaml
new file mode 100644
index 00000000..73dc0047
--- /dev/null
+++ b/src/bedtools/bedtools_intersect/config.vsh.yaml
@@ -0,0 +1,255 @@
+name: bedtools_intersect
+namespace: bedtools
+description: |
+  bedtools intersect allows one to screen for overlaps between two sets of genomic features.
+  Moreover, it allows one to have fine control as to how the intersections are reported.
+  bedtools intersect works with both BED/GFF/VCF and BAM files as input.
+keywords: [feature intersection, BAM, BED, GFF, VCF]
+links:
+  documentation: https://bedtools.readthedocs.io/en/latest/content/tools/intersect.html
+  repository: https://github.com/arq5x/bedtools2
+references:
+  doi: 10.1093/bioinformatics/btq033
+license: GPL-2.0, MIT
+requirements:
+  commands: [bedtools]
+authors:
+  - __merge__: /src/_authors/theodoro_gasperin.yaml
+    roles: [ author, maintainer ]
+
+argument_groups:
+  - name: Inputs
+    arguments:
+      - name: --input_a
+        alternatives: -a
+        type: file
+        direction: input
+        description: |
+          The input file (BED/GFF/VCF/BAM) to be used as the -a file.
+        required: true
+        example: input_a.bed
+
+      - name: --input_b
+        alternatives: -b
+        type: file
+        direction: input
+        multiple: true
+        description: |
+          The input file(s) (BED/GFF/VCF/BAM) to be used as the -b file(s).
+        required: true
+        example: input_b.bed
+
+  - name: Outputs
+    arguments:
+      - name: --output
+        type: file
+        direction: output
+        description: |
+          The output file. Its format follows the -a input (BED/GFF/VCF; BAM input yields BAM unless --bed is set).
+        required: true
+        example: output.bed
+
+  - name: Options
+    arguments:
+      - name: --write_a
+        alternatives: -wa
+        type: boolean_true
+        description: Write the original A entry for each overlap.
+
+      - name: --write_b
+        alternatives: -wb
+        type: boolean_true
+        description: |
+          Write the original B entry for each overlap.
+          Useful for knowing _what_ A overlaps. Restricted by -f and -r.
+
+      - name: --left_outer_join
+        alternatives: -loj
+        type: boolean_true
+        description: |
+          Perform a "left outer join". That is, for each feature in A report each overlap with B.
+          If no overlaps are found, report a NULL feature for B.
+
+      - name: --write_overlap
+        alternatives: -wo
+        type: boolean_true
+        description: |
+          Write the original A and B entries plus the number of base pairs of overlap between the two features.
+          - Overlaps restricted by -f and -r.
+          Only A features with overlap are reported.
+
+      - name: --write_overlap_plus
+        alternatives: -wao
+        type: boolean_true
+        description: |
+          Write the original A and B entries plus the number of base pairs of overlap between the two features.
+          - Overlaps restricted by -f and -r.
+          However, A features w/o overlap are also reported with a NULL B feature and overlap = 0.
+
+      - name: --report_A_if_no_overlap  # NOTE: name is historical; this maps to -u (report A once if ANY overlap exists)
+        alternatives: -u
+        type: boolean_true
+        description: |
+          Write the original A entry _once_ if _any_ overlaps are found in B.
+          - In other words, just report the fact >=1 hit was found.
+          - Overlaps restricted by -f and -r.
+
+      - name: --number_of_overlaps_A
+        alternatives: -c
+        type: boolean_true
+        description: |
+          For each entry in A, report the number of overlaps with B.
+          - Reports 0 for A entries that have no overlap with B.
+          - Overlaps restricted by -f, -F, -r, and -s.
+
+      - name: --report_no_overlaps_A
+        alternatives: -v
+        type: boolean_true
+        description: |
+          Only report those entries in A that have _no overlaps_ with B.
+          - Similar to "grep -v" (an homage).
+
+      - name: --uncompressed_bam
+        alternatives: -ubam
+        type: boolean_true
+        description: Write uncompressed BAM output. Default writes compressed BAM.
+
+      - name: --same_strand
+        alternatives: -s
+        type: boolean_true
+        description: |
+          Require same strandedness. That is, only report hits in B
+          that overlap A on the _same_ strand.
+          - By default, overlaps are reported without respect to strand.
+
+      - name: --opposite_strand
+        alternatives: -S
+        type: boolean_true
+        description: |
+          Require different strandedness. That is, only report hits in B
+          that overlap A on the _opposite_ strand.
+          - By default, overlaps are reported without respect to strand.
+
+      - name: --min_overlap_A
+        alternatives: -f
+        type: double
+        description: |
+          Minimum overlap required as a fraction of A.
+          - Default is 1E-9 (i.e., 1bp).
+          - FLOAT (e.g. 0.50)
+        example: 0.50
+
+      - name: --min_overlap_B
+        alternatives: -F
+        type: double
+        description: |
+          Minimum overlap required as a fraction of B.
+          - Default is 1E-9 (i.e., 1bp).
+          - FLOAT (e.g. 0.50)
+        example: 0.50
+
+      - name: --reciprocal_overlap
+        alternatives: -r
+        type: boolean_true
+        description: |
+          Require that the fraction overlap be reciprocal for A AND B.
+          - In other words, if -f is 0.90 and -r is used, this requires
+          that B overlap 90% of A and A _also_ overlaps 90% of B.
+
+      - name: --either_overlap
+        alternatives: -e
+        type: boolean_true
+        description: |
+          Require that the minimum fraction be satisfied for A OR B.
+          - In other words, if -e is used with -f 0.90 and -F 0.10 this requires
+          that either 90% of A is covered OR 10% of B is covered.
+          Without -e, both fractions would have to be satisfied.
+
+      - name: --split
+        type: boolean_true
+        description: Treat "split" BAM or BED12 entries as distinct BED intervals.
+
+      - name: --genome
+        alternatives: -g
+        type: file
+        description: |
+          Provide a genome file to enforce consistent chromosome
+          sort order across input files. Only applies when used
+          with -sorted option.
+        example: genome.txt
+
+      - name: --nonamecheck
+        type: boolean_true
+        description: |
+          For sorted data, don't throw an error if the file
+          has different naming conventions for the same chromosome
+          (e.g., "chr1" vs "chr01").
+
+      - name: --sorted
+        type: boolean_true
+        description: |
+          Use the "chromsweep" algorithm for sorted (-k1,1 -k2,2n) input.
+
+      - name: --names
+        type: string
+        description: |
+          When using multiple databases, provide an alias
+          for each that will appear instead of a fileId when
+          also printing the DB record.
+
+      - name: --filenames
+        type: boolean_true
+        description: When using multiple databases, show each complete filename instead of a fileId when also printing the DB record.
+
+      - name: --sortout
+        type: boolean_true
+        description: When using multiple databases, sort the output DB hits for each record.
+
+      - name: --bed
+        type: boolean_true
+        description: If using BAM input, write output as BED.
+
+      - name: --header
+        type: boolean_true
+        description: Print the header from the A file prior to results.
+
+      - name: --no_buffer_output
+        alternatives: --nobuf
+        type: boolean_true
+        description: |
+          Disable buffered output. Using this option will cause each line
+          of output to be printed as it is generated, rather than saved
+          in a buffer. This will make printing large output files
+          noticeably slower, but can be useful in conjunction with
+          other software tools and scripts that need to process one
+          line of bedtools output at a time.
+
+      - name: --io_buffer_size
+        alternatives: --iobuf
+        type: string  # string rather than integer: the value may carry a K/M/G suffix (e.g. 128M)
+        description: |
+          Specify amount of memory to use for input buffer.
+          Takes an integer argument. Optional suffixes K/M/G supported.
+          Note: currently has no effect with compressed files.
+
+resources:
+  - type: bash_script
+    path: script.sh
+
+test_resources:
+  - type: bash_script
+    path: test.sh
+
+engines:
+  - type: docker
+    image: debian:stable-slim
+    setup:
+      - type: apt
+        packages: [bedtools, procps]
+      - type: docker
+        run: |
+          echo "bedtools: \"$(bedtools --version | sed -n 's/^bedtools //p')\"" > /var/software_versions.txt
+
+runners:
+  - type: executable
+  - type: nextflow
diff --git a/src/bedtools/bedtools_intersect/help.txt b/src/bedtools/bedtools_intersect/help.txt
new file mode 100644
index 00000000..d1bbdc20
--- /dev/null
+++ b/src/bedtools/bedtools_intersect/help.txt
@@ -0,0 +1,119 @@
+```bash
+bedtools intersect
+```
+
+Tool:    bedtools intersect (aka intersectBed)
+Version: v2.30.0
+Summary: Report overlaps between two feature files.
+
+Usage:   bedtools intersect [OPTIONS] -a -b
+
+    Note: -b may be followed with multiple databases and/or
+    wildcard (*) character(s).
+Options:
+    -wa     Write the original entry in A for each overlap.
+
+    -wb     Write the original entry in B for each overlap.
+            - Useful for knowing _what_ A overlaps. Restricted by -f and -r.
+ + -loj Perform a "left outer join". That is, for each feature in A + report each overlap with B. If no overlaps are found, + report a NULL feature for B. + + -wo Write the original A and B entries plus the number of base + pairs of overlap between the two features. + - Overlaps restricted by -f and -r. + Only A features with overlap are reported. + + -wao Write the original A and B entries plus the number of base + pairs of overlap between the two features. + - Overlapping features restricted by -f and -r. + However, A features w/o overlap are also reported + with a NULL B feature and overlap = 0. + + -u Write the original A entry _once_ if _any_ overlaps found in B. + - In other words, just report the fact >=1 hit was found. + - Overlaps restricted by -f and -r. + + -c For each entry in A, report the number of overlaps with B. + - Reports 0 for A entries that have no overlap with B. + - Overlaps restricted by -f, -F, -r, and -s. + + -C For each entry in A, separately report the number of + - overlaps with each B file on a distinct line. + - Reports 0 for A entries that have no overlap with B. + - Overlaps restricted by -f, -F, -r, and -s. + + -v Only report those entries in A that have _no overlaps_ with B. + - Similar to "grep -v" (an homage). + + -ubam Write uncompressed BAM output. Default writes compressed BAM. + + -s Require same strandedness. That is, only report hits in B + that overlap A on the _same_ strand. + - By default, overlaps are reported without respect to strand. + + -S Require different strandedness. That is, only report hits in B + that overlap A on the _opposite_ strand. + - By default, overlaps are reported without respect to strand. + + -f Minimum overlap required as a fraction of A. + - Default is 1E-9 (i.e., 1bp). + - FLOAT (e.g. 0.50) + + -F Minimum overlap required as a fraction of B. + - Default is 1E-9 (i.e., 1bp). + - FLOAT (e.g. 0.50) + + -r Require that the fraction overlap be reciprocal for A AND B. 
+ - In other words, if -f is 0.90 and -r is used, this requires + that B overlap 90% of A and A _also_ overlaps 90% of B. + + -e Require that the minimum fraction be satisfied for A OR B. + - In other words, if -e is used with -f 0.90 and -F 0.10 this requires + that either 90% of A is covered OR 10% of B is covered. + Without -e, both fractions would have to be satisfied. + + -split Treat "split" BAM or BED12 entries as distinct BED intervals. + + -g Provide a genome file to enforce consistent chromosome sort order + across input files. Only applies when used with -sorted option. + + -nonamecheck For sorted data, don't throw an error if the file has different naming conventions + for the same chromosome. ex. "chr1" vs "chr01". + + -sorted Use the "chromsweep" algorithm for sorted (-k1,1 -k2,2n) input. + + -names When using multiple databases, provide an alias for each that + will appear instead of a fileId when also printing the DB record. + + -filenames When using multiple databases, show each complete filename + instead of a fileId when also printing the DB record. + + -sortout When using multiple databases, sort the output DB hits + for each record. + + -bed If using BAM input, write output as BED. + + -header Print the header from the A file prior to results. + + -nobuf Disable buffered output. Using this option will cause each line + of output to be printed as it is generated, rather than saved + in a buffer. This will make printing large output files + noticeably slower, but can be useful in conjunction with + other software tools and scripts that need to process one + line of bedtools output at a time. + + -iobuf Specify amount of memory to use for input buffer. + Takes an integer argument. Optional suffixes K/M/G supported. + Note: currently has no effect with compressed files. + +Notes: + (1) When a BAM file is used for the A file, the alignment is retained if overlaps exist, + and excluded if an overlap cannot be found. 
If multiple overlaps exist, they are not
+        reported, as we are only testing for one or more overlaps.
+
+
+
+
+***** ERROR: No input file given. Exiting. *****
diff --git a/src/bedtools/bedtools_intersect/script.sh b/src/bedtools/bedtools_intersect/script.sh
new file mode 100644
index 00000000..2141859d
--- /dev/null
+++ b/src/bedtools/bedtools_intersect/script.sh
@@ -0,0 +1,61 @@
+#!/bin/bash
+
+## VIASH START
+## VIASH END
+# Unset every flag whose value is "false" so the ${par_x:+...} expansions below emit nothing for it.
+[[ "$par_write_a" == "false" ]] && unset par_write_a
+[[ "$par_write_b" == "false" ]] && unset par_write_b
+[[ "$par_left_outer_join" == "false" ]] && unset par_left_outer_join
+[[ "$par_write_overlap" == "false" ]] && unset par_write_overlap
+[[ "$par_write_overlap_plus" == "false" ]] && unset par_write_overlap_plus
+[[ "$par_report_A_if_no_overlap" == "false" ]] && unset par_report_A_if_no_overlap
+[[ "$par_number_of_overlaps_A" == "false" ]] && unset par_number_of_overlaps_A
+[[ "$par_report_no_overlaps_A" == "false" ]] && unset par_report_no_overlaps_A
+[[ "$par_uncompressed_bam" == "false" ]] && unset par_uncompressed_bam
+[[ "$par_same_strand" == "false" ]] && unset par_same_strand
+[[ "$par_opposite_strand" == "false" ]] && unset par_opposite_strand
+[[ "$par_reciprocal_overlap" == "false" ]] && unset par_reciprocal_overlap
+[[ "$par_either_overlap" == "false" ]] && unset par_either_overlap
+[[ "$par_split" == "false" ]] && unset par_split
+[[ "$par_nonamecheck" == "false" ]] && unset par_nonamecheck
+[[ "$par_sorted" == "false" ]] && unset par_sorted
+[[ "$par_filenames" == "false" ]] && unset par_filenames
+[[ "$par_sortout" == "false" ]] && unset par_sortout
+[[ "$par_bed" == "false" ]] && unset par_bed
+[[ "$par_header" == "false" ]] && unset par_header
+[[ "$par_no_buffer_output" == "false" ]] && unset par_no_buffer_output
+
+# Split the ';'-separated --input_b value into an array, one element per -b file.
+IFS=";" read -ra input <<< "$par_input_b"
+
+bedtools intersect \
+    ${par_write_a:+-wa} \
+    ${par_write_b:+-wb} \
+    ${par_left_outer_join:+-loj} \
+    ${par_write_overlap:+-wo} \
+    ${par_write_overlap_plus:+-wao} \
+    ${par_report_A_if_no_overlap:+-u} \
+    ${par_number_of_overlaps_A:+-c} \
+    ${par_report_no_overlaps_A:+-v} \
+    ${par_uncompressed_bam:+-ubam} \
+    ${par_same_strand:+-s} \
+    ${par_opposite_strand:+-S} \
+    ${par_min_overlap_A:+-f "$par_min_overlap_A"} \
+    ${par_min_overlap_B:+-F "$par_min_overlap_B"} \
+    ${par_reciprocal_overlap:+-r} \
+    ${par_either_overlap:+-e} \
+    ${par_split:+-split} \
+    ${par_genome:+-g "$par_genome"} \
+    ${par_nonamecheck:+-nonamecheck} \
+    ${par_sorted:+-sorted} \
+    ${par_names:+-names "$par_names"} \
+    ${par_filenames:+-filenames} \
+    ${par_sortout:+-sortout} \
+    ${par_bed:+-bed} \
+    ${par_header:+-header} \
+    ${par_no_buffer_output:+-nobuf} \
+    ${par_io_buffer_size:+-iobuf "$par_io_buffer_size"} \
+    -a "$par_input_a" \
+    ${par_input_b:+-b "${input[@]}"} \
+    > "$par_output"
+    
\ No newline at end of file
diff --git a/src/bedtools/bedtools_intersect/test.sh b/src/bedtools/bedtools_intersect/test.sh
new file mode 100644
index 00000000..b9405a59
--- /dev/null
+++ b/src/bedtools/bedtools_intersect/test.sh
@@ -0,0 +1,340 @@
+#!/bin/bash
+
+# exit on error
+set -e
+
+## VIASH START
+meta_executable="target/executable/bedtools/bedtools_intersect/bedtools_intersect"
+meta_resources_dir="src/bedtools/bedtools_intersect"
+## VIASH END
+
+#############################################
+# helper functions: each prints a message and exits 1 when its check fails
+assert_file_exists() {
+  [ -f "$1" ] || { echo "File '$1' does not exist" && exit 1; }
+}
+assert_file_not_empty() {
+  [ -s "$1" ] || { echo "File '$1' is empty but shouldn't be" && exit 1; }
+}
+assert_file_contains() {
+  grep -q "$2" "$1" || { echo "File '$1' does not contain '$2'" && exit 1; }
+}
+assert_identical_content() {
+  diff -a "$2" "$1" \
+    || { echo "Files are not identical!" && exit 1; }
+}
+#############################################
+
+# Create directories for tests
+echo "Creating Test Data..."
+mkdir -p test_data
+
+# Create and populate featuresA.bed
+printf "chr1\t100\t200\nchr1\t150\t250\nchr1\t300\t400\n" > "test_data/featuresA.bed"
+
+# Create and populate featuresB.bed
+printf "chr1\t180\t280\nchr1\t290\t390\nchr1\t500\t600\n" > "test_data/featuresB.bed"
+
+# Create and populate featuresC.bed
+printf "chr1\t120\t220\nchr1\t250\t350\nchr1\t500\t580\n" > "test_data/featuresC.bed"
+
+# Create and populate examples gff files
+# example1.gff
+printf "##gff-version 3\n" > "test_data/example1.gff"
+printf "chr1\t.\tgene\t1000\t2000\t.\t+\t.\tID=gene1;Name=Gene1\n" >> "test_data/example1.gff"
+printf "chr1\t.\tmRNA\t1000\t2000\t.\t+\t.\tID=transcript1;Parent=gene1\n" >> "test_data/example1.gff"
+printf "chr1\t.\texon\t1000\t1200\t.\t+\t.\tID=exon1;Parent=transcript1\n" >> "test_data/example1.gff"
+printf "chr1\t.\texon\t1500\t1700\t.\t+\t.\tID=exon2;Parent=transcript1\n" >> "test_data/example1.gff"
+printf "chr1\t.\tCDS\t1000\t1200\t.\t+\t0\tID=cds1;Parent=transcript1\n" >> "test_data/example1.gff"
+printf "chr1\t.\tCDS\t1500\t1700\t.\t+\t2\tID=cds2;Parent=transcript1\n" >> "test_data/example1.gff"
+# example2.gff
+printf "##gff-version 3\n" > "test_data/example2.gff"
+printf "chr1\t.\tgene\t1200\t1800\t.\t-\t.\tID=gene2;Name=Gene2\n" >> "test_data/example2.gff"
+printf "chr1\t.\tmRNA\t1400\t2000\t.\t-\t.\tID=transcript2;Parent=gene2\n" >> "test_data/example2.gff"
+printf "chr1\t.\texon\t1400\t2000\t.\t-\t.\tID=exon3;Parent=transcript2\n" >> "test_data/example2.gff"
+printf "chr1\t.\texon\t1600\t2000\t.\t-\t.\tID=exon4;Parent=transcript2\n" >> "test_data/example2.gff"
+printf "chr1\t.\tCDS\t3000\t3200\t.\t-\t1\tID=cds3;Parent=transcript2\n" >> "test_data/example2.gff"
+printf "chr1\t.\tCDS\t3500\t3700\t.\t-\t0\tID=cds4;Parent=transcript2\n" >> "test_data/example2.gff"
+
+# Create and populate expected output files for different tests
+printf "chr1\t180\t200\nchr1\t180\t250\nchr1\t300\t390\n" > "test_data/expected_default.bed"
+printf "chr1\t100\t200\nchr1\t150\t250\nchr1\t300\t400\n" > "test_data/expected_wa.bed"
+printf "chr1\t180\t200\tchr1\t180\t280\nchr1\t180\t250\tchr1\t180\t280\nchr1\t300\t390\tchr1\t290\t390\n" > "test_data/expected_wb.bed"
+printf "chr1\t100\t200\tchr1\t180\t280\nchr1\t150\t250\tchr1\t180\t280\nchr1\t300\t400\tchr1\t290\t390\n" > "test_data/expected_loj.bed"
+printf "chr1\t100\t200\tchr1\t180\t280\t20\nchr1\t150\t250\tchr1\t180\t280\t70\nchr1\t300\t400\tchr1\t290\t390\t90\n" > "test_data/expected_wo.bed"
+printf "chr1\t100\t200\nchr1\t150\t250\nchr1\t300\t400\n" > "test_data/expected_u.bed"
+printf "chr1\t100\t200\t1\nchr1\t150\t250\t1\nchr1\t300\t400\t1\n" > "test_data/expected_c.bed"
+printf "chr1\t180\t250\nchr1\t300\t390\n" > "test_data/expected_f50.bed"
+printf "chr1\t180\t250\nchr1\t300\t390\n" > "test_data/expected_f30.bed"
+printf "chr1\t180\t200\nchr1\t180\t250\nchr1\t300\t390\n" > "test_data/expected_f10.bed"
+printf "chr1\t180\t200\nchr1\t180\t250\nchr1\t300\t390\n" > "test_data/expected_r.bed"
+printf "chr1\t180\t200\nchr1\t120\t200\nchr1\t180\t250\nchr1\t150\t220\nchr1\t300\t390\nchr1\t300\t350\n" > "test_data/expected_multiple.bed"
+# expected gff file (first line truncates with '>' so stale content is never appended to)
+printf "chr1\t.\tgene\t1200\t1800\t.\t+\t.\tID=gene1;Name=Gene1\n" > "test_data/expected.gff"
+printf "chr1\t.\tgene\t1400\t2000\t.\t+\t.\tID=gene1;Name=Gene1\n" >> "test_data/expected.gff"
+printf "chr1\t.\tgene\t1400\t2000\t.\t+\t.\tID=gene1;Name=Gene1\n" >> "test_data/expected.gff"
+printf "chr1\t.\tgene\t1600\t2000\t.\t+\t.\tID=gene1;Name=Gene1\n" >> "test_data/expected.gff"
+printf "chr1\t.\tmRNA\t1200\t1800\t.\t+\t.\tID=transcript1;Parent=gene1\n" >> "test_data/expected.gff"
+printf "chr1\t.\tmRNA\t1400\t2000\t.\t+\t.\tID=transcript1;Parent=gene1\n" >> "test_data/expected.gff"
+printf "chr1\t.\tmRNA\t1400\t2000\t.\t+\t.\tID=transcript1;Parent=gene1\n" >> "test_data/expected.gff"
+printf "chr1\t.\tmRNA\t1600\t2000\t.\t+\t.\tID=transcript1;Parent=gene1\n" >> "test_data/expected.gff"
+printf "chr1\t.\texon\t1200\t1200\t.\t+\t.\tID=exon1;Parent=transcript1\n" >> "test_data/expected.gff"
+printf "chr1\t.\texon\t1500\t1700\t.\t+\t.\tID=exon2;Parent=transcript1\n" >> "test_data/expected.gff"
+printf "chr1\t.\texon\t1500\t1700\t.\t+\t.\tID=exon2;Parent=transcript1\n" >> "test_data/expected.gff"
+printf "chr1\t.\texon\t1500\t1700\t.\t+\t.\tID=exon2;Parent=transcript1\n" >> "test_data/expected.gff"
+printf "chr1\t.\texon\t1600\t1700\t.\t+\t.\tID=exon2;Parent=transcript1\n" >> "test_data/expected.gff"
+printf "chr1\t.\tCDS\t1200\t1200\t.\t+\t0\tID=cds1;Parent=transcript1\n" >> "test_data/expected.gff"
+printf "chr1\t.\tCDS\t1500\t1700\t.\t+\t2\tID=cds2;Parent=transcript1\n" >> "test_data/expected.gff"
+printf "chr1\t.\tCDS\t1500\t1700\t.\t+\t2\tID=cds2;Parent=transcript1\n" >> "test_data/expected.gff"
+printf "chr1\t.\tCDS\t1500\t1700\t.\t+\t2\tID=cds2;Parent=transcript1\n" >> "test_data/expected.gff"
+printf "chr1\t.\tCDS\t1600\t1700\t.\t+\t2\tID=cds2;Parent=transcript1\n" >> "test_data/expected.gff"
+
+# Test 1: Default intersect
+mkdir -p test1
+cd test1
+
+echo "> Run bedtools_intersect on BED files with default intersect"
+"$meta_executable" \
+  --input_a "../test_data/featuresA.bed" \
+  --input_b "../test_data/featuresB.bed" \
+  --output "output.bed"
+
+# checks
+assert_file_exists "output.bed"
+assert_file_not_empty "output.bed"
+assert_identical_content "output.bed" "../test_data/expected_default.bed"
+echo "- test1 succeeded -"
+
+cd ..
+
+# Test 2: Write A option
+mkdir -p test2
+cd test2
+
+echo "> Run bedtools_intersect on BED files with -wa option"
+"$meta_executable" \
+  --input_a "../test_data/featuresA.bed" \
+  --input_b "../test_data/featuresB.bed" \
+  --output "output.bed" \
+  --write_a
+
+# checks
+assert_file_exists "output.bed"
+assert_file_not_empty "output.bed"
+assert_identical_content "output.bed" "../test_data/expected_wa.bed"
+echo "- test2 succeeded -"
+
+cd ..
+
+# Test 3: -wb option
+mkdir -p test3
+cd test3
+
+echo "> Run bedtools_intersect on BED files with -wb option"
+"$meta_executable" \
+  --input_a "../test_data/featuresA.bed" \
+  --input_b "../test_data/featuresB.bed" \
+  --output "output.bed" \
+  --write_b
+
+# checks
+assert_file_exists "output.bed"
+assert_file_not_empty "output.bed"
+assert_identical_content "output.bed" "../test_data/expected_wb.bed"
+echo "- test3 succeeded -"
+
+cd ..
+
+# Test 4: -loj option
+mkdir -p test4
+cd test4
+
+echo "> Run bedtools_intersect on BED files with -loj option"
+"$meta_executable" \
+  --input_a "../test_data/featuresA.bed" \
+  --input_b "../test_data/featuresB.bed" \
+  --output "output.bed" \
+  --left_outer_join
+
+# checks
+assert_file_exists "output.bed"
+assert_file_not_empty "output.bed"
+assert_identical_content "output.bed" "../test_data/expected_loj.bed"
+echo "- test4 succeeded -"
+
+cd ..
+
+# Test 5: -wo option
+mkdir -p test5
+cd test5
+
+echo "> Run bedtools_intersect on BED files with -wo option"
+"$meta_executable" \
+  --input_a "../test_data/featuresA.bed" \
+  --input_b "../test_data/featuresB.bed" \
+  --output "output.bed" \
+  --write_overlap
+
+
+# checks
+assert_file_exists "output.bed"
+assert_file_not_empty "output.bed"
+assert_identical_content "output.bed" "../test_data/expected_wo.bed"
+echo "- test5 succeeded -"
+
+cd ..
+
+# Test 6: -u option (passed as a bare flag, like the other boolean options above)
+mkdir -p test6
+cd test6
+
+echo "> Run bedtools_intersect on BED files with -u option"
+"$meta_executable" \
+  --input_a "../test_data/featuresA.bed" \
+  --input_b "../test_data/featuresB.bed" \
+  --output "output.bed" \
+  --report_A_if_no_overlap
+
+# checks
+assert_file_exists "output.bed"
+assert_file_not_empty "output.bed"
+assert_identical_content "output.bed" "../test_data/expected_u.bed"
+echo "- test6 succeeded -"
+
+cd ..
+
+# Test 7: -c option (passed as a bare flag; boolean_true arguments take no value)
+mkdir -p test7
+cd test7
+
+echo "> Run bedtools_intersect on BED files with -c option"
+"$meta_executable" \
+  --input_a "../test_data/featuresA.bed" \
+  --input_b "../test_data/featuresB.bed" \
+  --output "output.bed" \
+  --number_of_overlaps_A
+
+# checks
+assert_file_exists "output.bed"
+assert_file_not_empty "output.bed"
+assert_identical_content "output.bed" "../test_data/expected_c.bed"
+echo "- test7 succeeded -"
+
+cd ..
+
+# Test 8: -f 0.50 option
+mkdir -p test8
+cd test8
+
+echo "> Run bedtools_intersect on BED files with -f 0.50 option"
+"$meta_executable" \
+  --input_a "../test_data/featuresA.bed" \
+  --input_b "../test_data/featuresB.bed" \
+  --output "output.bed" \
+  --min_overlap_A 0.50
+
+# checks
+assert_file_exists "output.bed"
+assert_file_not_empty "output.bed"
+assert_identical_content "output.bed" "../test_data/expected_f50.bed"
+echo "- test8 succeeded -"
+
+cd ..
+
+# Test 9: -f 0.30 option
+mkdir -p test9
+cd test9
+
+echo "> Run bedtools_intersect on BED files with -f 0.30 option"
+"$meta_executable" \
+  --input_a "../test_data/featuresA.bed" \
+  --input_b "../test_data/featuresB.bed" \
+  --output "output.bed" \
+  --min_overlap_A 0.30
+
+# checks
+assert_file_exists "output.bed"
+assert_file_not_empty "output.bed"
+assert_identical_content "output.bed" "../test_data/expected_f30.bed"
+echo "- test9 succeeded -"
+
+cd ..
+
+# Test 10: -f 0.10 option
+mkdir -p test10
+cd test10
+
+echo "> Run bedtools_intersect on BED files with -f 0.10 option"
+"$meta_executable" \
+  --input_a "../test_data/featuresA.bed" \
+  --input_b "../test_data/featuresB.bed" \
+  --output "output.bed" \
+  --min_overlap_A 0.10
+
+# checks
+assert_file_exists "output.bed"
+assert_file_not_empty "output.bed"
+assert_identical_content "output.bed" "../test_data/expected_f10.bed"
+echo "- test10 succeeded -"
+
+cd ..
+
+# Test 11: -r option (passed as a bare flag; boolean_true arguments take no value)
+mkdir -p test11
+cd test11
+
+echo "> Run bedtools_intersect on BED files with -r option"
+"$meta_executable" \
+  --input_a "../test_data/featuresA.bed" \
+  --input_b "../test_data/featuresB.bed" \
+  --output "output.bed" \
+  --reciprocal_overlap
+
+# checks
+assert_file_exists "output.bed"
+assert_file_not_empty "output.bed"
+assert_identical_content "output.bed" "../test_data/expected_r.bed"
+echo "- test11 succeeded -"
+
+cd ..
+
+
+# Test 12: Multiple files
+mkdir -p test12
+cd test12
+
+echo "> Run bedtools_intersect on multiple BED files"
+"$meta_executable" \
+  --input_a "../test_data/featuresA.bed" \
+  --input_b "../test_data/featuresB.bed" \
+  --input_b "../test_data/featuresC.bed" \
+  --output "output.bed"
+
+# checks
+assert_file_exists "output.bed"
+assert_file_not_empty "output.bed"
+assert_identical_content "output.bed" "../test_data/expected_multiple.bed"
+echo "- test12 succeeded -"
+
+cd ..
+
+# Test 13: GFF file format
+mkdir -p test13
+cd test13
+
+echo "> Run bedtools_intersect on GFF files"
+"$meta_executable" \
+  --input_a "../test_data/example1.gff" \
+  --input_b "../test_data/example2.gff" \
+  --output "output.bed"
+
+# checks
+assert_file_exists "output.bed"
+assert_file_not_empty "output.bed"
+assert_identical_content "output.bed" "../test_data/expected.gff"
+echo "- test13 succeeded -"
+
+cd ..
+
+echo "---- All tests succeeded! ----"
+exit 0