From 8f525f5e40301ad51bc1cd9587c0febbef84bd7d Mon Sep 17 00:00:00 2001 From: Theodoro Gasperin Terra Camargo <98555209+tgaspe@users.noreply.github.com> Date: Wed, 31 Jul 2024 16:20:11 -0300 Subject: [PATCH 1/6] Bedtools_Intersect (#94) * Initial Commit * Update config.vsh.yaml * creating templates * Update config.vsh.yaml * Update script.sh * Added output * Update config.vsh.yaml * Update test.sh * Update test.sh * More tests * small changes * update - change some var names - debugged - added more test * Update CHANGELOG.md * Update * Update help.txt --- CHANGELOG.md | 3 + .../bedtools_intersect/config.vsh.yaml | 255 +++++++++++++ src/bedtools/bedtools_intersect/help.txt | 119 ++++++ src/bedtools/bedtools_intersect/script.sh | 61 ++++ src/bedtools/bedtools_intersect/test.sh | 340 ++++++++++++++++++ 5 files changed, 778 insertions(+) create mode 100644 src/bedtools/bedtools_intersect/config.vsh.yaml create mode 100644 src/bedtools/bedtools_intersect/help.txt create mode 100644 src/bedtools/bedtools_intersect/script.sh create mode 100644 src/bedtools/bedtools_intersect/test.sh diff --git a/CHANGELOG.md b/CHANGELOG.md index c4575cb9..36681056 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -21,6 +21,9 @@ * `agat/agat_convert_sp_gff2gtf`: convert any GTF/GFF file into a proper GTF file (PR #76). +* `bedtools`: + - `bedtools/bedtools_intersect`: Allows one to screen for overlaps between two sets of genomic features (PR #94). + ## MINOR CHANGES * `busco` components: update BUSCO to `5.7.1` (PR #72). diff --git a/src/bedtools/bedtools_intersect/config.vsh.yaml b/src/bedtools/bedtools_intersect/config.vsh.yaml new file mode 100644 index 00000000..73dc0047 --- /dev/null +++ b/src/bedtools/bedtools_intersect/config.vsh.yaml @@ -0,0 +1,255 @@ +name: bedtools_intersect +namespace: bedtools +description: | + bedtools intersect allows one to screen for overlaps between two sets of genomic features. 
+ Moreover, it allows one to have fine control as to how the intersections are reported. + bedtools intersect works with both BED/GFF/VCF and BAM files as input. +keywords: [feature intersection, BAM, BED, GFF, VCF] +links: + documentation: https://bedtools.readthedocs.io/en/latest/content/tools/intersect.html + repository: https://github.com/arq5x/bedtools2 +references: + doi: 10.1093/bioinformatics/btq033 +license: GPL-2.0, MIT +requirements: + commands: [bedtools] +authors: + - __merge__: /src/_authors/theodoro_gasperin.yaml + roles: [ author, maintainer ] + +argument_groups: + - name: Inputs + arguments: + - name: --input_a + alternatives: -a + type: file + direction: input + description: | + The input file (BED/GFF/VCF/BAM) to be used as the -a file. + required: true + example: input_a.bed + + - name: --input_b + alternatives: -b + type: file + direction: input + multiple: true + description: | + The input file(s) (BED/GFF/VCF/BAM) to be used as the -b file(s). + required: true + example: input_b.bed + + - name: Outputs + arguments: + - name: --output + type: file + direction: output + description: | + The output BED file. + required: true + example: output.bed + + - name: Options + arguments: + - name: --write_a + alternatives: -wa + type: boolean_true + description: Write the original A entry for each overlap. + + - name: --write_b + alternatives: -wb + type: boolean_true + description: | + Write the original B entry for each overlap. + Useful for knowing _what_ A overlaps. Restricted by -f and -r. + + - name: --left_outer_join + alternatives: -loj + type: boolean_true + description: | + Perform a "left outer join". That is, for each feature in A report each overlap with B. + If no overlaps are found, report a NULL feature for B. + + - name: --write_overlap + alternatives: -wo + type: boolean_true + description: | + Write the original A and B entries plus the number of base pairs of overlap between the two features. + - Overlaps restricted by -f and -r. 
+ Only A features with overlap are reported. + + - name: --write_overlap_plus + alternatives: -wao + type: boolean_true + description: | + Write the original A and B entries plus the number of base pairs of overlap between the two features. + - Overlaps restricted by -f and -r. + However, A features w/o overlap are also reported with a NULL B feature and overlap = 0. + + - name: --report_A_if_no_overlap + alternatives: -u + type: boolean_true + description: | + Write the original A entry _once_ if _any_ overlaps are found in B. + - In other words, just report the fact >=1 hit was found. + - Overlaps restricted by -f and -r. + + - name: --number_of_overlaps_A + alternatives: -c + type: boolean_true + description: | + For each entry in A, report the number of overlaps with B. + - Reports 0 for A entries that have no overlap with B. + - Overlaps restricted by -f, -F, -r, and -s. + + - name: --report_no_overlaps_A + alternatives: -v + type: boolean_true + description: | + Only report those entries in A that have _no overlaps_ with B. + - Similar to "grep -v" (an homage). + + - name: --uncompressed_bam + alternatives: -ubam + type: boolean_true + description: Write uncompressed BAM output. Default writes compressed BAM. + + - name: --same_strand + alternatives: -s + type: boolean_true + description: | + Require same strandedness. That is, only report hits in B + that overlap A on the _same_ strand. + - By default, overlaps are reported without respect to strand. + + - name: --opposite_strand + alternatives: -S + type: boolean_true + description: | + Require different strandedness. That is, only report hits in B + that overlap A on the _opposite_ strand. + - By default, overlaps are reported without respect to strand. + + - name: --min_overlap_A + alternatives: -f + type: double + description: | + Minimum overlap required as a fraction of A. + - Default is 1E-9 (i.e., 1bp). + - FLOAT (e.g.
0.50) + example: 0.50 + + - name: --min_overlap_B + alternatives: -F + type: double + description: | + Minimum overlap required as a fraction of B. + - Default is 1E-9 (i.e., 1bp). + - FLOAT (e.g. 0.50) + example: 0.50 + + - name: --reciprocal_overlap + alternatives: -r + type: boolean_true + description: | + Require that the fraction overlap be reciprocal for A AND B. + - In other words, if -f is 0.90 and -r is used, this requires + that B overlap 90% of A and A _also_ overlaps 90% of B. + + - name: --either_overlap + alternatives: -e + type: boolean_true + description: | + Require that the minimum fraction be satisfied for A OR B. + - In other words, if -e is used with -f 0.90 and -F 0.10 this requires + that either 90% of A is covered OR 10% of B is covered. + Without -e, both fractions would have to be satisfied. + + - name: --split + type: boolean_true + description: Treat "split" BAM or BED12 entries as distinct BED intervals. + + - name: --genome + alternatives: -g + type: file + description: | + Provide a genome file to enforce consistent chromosome + sort order across input files. Only applies when used + with -sorted option. + example: genome.txt + + - name: --nonamecheck + type: boolean_true + description: | + For sorted data, don't throw an error if the file + has different naming conventions for the same chromosome + (e.g., "chr1" vs "chr01"). + + - name: --sorted + type: boolean_true + description: | + Use the "chromsweep" algorithm for sorted (-k1,1 -k2,2n) input. + + - name: --names + type: string + description: | + When using multiple databases, provide an alias + for each that will appear instead of a fileId when + also printing the DB record. + + - name: --filenames + type: boolean_true + description: When using multiple databases, show each complete filename instead of a fileId when also printing the DB record. + + - name: --sortout + type: boolean_true + description: When using multiple databases, sort the output DB hits for each record. 
+ + - name: --bed + type: boolean_true + description: If using BAM input, write output as BED. + + - name: --header + type: boolean_true + description: Print the header from the A file prior to results. + + - name: --no_buffer_output + alternatives: --nobuf + type: boolean_true + description: | + Disable buffered output. Using this option will cause each line + of output to be printed as it is generated, rather than saved + in a buffer. This will make printing large output files + noticeably slower, but can be useful in conjunction with + other software tools and scripts that need to process one + line of bedtools output at a time. + + - name: --io_buffer_size + alternatives: --iobuf + type: integer + description: | + Specify amount of memory to use for input buffer. + Takes an integer argument. Optional suffixes K/M/G supported. + Note: currently has no effect with compressed files. + +resources: + - type: bash_script + path: script.sh + +test_resources: + - type: bash_script + path: test.sh + +engines: + - type: docker + image: debian:stable-slim + setup: + - type: apt + packages: [bedtools, procps] + - type: docker + run: | + echo "bedtools: \"$(bedtools --version | sed -n 's/^bedtools //p')\"" > /var/software_versions.txt + +runners: + - type: executable + - type: nextflow diff --git a/src/bedtools/bedtools_intersect/help.txt b/src/bedtools/bedtools_intersect/help.txt new file mode 100644 index 00000000..d1bbdc20 --- /dev/null +++ b/src/bedtools/bedtools_intersect/help.txt @@ -0,0 +1,119 @@ +```bash +bedtools intersect +``` + +Tool: bedtools intersect (aka intersectBed) +Version: v2.30.0 +Summary: Report overlaps between two feature files. + +Usage: bedtools intersect [OPTIONS] -a -b + + Note: -b may be followed with multiple databases and/or + wildcard (*) character(s). +Options: + -wa Write the original entry in A for each overlap. + + -wb Write the original entry in B for each overlap. + - Useful for knowing _what_ A overlaps. Restricted by -f and -r. 
+ + -loj Perform a "left outer join". That is, for each feature in A + report each overlap with B. If no overlaps are found, + report a NULL feature for B. + + -wo Write the original A and B entries plus the number of base + pairs of overlap between the two features. + - Overlaps restricted by -f and -r. + Only A features with overlap are reported. + + -wao Write the original A and B entries plus the number of base + pairs of overlap between the two features. + - Overlapping features restricted by -f and -r. + However, A features w/o overlap are also reported + with a NULL B feature and overlap = 0. + + -u Write the original A entry _once_ if _any_ overlaps found in B. + - In other words, just report the fact >=1 hit was found. + - Overlaps restricted by -f and -r. + + -c For each entry in A, report the number of overlaps with B. + - Reports 0 for A entries that have no overlap with B. + - Overlaps restricted by -f, -F, -r, and -s. + + -C For each entry in A, separately report the number of + - overlaps with each B file on a distinct line. + - Reports 0 for A entries that have no overlap with B. + - Overlaps restricted by -f, -F, -r, and -s. + + -v Only report those entries in A that have _no overlaps_ with B. + - Similar to "grep -v" (an homage). + + -ubam Write uncompressed BAM output. Default writes compressed BAM. + + -s Require same strandedness. That is, only report hits in B + that overlap A on the _same_ strand. + - By default, overlaps are reported without respect to strand. + + -S Require different strandedness. That is, only report hits in B + that overlap A on the _opposite_ strand. + - By default, overlaps are reported without respect to strand. + + -f Minimum overlap required as a fraction of A. + - Default is 1E-9 (i.e., 1bp). + - FLOAT (e.g. 0.50) + + -F Minimum overlap required as a fraction of B. + - Default is 1E-9 (i.e., 1bp). + - FLOAT (e.g. 0.50) + + -r Require that the fraction overlap be reciprocal for A AND B. 
+ - In other words, if -f is 0.90 and -r is used, this requires + that B overlap 90% of A and A _also_ overlaps 90% of B. + + -e Require that the minimum fraction be satisfied for A OR B. + - In other words, if -e is used with -f 0.90 and -F 0.10 this requires + that either 90% of A is covered OR 10% of B is covered. + Without -e, both fractions would have to be satisfied. + + -split Treat "split" BAM or BED12 entries as distinct BED intervals. + + -g Provide a genome file to enforce consistent chromosome sort order + across input files. Only applies when used with -sorted option. + + -nonamecheck For sorted data, don't throw an error if the file has different naming conventions + for the same chromosome. ex. "chr1" vs "chr01". + + -sorted Use the "chromsweep" algorithm for sorted (-k1,1 -k2,2n) input. + + -names When using multiple databases, provide an alias for each that + will appear instead of a fileId when also printing the DB record. + + -filenames When using multiple databases, show each complete filename + instead of a fileId when also printing the DB record. + + -sortout When using multiple databases, sort the output DB hits + for each record. + + -bed If using BAM input, write output as BED. + + -header Print the header from the A file prior to results. + + -nobuf Disable buffered output. Using this option will cause each line + of output to be printed as it is generated, rather than saved + in a buffer. This will make printing large output files + noticeably slower, but can be useful in conjunction with + other software tools and scripts that need to process one + line of bedtools output at a time. + + -iobuf Specify amount of memory to use for input buffer. + Takes an integer argument. Optional suffixes K/M/G supported. + Note: currently has no effect with compressed files. + +Notes: + (1) When a BAM file is used for the A file, the alignment is retained if overlaps exist, + and excluded if an overlap cannot be found. 
If multiple overlaps exist, they are not + reported, as we are only testing for one or more overlaps. + + + + +***** ERROR: No input file given. Exiting. ***** diff --git a/src/bedtools/bedtools_intersect/script.sh b/src/bedtools/bedtools_intersect/script.sh new file mode 100644 index 00000000..2141859d --- /dev/null +++ b/src/bedtools/bedtools_intersect/script.sh @@ -0,0 +1,61 @@ +#!/bin/bash + +## VIASH START +## VIASH END + +[[ "$par_write_a" == "false" ]] && unset par_write_a +[[ "$par_write_b" == "false" ]] && unset par_write_b +[[ "$par_left_outer_join" == "false" ]] && unset par_left_outer_join +[[ "$par_write_overlap" == "false" ]] && unset par_write_overlap +[[ "$par_write_overlap_plus" == "false" ]] && unset par_write_overlap_plus +[[ "$par_report_A_if_no_overlap" == "false" ]] && unset par_report_A_if_no_overlap +[[ "$par_number_of_overlaps_A" == "false" ]] && unset par_number_of_overlaps_A +[[ "$par_report_no_overlaps_A" == "false" ]] && unset par_report_no_overlaps_A +[[ "$par_uncompressed_bam" == "false" ]] && unset par_uncompressed_bam +[[ "$par_same_strand" == "false" ]] && unset par_same_strand +[[ "$par_opposite_strand" == "false" ]] && unset par_opposite_strand +[[ "$par_reciprocal_overlap" == "false" ]] && unset par_reciprocal_overlap +[[ "$par_either_overlap" == "false" ]] && unset par_either_overlap +[[ "$par_split" == "false" ]] && unset par_split +[[ "$par_nonamecheck" == "false" ]] && unset par_nonamecheck +[[ "$par_sorted" == "false" ]] && unset par_sorted +[[ "$par_filenames" == "false" ]] && unset par_filenames +[[ "$par_sortout" == "false" ]] && unset par_sortout +[[ "$par_bed" == "false" ]] && unset par_bed +[[ "$par_header" == "false" ]] && unset par_header +[[ "$par_no_buffer_output" == "false" ]] && unset par_no_buffer_output + +# Split the ';'-separated --input_b value into an array so that each path is passed to -b as exactly one argument +IFS=";" read -ra input <<< "$par_input_b" + +bedtools intersect \ + ${par_write_a:+-wa} \ + ${par_write_b:+-wb} \ + ${par_left_outer_join:+-loj} \ + ${par_write_overlap:+-wo} \ + 
${par_write_overlap_plus:+-wao} \ + ${par_report_A_if_no_overlap:+-u} \ + ${par_number_of_overlaps_A:+-c} \ + ${par_report_no_overlaps_A:+-v} \ + ${par_uncompressed_bam:+-ubam} \ + ${par_same_strand:+-s} \ + ${par_opposite_strand:+-S} \ + ${par_min_overlap_A:+-f "$par_min_overlap_A"} \ + ${par_min_overlap_B:+-F "$par_min_overlap_B"} \ + ${par_reciprocal_overlap:+-r} \ + ${par_either_overlap:+-e} \ + ${par_split:+-split} \ + ${par_genome:+-g "$par_genome"} \ + ${par_nonamecheck:+-nonamecheck} \ + ${par_sorted:+-sorted} \ + ${par_names:+-names "$par_names"} \ + ${par_filenames:+-filenames} \ + ${par_sortout:+-sortout} \ + ${par_bed:+-bed} \ + ${par_header:+-header} \ + ${par_no_buffer_output:+-nobuf} \ + ${par_io_buffer_size:+-iobuf "$par_io_buffer_size"} \ + -a "$par_input_a" \ + ${par_input_b:+-b "${input[@]}"} \ + > "$par_output" + \ No newline at end of file diff --git a/src/bedtools/bedtools_intersect/test.sh b/src/bedtools/bedtools_intersect/test.sh new file mode 100644 index 00000000..b9405a59 --- /dev/null +++ b/src/bedtools/bedtools_intersect/test.sh @@ -0,0 +1,340 @@ +#!/bin/bash + +# exit on error +set -e + +## VIASH START +meta_executable="target/executable/bedtools/bedtools_intersect/bedtools_intersect" +meta_resources_dir="src/bedtools/bedtools_intersect" +## VIASH END + +############################################# +# helper functions +assert_file_exists() { + [ -f "$1" ] || { echo "File '$1' does not exist" && exit 1; } +} +assert_file_not_empty() { + [ -s "$1" ] || { echo "File '$1' is empty but shouldn't be" && exit 1; } +} +assert_file_contains() { + grep -q "$2" "$1" || { echo "File '$1' does not contain '$2'" && exit 1; } +} +assert_identical_content() { + diff -a "$2" "$1" \ + || (echo "Files are not identical!" && exit 1) +} +############################################# + +# Create directories for tests +echo "Creating Test Data..."
+mkdir -p test_data + +# Create and populate featuresA.bed +printf "chr1\t100\t200\nchr1\t150\t250\nchr1\t300\t400\n" > "test_data/featuresA.bed" + +# Create and populate featuresB.bed +printf "chr1\t180\t280\nchr1\t290\t390\nchr1\t500\t600\n" > "test_data/featuresB.bed" + +# Create and populate featuresC.bed +printf "chr1\t120\t220\nchr1\t250\t350\nchr1\t500\t580\n" > "test_data/featuresC.bed" + +# Create and populate examples gff files +# example1.gff +printf "##gff-version 3\n" > "test_data/example1.gff" +printf "chr1\t.\tgene\t1000\t2000\t.\t+\t.\tID=gene1;Name=Gene1\n" >> "test_data/example1.gff" +printf "chr1\t.\tmRNA\t1000\t2000\t.\t+\t.\tID=transcript1;Parent=gene1\n" >> "test_data/example1.gff" +printf "chr1\t.\texon\t1000\t1200\t.\t+\t.\tID=exon1;Parent=transcript1\n" >> "test_data/example1.gff" +printf "chr1\t.\texon\t1500\t1700\t.\t+\t.\tID=exon2;Parent=transcript1\n" >> "test_data/example1.gff" +printf "chr1\t.\tCDS\t1000\t1200\t.\t+\t0\tID=cds1;Parent=transcript1\n" >> "test_data/example1.gff" +printf "chr1\t.\tCDS\t1500\t1700\t.\t+\t2\tID=cds2;Parent=transcript1\n" >> "test_data/example1.gff" +# example2.gff +printf "##gff-version 3\n" > "test_data/example2.gff" +printf "chr1\t.\tgene\t1200\t1800\t.\t-\t.\tID=gene2;Name=Gene2\n" >> "test_data/example2.gff" +printf "chr1\t.\tmRNA\t1400\t2000\t.\t-\t.\tID=transcript2;Parent=gene2\n" >> "test_data/example2.gff" +printf "chr1\t.\texon\t1400\t2000\t.\t-\t.\tID=exon3;Parent=transcript2\n" >> "test_data/example2.gff" +printf "chr1\t.\texon\t1600\t2000\t.\t-\t.\tID=exon4;Parent=transcript2\n" >> "test_data/example2.gff" +printf "chr1\t.\tCDS\t3000\t3200\t.\t-\t1\tID=cds3;Parent=transcript2\n" >> "test_data/example2.gff" +printf "chr1\t.\tCDS\t3500\t3700\t.\t-\t0\tID=cds4;Parent=transcript2\n" >> "test_data/example2.gff" + +# Create and populate expected output files for different tests +printf "chr1\t180\t200\nchr1\t180\t250\nchr1\t300\t390\n" > "test_data/expected_default.bed" +printf 
"chr1\t100\t200\nchr1\t150\t250\nchr1\t300\t400\n" > "test_data/expected_wa.bed" +printf "chr1\t180\t200\tchr1\t180\t280\nchr1\t180\t250\tchr1\t180\t280\nchr1\t300\t390\tchr1\t290\t390\n" > "test_data/expected_wb.bed" +printf "chr1\t100\t200\tchr1\t180\t280\nchr1\t150\t250\tchr1\t180\t280\nchr1\t300\t400\tchr1\t290\t390\n" > "test_data/expected_loj.bed" +printf "chr1\t100\t200\tchr1\t180\t280\t20\nchr1\t150\t250\tchr1\t180\t280\t70\nchr1\t300\t400\tchr1\t290\t390\t90\n" > "test_data/expected_wo.bed" +printf "chr1\t100\t200\nchr1\t150\t250\nchr1\t300\t400\n" > "test_data/expected_u.bed" +printf "chr1\t100\t200\t1\nchr1\t150\t250\t1\nchr1\t300\t400\t1\n" > "test_data/expected_c.bed" +printf "chr1\t180\t250\nchr1\t300\t390\n" > "test_data/expected_f50.bed" +printf "chr1\t180\t250\nchr1\t300\t390\n" > "test_data/expected_f30.bed" +printf "chr1\t180\t200\nchr1\t180\t250\nchr1\t300\t390\n" > "test_data/expected_f10.bed" +printf "chr1\t180\t200\nchr1\t180\t250\nchr1\t300\t390\n" > "test_data/expected_r.bed" +printf "chr1\t180\t200\nchr1\t120\t200\nchr1\t180\t250\nchr1\t150\t220\nchr1\t300\t390\nchr1\t300\t350\n" > "test_data/expected_multiple.bed" +# expected gff file +printf "chr1\t.\tgene\t1200\t1800\t.\t+\t.\tID=gene1;Name=Gene1\n" >> "test_data/expected.gff" +printf "chr1\t.\tgene\t1400\t2000\t.\t+\t.\tID=gene1;Name=Gene1\n" >> "test_data/expected.gff" +printf "chr1\t.\tgene\t1400\t2000\t.\t+\t.\tID=gene1;Name=Gene1\n" >> "test_data/expected.gff" +printf "chr1\t.\tgene\t1600\t2000\t.\t+\t.\tID=gene1;Name=Gene1\n" >> "test_data/expected.gff" +printf "chr1\t.\tmRNA\t1200\t1800\t.\t+\t.\tID=transcript1;Parent=gene1\n" >> "test_data/expected.gff" +printf "chr1\t.\tmRNA\t1400\t2000\t.\t+\t.\tID=transcript1;Parent=gene1\n" >> "test_data/expected.gff" +printf "chr1\t.\tmRNA\t1400\t2000\t.\t+\t.\tID=transcript1;Parent=gene1\n" >> "test_data/expected.gff" +printf "chr1\t.\tmRNA\t1600\t2000\t.\t+\t.\tID=transcript1;Parent=gene1\n" >> "test_data/expected.gff" +printf 
"chr1\t.\texon\t1200\t1200\t.\t+\t.\tID=exon1;Parent=transcript1\n" >> "test_data/expected.gff" +printf "chr1\t.\texon\t1500\t1700\t.\t+\t.\tID=exon2;Parent=transcript1\n" >> "test_data/expected.gff" +printf "chr1\t.\texon\t1500\t1700\t.\t+\t.\tID=exon2;Parent=transcript1\n" >> "test_data/expected.gff" +printf "chr1\t.\texon\t1500\t1700\t.\t+\t.\tID=exon2;Parent=transcript1\n" >> "test_data/expected.gff" +printf "chr1\t.\texon\t1600\t1700\t.\t+\t.\tID=exon2;Parent=transcript1\n" >> "test_data/expected.gff" +printf "chr1\t.\tCDS\t1200\t1200\t.\t+\t0\tID=cds1;Parent=transcript1\n" >> "test_data/expected.gff" +printf "chr1\t.\tCDS\t1500\t1700\t.\t+\t2\tID=cds2;Parent=transcript1\n" >> "test_data/expected.gff" +printf "chr1\t.\tCDS\t1500\t1700\t.\t+\t2\tID=cds2;Parent=transcript1\n" >> "test_data/expected.gff" +printf "chr1\t.\tCDS\t1500\t1700\t.\t+\t2\tID=cds2;Parent=transcript1\n" >> "test_data/expected.gff" +printf "chr1\t.\tCDS\t1600\t1700\t.\t+\t2\tID=cds2;Parent=transcript1\n" >> "test_data/expected.gff" + +# Test 1: Default intersect +mkdir test1 +cd test1 + +echo "> Run bedtools_intersect on BED files with default intersect" +"$meta_executable" \ + --input_a "../test_data/featuresA.bed" \ + --input_b "../test_data/featuresB.bed" \ + --output "output.bed" + +# checks +assert_file_exists "output.bed" +assert_file_not_empty "output.bed" +assert_identical_content "output.bed" "../test_data/expected_default.bed" +echo "- test1 succeeded -" + +cd .. + +# Test 2: Write A option +mkdir test2 +cd test2 + +echo "> Run bedtools_intersect on BED files with -wa option" +"$meta_executable" \ + --input_a "../test_data/featuresA.bed" \ + --input_b "../test_data/featuresB.bed" \ + --output "output.bed" \ + --write_a + +# checks +assert_file_exists "output.bed" +assert_file_not_empty "output.bed" +assert_identical_content "output.bed" "../test_data/expected_wa.bed" +echo "- test2 succeeded -" + +cd .. 
+ +# Test 3: -wb option +mkdir test3 +cd test3 + +echo "> Run bedtools_intersect on BED files with -wb option" +"$meta_executable" \ + --input_a "../test_data/featuresA.bed" \ + --input_b "../test_data/featuresB.bed" \ + --output "output.bed" \ + --write_b + +# checks +assert_file_exists "output.bed" +assert_file_not_empty "output.bed" +assert_identical_content "output.bed" "../test_data/expected_wb.bed" +echo "- test3 succeeded -" + +cd .. + +# Test 4: -loj option +mkdir test4 +cd test4 + +echo "> Run bedtools_intersect on BED files with -loj option" +"$meta_executable" \ + --input_a "../test_data/featuresA.bed" \ + --input_b "../test_data/featuresB.bed" \ + --output "output.bed" \ + --left_outer_join + +# checks +assert_file_exists "output.bed" +assert_file_not_empty "output.bed" +assert_identical_content "output.bed" "../test_data/expected_loj.bed" +echo "- test4 succeeded -" + +cd .. + +# Test 5: -wo option +mkdir test5 +cd test5 + +echo "> Run bedtools_intersect on BED files with -wo option" +"$meta_executable" \ + --input_a "../test_data/featuresA.bed" \ + --input_b "../test_data/featuresB.bed" \ + --output "output.bed" \ + --write_overlap + + +# checks +assert_file_exists "output.bed" +assert_file_not_empty "output.bed" +assert_identical_content "output.bed" "../test_data/expected_wo.bed" +echo "- test5 succeeded -" + +cd .. + +# Test 6: -u option +mkdir test6 +cd test6 + +echo "> Run bedtools_intersect on BED files with -u option" +"$meta_executable" \ + --input_a "../test_data/featuresA.bed" \ + --input_b "../test_data/featuresB.bed" \ + --output "output.bed" \ + --report_A_if_no_overlap true + +# checks +assert_file_exists "output.bed" +assert_file_not_empty "output.bed" +assert_identical_content "output.bed" "../test_data/expected_u.bed" +echo "- test6 succeeded -" + +cd .. 
+ +# Test 7: -c option +mkdir test7 +cd test7 + +echo "> Run bedtools_intersect on BED files with -c option" +"$meta_executable" \ + --input_a "../test_data/featuresA.bed" \ + --input_b "../test_data/featuresB.bed" \ + --output "output.bed" \ + --number_of_overlaps_A true + +# checks +assert_file_exists "output.bed" +assert_file_not_empty "output.bed" +assert_identical_content "output.bed" "../test_data/expected_c.bed" +echo "- test7 succeeded -" + +cd .. + +# Test 8: -f 0.50 option +mkdir test8 +cd test8 + +echo "> Run bedtools_intersect on BED files with -f 0.50 option" +"$meta_executable" \ + --input_a "../test_data/featuresA.bed" \ + --input_b "../test_data/featuresB.bed" \ + --output "output.bed" \ + --min_overlap_A 0.50 + +# checks +assert_file_exists "output.bed" +assert_file_not_empty "output.bed" +assert_identical_content "output.bed" "../test_data/expected_f50.bed" +echo "- test8 succeeded -" + +cd .. + +# Test 9: -f 0.30 option +mkdir test9 +cd test9 + +echo "> Run bedtools_intersect on BED files with -f 0.30 option" +"$meta_executable" \ + --input_a "../test_data/featuresA.bed" \ + --input_b "../test_data/featuresB.bed" \ + --output "output.bed" \ + --min_overlap_A 0.30 + +# checks +assert_file_exists "output.bed" +assert_file_not_empty "output.bed" +assert_identical_content "output.bed" "../test_data/expected_f30.bed" +echo "- test9 succeeded -" + +cd .. + +# Test 10: -f 0.10 option +mkdir test10 +cd test10 + +echo "> Run bedtools_intersect on BED files with -f 0.10 option" +"$meta_executable" \ + --input_a "../test_data/featuresA.bed" \ + --input_b "../test_data/featuresB.bed" \ + --output "output.bed" \ + --min_overlap_A 0.10 + +# checks +assert_file_exists "output.bed" +assert_file_not_empty "output.bed" +assert_identical_content "output.bed" "../test_data/expected_f10.bed" +echo "- test10 succeeded -" + +cd .. 
+ +# Test 11: -r option +mkdir test11 +cd test11 + +echo "> Run bedtools_intersect on BED files with -r option" +"$meta_executable" \ + --input_a "../test_data/featuresA.bed" \ + --input_b "../test_data/featuresB.bed" \ + --output "output.bed" \ + --reciprocal_overlap true + +# checks +assert_file_exists "output.bed" +assert_file_not_empty "output.bed" +assert_identical_content "output.bed" "../test_data/expected_r.bed" +echo "- test11 succeeded -" + +cd .. + + +# Test 12: Multiple files +mkdir test12 +cd test12 + +echo "> Run bedtools_intersect on multiple BED files" +"$meta_executable" \ + --input_a "../test_data/featuresA.bed" \ + --input_b "../test_data/featuresB.bed" \ + --input_b "../test_data/featuresC.bed" \ + --output "output.bed" + +# checks +assert_file_exists "output.bed" +assert_file_not_empty "output.bed" +assert_identical_content "output.bed" "../test_data/expected_multiple.bed" +echo "- test12 succeeded -" + +cd .. + +# Test 13: GFF file format +mkdir test13 +cd test13 + +echo "> Run bedtools_intersect on GFF files" +"$meta_executable" \ + --input_a "../test_data/example1.gff" \ + --input_b "../test_data/example2.gff" \ + --output "output.bed" + +# checks +assert_file_exists "output.bed" +assert_file_not_empty "output.bed" +assert_identical_content "output.bed" "../test_data/expected.gff" +echo "- test13 succeeded -" + +cd .. + +echo "---- All tests succeeded!
----" +exit 0 From de8b4248b64e0d2e04a6f20c35212403c57a1058 Mon Sep 17 00:00:00 2001 From: Theodoro Gasperin Terra Camargo <98555209+tgaspe@users.noreply.github.com> Date: Wed, 31 Jul 2024 16:21:15 -0300 Subject: [PATCH 2/6] Bedtools sort (#98) * Initial Commmit * config file * Update config.vsh.yaml * Update script.sh * Update test.sh * test files * Update test.sh * adding tests * two more test * more tests * more tests * Update CHANGELOG.md * removing some files * Update help.txt --------- Co-authored-by: Robrecht Cannoodt --- CHANGELOG.md | 1 + src/bedtools/bedtools_sort/config.vsh.yaml | 93 ++++++++ src/bedtools/bedtools_sort/help.txt | 21 ++ src/bedtools/bedtools_sort/script.sh | 27 +++ src/bedtools/bedtools_sort/test.sh | 264 +++++++++++++++++++++ 5 files changed, 406 insertions(+) create mode 100644 src/bedtools/bedtools_sort/config.vsh.yaml create mode 100644 src/bedtools/bedtools_sort/help.txt create mode 100644 src/bedtools/bedtools_sort/script.sh create mode 100644 src/bedtools/bedtools_sort/test.sh diff --git a/CHANGELOG.md b/CHANGELOG.md index 36681056..1debf12b 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -23,6 +23,7 @@ * `bedtools`: - `bedtools/bedtools_intersect`: Allows one to screen for overlaps between two sets of genomic features (PR #94). + - `bedtools/bedtools_sort`: Sorts a feature file (bed/gff/vcf) by chromosome and other criteria (PR #98). ## MINOR CHANGES diff --git a/src/bedtools/bedtools_sort/config.vsh.yaml b/src/bedtools/bedtools_sort/config.vsh.yaml new file mode 100644 index 00000000..5024bd39 --- /dev/null +++ b/src/bedtools/bedtools_sort/config.vsh.yaml @@ -0,0 +1,93 @@ +name: bedtools_sort +namespace: bedtools +description: Sorts a feature file (bed/gff/vcf) by chromosome and other criteria. 
+keywords: [sort, BED, GFF, VCF] +links: + documentation: https://bedtools.readthedocs.io/en/latest/content/tools/sort.html + repository: https://github.com/arq5x/bedtools2 +references: + doi: 10.1093/bioinformatics/btq033 +license: GPL-2.0, MIT +requirements: + commands: [bedtools] +authors: + - __merge__: /src/_authors/theodoro_gasperin.yaml + roles: [ author, maintainer ] + +argument_groups: + - name: Inputs + arguments: + - name: --input + alternatives: -i + type: file + description: Input file (bed/gff/vcf) to be sorted. + required: true + + - name: Outputs + arguments: + - name: --output + alternatives: -o + type: file + direction: output + description: Output sorted file (bed/gff/vcf) to be written. + + - name: Options + arguments: + - name: --sizeA + type: boolean_true + description: Sort by feature size in ascending order. + + - name: --sizeD + type: boolean_true + description: Sort by feature size in descending order. + + - name: --chrThenSizeA + type: boolean_true + description: Sort by chrom (asc), then feature size (asc). + + - name: --chrThenSizeD + type: boolean_true + description: Sort by chrom (asc), then feature size (desc). + + - name: --chrThenScoreA + type: boolean_true + description: Sort by chrom (asc), then score (asc). + + - name: --chrThenScoreD + type: boolean_true + description: Sort by chrom (asc), then score (desc). + + - name: --genome + alternatives: -g + type: file + description: Sort according to the chromosomes declared in "genome.txt" + + - name: --faidx + type: file + description: Sort according to the chromosomes declared in "names.txt" + + - name: --header + type: boolean_true + description: Print the header from the A file prior to results. 
+ +resources: + - type: bash_script + path: script.sh + +test_resources: + - type: bash_script + path: test.sh + +engines: + - type: docker + image: debian:stable-slim + setup: + - type: apt + packages: [bedtools, procps] + - type: docker + run: | + echo "bedtools: \"$(bedtools --version | sed -n 's/^bedtools //p')\"" > /var/software_versions.txt + +runners: + - type: executable + - type: nextflow \ No newline at end of file diff --git a/src/bedtools/bedtools_sort/help.txt b/src/bedtools/bedtools_sort/help.txt new file mode 100644 index 00000000..09266c69 --- /dev/null +++ b/src/bedtools/bedtools_sort/help.txt @@ -0,0 +1,21 @@ +```bash +bedtools sort +``` + +Tool: bedtools sort (aka sortBed) +Version: v2.30.0 +Summary: Sorts a feature file in various and useful ways. + +Usage: bedtools sort [OPTIONS] -i + +Options: + -sizeA Sort by feature size in ascending order. + -sizeD Sort by feature size in descending order. + -chrThenSizeA Sort by chrom (asc), then feature size (asc). + -chrThenSizeD Sort by chrom (asc), then feature size (desc). + -chrThenScoreA Sort by chrom (asc), then score (asc). + -chrThenScoreD Sort by chrom (asc), then score (desc). + -g (names.txt) Sort according to the chromosomes declared in "genome.txt" + -faidx (names.txt) Sort according to the chromosomes declared in "names.txt" + -header Print the header from the A file prior to results. 
+ diff --git a/src/bedtools/bedtools_sort/script.sh b/src/bedtools/bedtools_sort/script.sh new file mode 100644 index 00000000..e7f712d7 --- /dev/null +++ b/src/bedtools/bedtools_sort/script.sh @@ -0,0 +1,27 @@ +#!/bin/bash + +## VIASH START +## VIASH END + +# Unset parameters +[[ "$par_sizeA" == "false" ]] && unset par_sizeA +[[ "$par_sizeD" == "false" ]] && unset par_sizeD +[[ "$par_chrThenSizeA" == "false" ]] && unset par_chrThenSizeA +[[ "$par_chrThenSizeD" == "false" ]] && unset par_chrThenSizeD +[[ "$par_chrThenScoreA" == "false" ]] && unset par_chrThenScoreA +[[ "$par_chrThenScoreD" == "false" ]] && unset par_chrThenScoreD +[[ "$par_header" == "false" ]] && unset par_header + +# Execute bedtools sort with the provided arguments +bedtools sort \ + ${par_sizeA:+-sizeA} \ + ${par_sizeD:+-sizeD} \ + ${par_chrThenSizeA:+-chrThenSizeA} \ + ${par_chrThenSizeD:+-chrThenSizeD} \ + ${par_chrThenScoreA:+-chrThenScoreA} \ + ${par_chrThenScoreD:+-chrThenScoreD} \ + ${par_genome:+-g "$par_genome"} \ + ${par_faidx:+-faidx "$par_faidx"} \ + ${par_header:+-header} \ + -i "$par_input" \ + > "$par_output" diff --git a/src/bedtools/bedtools_sort/test.sh b/src/bedtools/bedtools_sort/test.sh new file mode 100644 index 00000000..bf402c35 --- /dev/null +++ b/src/bedtools/bedtools_sort/test.sh @@ -0,0 +1,264 @@ +#!/bin/bash + +# exit on error +set -e + +## VIASH START +meta_executable="target/executable/bedtools/bedtools_sort/bedtools_sort" +meta_resources_dir="src/bedtools/bedtools_sort" +## VIASH END + +############################################# +# helper functions +assert_file_exists() { + [ -f "$1" ] || { echo "File '$1' does not exist" && exit 1; } +} +assert_file_not_empty() { + [ -s "$1" ] || { echo "File '$1' is empty but shouldn't be" && exit 1; } +} +assert_file_contains() { + grep -q "$2" "$1" || { echo "File '$1' does not contain '$2'" && exit 1; } +} +assert_identical_content() { + diff -a "$2" "$1" \ + || (echo "Files are not identical!" 
&& exit 1) +} +############################################# + +# Create directories for tests +echo "Creating Test Data..." +mkdir -p test_data + +# Create and populate example files +printf "#Header\nchr1\t300\t400\nchr1\t150\t250\nchr1\t100\t200" > "test_data/featureA.bed" +printf "chr2\t290\t400\nchr2\t180\t220\nchr1\t500\t600" > "test_data/featureB.bed" +printf "chr1\t100\t200\tfeature1\t960\nchr1\t150\t250\tfeature2\t850\nchr1\t300\t400\tfeature3\t740\nchr2\t290\t390\tfeature4\t630\nchr2\t180\t280\tfeature5\t920\nchr3\t120\t220\tfeature6\t410\n" > "test_data/featureC.bed" +printf "chr1\nchr3\nchr2\n" > "test_data/genome.txt" +printf "chr1\t248956422\nchr3\t242193529\nchr2\t198295559\n" > "test_data/genome.fai" + +# Create and populate example.gff file +printf "##gff-version 3\n" > "test_data/example.gff" +printf "chr1\t.\tgene\t1000\t2000\t.\t+\t.\tID=gene1;Name=Gene1\n" >> "test_data/example.gff" +printf "chr3\t.\tmRNA\t1000\t2000\t.\t+\t.\tID=transcript1;Parent=gene1\n" >> "test_data/example.gff" +printf "chr1\t.\texon\t1000\t1200\t.\t+\t.\tID=exon1;Parent=transcript1\n" >> "test_data/example.gff" +printf "chr2\t.\texon\t1500\t1700\t.\t+\t.\tID=exon2;Parent=transcript1\n" >> "test_data/example.gff" +printf "chr1\t.\tCDS\t1000\t1200\t.\t+\t0\tID=cds1;Parent=transcript1\n" >> "test_data/example.gff" +printf "chr1\t.\tCDS\t1500\t1700\t.\t+\t2\tID=cds2;Parent=transcript1\n" >> "test_data/example.gff" + +# Create expected output files +printf "chr1\t100\t200\nchr1\t150\t250\nchr1\t300\t400\n" > "test_data/expected_sorted_A.bed" +printf "chr2\t180\t220\nchr1\t500\t600\nchr2\t290\t400\n" > "test_data/expected_sizeA.bed" +printf "chr2\t290\t400\nchr1\t500\t600\nchr2\t180\t220\n" > "test_data/expected_sizeD.bed" +printf "chr1\t500\t600\nchr2\t180\t220\nchr2\t290\t400\n" > "test_data/expected_chrThenSizeA.bed" +printf "chr1\t500\t600\nchr2\t290\t400\nchr2\t180\t220\n" > "test_data/expected_chrThenSizeD.bed" +printf 
"chr1\t300\t400\tfeature3\t740\nchr1\t150\t250\tfeature2\t850\nchr1\t100\t200\tfeature1\t960\nchr2\t290\t390\tfeature4\t630\nchr2\t180\t280\tfeature5\t920\nchr3\t120\t220\tfeature6\t410\n" > "test_data/expected_chrThenScoreA.bed" +printf "chr1\t100\t200\tfeature1\t960\nchr1\t150\t250\tfeature2\t850\nchr1\t300\t400\tfeature3\t740\nchr2\t180\t280\tfeature5\t920\nchr2\t290\t390\tfeature4\t630\nchr3\t120\t220\tfeature6\t410\n" > "test_data/expected_chrThenScoreD.bed" +printf "chr1\t100\t200\tfeature1\t960\nchr1\t150\t250\tfeature2\t850\nchr1\t300\t400\tfeature3\t740\nchr3\t120\t220\tfeature6\t410\nchr2\t180\t280\tfeature5\t920\nchr2\t290\t390\tfeature4\t630\n" > "test_data/expected_genome.bed" +printf "#Header\nchr1\t100\t200\nchr1\t150\t250\nchr1\t300\t400\n" > "test_data/expected_header.bed" + +# expected_sorted.gff +printf "chr1\t.\tgene\t1000\t2000\t.\t+\t.\tID=gene1;Name=Gene1\n" >> "test_data/expected_sorted.gff" +printf "chr1\t.\texon\t1000\t1200\t.\t+\t.\tID=exon1;Parent=transcript1\n" >> "test_data/expected_sorted.gff" +printf "chr1\t.\tCDS\t1000\t1200\t.\t+\t0\tID=cds1;Parent=transcript1\n" >> "test_data/expected_sorted.gff" +printf "chr1\t.\tCDS\t1500\t1700\t.\t+\t2\tID=cds2;Parent=transcript1\n" >> "test_data/expected_sorted.gff" +printf "chr2\t.\texon\t1500\t1700\t.\t+\t.\tID=exon2;Parent=transcript1\n" >> "test_data/expected_sorted.gff" +printf "chr3\t.\tmRNA\t1000\t2000\t.\t+\t.\tID=transcript1;Parent=gene1\n" >> "test_data/expected_sorted.gff" + +# Test 1: Default sort on BED file +mkdir test1 +cd test1 + +echo "> Run bedtools_sort on BED file" +"$meta_executable" \ + --input "../test_data/featureA.bed" \ + --output "output.bed" + +# checks +assert_file_exists "output.bed" +assert_file_not_empty "output.bed" +assert_identical_content "output.bed" "../test_data/expected_sorted_A.bed" +echo "- test1 succeeded -" + +cd .. 
+ +# Test 2: Default sort on GFF file +mkdir test2 +cd test2 + +echo "> Run bedtools_sort on GFF file" +"$meta_executable" \ + --input "../test_data/example.gff" \ + --output "output.gff" + +# checks +assert_file_exists "output.gff" +assert_file_not_empty "output.gff" +assert_identical_content "output.gff" "../test_data/expected_sorted.gff" +echo "- test2 succeeded -" + +cd .. + +# Test 3: Sort on sizeA +mkdir test3 +cd test3 + +echo "> Run bedtools_sort on BED file with sizeA" +"$meta_executable" \ + --input "../test_data/featureB.bed" \ + --output "output.bed" \ + --sizeA + +# checks +assert_file_exists "output.bed" +assert_file_not_empty "output.bed" +assert_identical_content "output.bed" "../test_data/expected_sizeA.bed" +echo "- test3 succeeded -" + +cd .. + +# Test 4: Sort on sizeD +mkdir test4 +cd test4 + +echo "> Run bedtools_sort on BED file with sizeD" +"$meta_executable" \ + --input "../test_data/featureB.bed" \ + --output "output.bed" \ + --sizeD + +# checks +assert_file_exists "output.bed" +assert_file_not_empty "output.bed" +assert_identical_content "output.bed" "../test_data/expected_sizeD.bed" +echo "- test4 succeeded -" + +cd .. + +# Test 5: Sort on chrThenSizeA +mkdir test5 +cd test5 + +echo "> Run bedtools_sort on BED file with chrThenSizeA" +"$meta_executable" \ + --input "../test_data/featureB.bed" \ + --output "output.bed" \ + --chrThenSizeA + +# checks +assert_file_exists "output.bed" +assert_file_not_empty "output.bed" +assert_identical_content "output.bed" "../test_data/expected_chrThenSizeA.bed" +echo "- test5 succeeded -" + +cd .. 
+ +# Test 6: Sort on chrThenSizeD +mkdir test6 +cd test6 + +echo "> Run bedtools_sort on BED file with chrThenSizeD" +"$meta_executable" \ + --input "../test_data/featureB.bed" \ + --output "output.bed" \ + --chrThenSizeD + +# checks +assert_file_exists "output.bed" +assert_file_not_empty "output.bed" +assert_identical_content "output.bed" "../test_data/expected_chrThenSizeD.bed" +echo "- test6 succeeded -" + +cd .. + +# Test 7: Sort on chrThenScoreA +mkdir test7 +cd test7 + +echo "> Run bedtools_sort on BED file with chrThenScoreA" +"$meta_executable" \ + --input "../test_data/featureC.bed" \ + --output "output.bed" \ + --chrThenScoreA + +# checks +assert_file_exists "output.bed" +assert_file_not_empty "output.bed" +assert_identical_content "output.bed" "../test_data/expected_chrThenScoreA.bed" +echo "- test7 succeeded -" + +cd .. + +# Test 8: Sort on chrThenScoreD +mkdir test8 +cd test8 + +echo "> Run bedtools_sort on BED file with chrThenScoreD" +"$meta_executable" \ + --input "../test_data/featureC.bed" \ + --output "output.bed" \ + --chrThenScoreD + +# checks +assert_file_exists "output.bed" +assert_file_not_empty "output.bed" +assert_identical_content "output.bed" "../test_data/expected_chrThenScoreD.bed" +echo "- test8 succeeded -" + +cd .. + +# Test 9: Sort according to genome file +mkdir test9 +cd test9 + +echo "> Run bedtools_sort on BED file according to genome file" +"$meta_executable" \ + --input "../test_data/featureC.bed" \ + --output "output.bed" \ + --genome "../test_data/genome.txt" + +# checks +assert_file_exists "output.bed" +assert_file_not_empty "output.bed" +assert_identical_content "output.bed" "../test_data/expected_genome.bed" +echo "- test9 succeeded -" + +cd .. 
+ +# Test 10: Sort according to faidx file +mkdir test10 +cd test10 + +echo "> Run bedtools_sort on BED file according to faidx file" +"$meta_executable" \ + --input "../test_data/featureC.bed" \ + --output "output.bed" \ + --faidx "../test_data/genome.fai" + +# checks +assert_file_exists "output.bed" +assert_file_not_empty "output.bed" +assert_identical_content "output.bed" "../test_data/expected_genome.bed" +echo "- test10 succeeded -" + +cd .. + +# Test 11: Sort with header +mkdir test11 +cd test11 + +echo "> Run bedtools_sort on BED file with header" +"$meta_executable" \ + --input "../test_data/featureA.bed" \ + --output "output.bed" \ + --header + +# checks +assert_file_exists "output.bed" +assert_file_not_empty "output.bed" +assert_identical_content "output.bed" "../test_data/expected_header.bed" +echo "- test11 succeeded -" + +cd .. + +echo "---- All tests succeeded! ----" +exit 0 From 4aa0a893d2f8be5f0d03797afc15a04c53664367 Mon Sep 17 00:00:00 2001 From: Leila011 Date: Wed, 31 Jul 2024 21:23:22 +0200 Subject: [PATCH 3/6] Add agat convert bed2gff (#97) * add config * add help * add script * add test data and expected output file * add script to get test data * add tests * update changelog * fix path to test data --------- Co-authored-by: Robrecht Cannoodt --- CHANGELOG.md | 9 +- src/agat/agat_convert_bed2gff/config.vsh.yaml | 86 ++++++++++++++++++ src/agat/agat_convert_bed2gff/help.txt | 89 +++++++++++++++++++ src/agat/agat_convert_bed2gff/script.sh | 19 ++++ src/agat/agat_convert_bed2gff/test.sh | 27 ++++++ .../test_data/agat_convert_bed2gff_1.gff | 12 +++ .../agat_convert_bed2gff/test_data/script.sh | 10 +++ .../agat_convert_bed2gff/test_data/test.bed | 1 + 8 files changed, 250 insertions(+), 3 deletions(-) create mode 100644 src/agat/agat_convert_bed2gff/config.vsh.yaml create mode 100644 src/agat/agat_convert_bed2gff/help.txt create mode 100644 src/agat/agat_convert_bed2gff/script.sh create mode 100644 src/agat/agat_convert_bed2gff/test.sh create mode 
100644 src/agat/agat_convert_bed2gff/test_data/agat_convert_bed2gff_1.gff create mode 100755 src/agat/agat_convert_bed2gff/test_data/script.sh create mode 100644 src/agat/agat_convert_bed2gff/test_data/test.bed diff --git a/CHANGELOG.md b/CHANGELOG.md index 1debf12b..9dd2389c 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -19,11 +19,14 @@ - `seqtk/seqtk_subseq`: Extract the sequences (complete or subsequence) from the FASTA/FASTQ files based on a provided sequence IDs or region coordinates file (PR #85). -* `agat/agat_convert_sp_gff2gtf`: convert any GTF/GFF file into a proper GTF file (PR #76). +* `agat`: + - `agat_convert_sp_gff2gtf`: convert any GTF/GFF file into a proper GTF file (PR #76). + - `/agat_convert_bed2gff`: convert bed file to gff format (PR #97). * `bedtools`: - - `bedtools/bedtools_intersect`: Allows one to screen for overlaps between two sets of genomic features (PR #94). - - `bedtools/bedtools_sort`: Sorts a feature file (bed/gff/vcf) by chromosome and other criteria (PR #98). + - `bedtools/bedtools_intersect`: Allows one to screen for overlaps between two sets of genomic features (PR #94). + - `bedtools/bedtools_sort`: Sorts a feature file (bed/gff/vcf) by chromosome and other criteria (PR #98). + ## MINOR CHANGES diff --git a/src/agat/agat_convert_bed2gff/config.vsh.yaml b/src/agat/agat_convert_bed2gff/config.vsh.yaml new file mode 100644 index 00000000..a0fafc44 --- /dev/null +++ b/src/agat/agat_convert_bed2gff/config.vsh.yaml @@ -0,0 +1,86 @@ +name: agat_convert_bed2gff +namespace: agat +description: | + The script takes a bed file as input, and will translate it in gff format. The BED format is described here: https://genome.ucsc.edu/FAQ/FAQformat.html#format1. The script converts 0-based, half-open [start-1, end) bed file to 1-based, closed [start, end] General Feature Format v3 (GFF3).
+keywords: [gene annotations, GFF conversion] +links: + homepage: https://github.com/NBISweden/AGAT + documentation: https://agat.readthedocs.io/en/latest/tools/agat_convert_bed2gff.html + issue_tracker: https://github.com/NBISweden/AGAT/issues + repository: https://github.com/NBISweden/AGAT +references: + doi: 10.5281/zenodo.3552717 +license: GPL-3.0 +authors: + - __merge__: /src/_authors/leila_paquay.yaml + roles: [ author, maintainer ] +argument_groups: + - name: Inputs + arguments: + - name: --bed + description: Input bed file that will be converted. + type: file + required: true + direction: input + example: input.bed + - name: Outputs + arguments: + - name: --output + alternatives: [-o, --out, --outfile, --gff] + description: Output GFF file. If no output file is specified, the output will be written to STDOUT. + type: file + direction: output + required: true + example: output.gff + - name: Arguments + arguments: + - name: --source + description: | + The source informs about the tool used to produce the data and is stored in 2nd field of a gff file. Example: Stringtie, Maker, Augustus, etc. [default: data] + type: string + required: false + example: Stringtie + - name: --primary_tag + description: | + The primary_tag corresponds to the data type and is stored in 3rd field of a gff file. Example: gene, mRNA, CDS, etc. [default: gene] + type: string + required: false + example: gene + - name: --inflate_off + description: | + By default we inflate the block fields (blockCount, blockSizes, blockStarts) to create subfeatures of the main feature (primary_tag). The type of subfeature created is based on the inflate_type parameter. If you do not want this inflating behaviour you can deactivate it by using the --inflate_off option. + type: boolean_false + - name: --inflate_type + description: | + Feature type (3rd column in gff) created when inflate parameter activated [default: exon]. 
+ type: string + required: false + example: exon + - name: --verbose + description: add verbosity + type: boolean_true + - name: --config + alternatives: [-c] + description: | + Input agat config file. By default AGAT takes as input agat_config.yaml file from the working directory if any, otherwise it takes the original agat_config.yaml shipped with AGAT. To get the agat_config.yaml locally type: "agat config --expose". The --config option gives you the possibility to use your own AGAT config file (located elsewhere or named differently). + type: file + required: false + example: custom_agat_config.yaml +resources: + - type: bash_script + path: script.sh +test_resources: + - type: bash_script + path: test.sh + - type: file + path: test_data +engines: + - type: docker + image: quay.io/biocontainers/agat:1.4.0--pl5321hdfd78af_0 + setup: + - type: docker + run: | + agat --version | sed 's/AGAT\s\(.*\)/agat: "\1"/' > /var/software_versions.txt +runners: + - type: executable + - type: nextflow \ No newline at end of file diff --git a/src/agat/agat_convert_bed2gff/help.txt b/src/agat/agat_convert_bed2gff/help.txt new file mode 100644 index 00000000..56e953d7 --- /dev/null +++ b/src/agat/agat_convert_bed2gff/help.txt @@ -0,0 +1,89 @@ +```sh +agat_convert_bed2gff.pl --help +``` + ------------------------------------------------------------------------------ +| Another GFF Analysis Toolkit (AGAT) - Version: v1.4.0 | +| https://github.com/NBISweden/AGAT | +| National Bioinformatics Infrastructure Sweden (NBIS) - www.nbis.se | + ------------------------------------------------------------------------------ + + +Name: + agat_convert_bed2gff.pl + +Description: + The script takes a bed file as input, and will translate it in gff + format. The BED format is described here: + https://genome.ucsc.edu/FAQ/FAQformat.html#format1 The script converts + 0-based, half-open [start-1, end) bed file to 1-based, closed [start, + end] General Feature Format v3 (GFF3).
+ +Usage: + agat_convert_bed2gff.pl --bed infile.bed [ -o outfile ] + agat_convert_bed2gff.pl -h + +Options: + --bed Input bed file that will be converted. + + --source + The source informs about the tool used to produce the data and + is stored in 2nd field of a gff file. Example: + Stringtie,Maker,Augustus,etc. [default: data] + + --primary_tag + The primary_tag corresponds to the data type and is stored in + 3rd field of a gff file. Example: gene,mRNA,CDS,etc. [default: + gene] + + --inflate_off + By default we inflate the block fields (blockCount, blockSizes, + blockStarts) to create subfeatures of the main feature + (primary_tag). The type of subfeature created is based on the + inflate_type parameter. If you do not want this inflating + behaviour you can deactivate it by using the --inflate_off + option. + + --inflate_type + Feature type (3rd column in gff) created when inflate parameter + activated [default: exon]. + + --verbose + add verbosity + + -o , --output , --out , --outfile or --gff + Output GFF file. If no output file is specified, the output will + be written to STDOUT. + + -c or --config + String - Input agat config file. By default AGAT takes as input + agat_config.yaml file from the working directory if any, + otherwise it takes the orignal agat_config.yaml shipped with + AGAT. To get the agat_config.yaml locally type: "agat config + --expose". The --config option gives you the possibility to use + your own AGAT config file (located elsewhere or named + differently). + + -h or --help + Display this helpful text. + +Feedback: + Did you find a bug?: + Do not hesitate to report bugs to help us keep track of the bugs and + their resolution. Please use the GitHub issue tracking system available + at this address: + + https://github.com/NBISweden/AGAT/issues + + Ensure that the bug was not already reported by searching under Issues. + If you're unable to find an (open) issue addressing the problem, open a new one. 
+ Try as much as possible to include in the issue when relevant: + - a clear description, + - as much relevant information as possible, + - the command used, + - a data sample, + - an explanation of the expected behaviour that is not occurring. + + Do you want to contribute?: + You are very welcome, visit this address for the Contributing + guidelines: + https://github.com/NBISweden/AGAT/blob/master/CONTRIBUTING.md diff --git a/src/agat/agat_convert_bed2gff/script.sh b/src/agat/agat_convert_bed2gff/script.sh new file mode 100644 index 00000000..fbeb9206 --- /dev/null +++ b/src/agat/agat_convert_bed2gff/script.sh @@ -0,0 +1,19 @@ +#!/bin/bash + +## VIASH START +## VIASH END + +# unset flags that should not be passed, so the ${par_x:+...} expansions below emit nothing for them +[[ "$par_inflate_off" == "true" ]] && unset par_inflate_off +[[ "$par_verbose" == "false" ]] && unset par_verbose + +# run agat_convert_bed2gff.pl; every argument line except the last must end in a continuation backslash +agat_convert_bed2gff.pl \ + --bed "$par_bed" \ + -o "$par_output" \ + ${par_source:+--source "${par_source}"} \ + ${par_primary_tag:+--primary_tag "${par_primary_tag}"} \ + ${par_inflate_off:+--inflate_off} \ + ${par_inflate_type:+--inflate_type "${par_inflate_type}"} \ + ${par_verbose:+--verbose} \ + ${par_config:+--config "${par_config}"} diff --git a/src/agat/agat_convert_bed2gff/test.sh b/src/agat/agat_convert_bed2gff/test.sh new file mode 100644 index 00000000..6e7d43f3 --- /dev/null +++ b/src/agat/agat_convert_bed2gff/test.sh @@ -0,0 +1,27 @@ +#!/bin/bash + +## VIASH START +## VIASH END + +test_dir="${meta_resources_dir}/test_data" +out_dir="${meta_resources_dir}/out_data" + +echo "> Run $meta_name with test data" +"$meta_executable" \ + --bed "$test_dir/test.bed" \ + --output "$out_dir/output.gff" + +echo ">> Checking output" +[ ! -f "$out_dir/output.gff" ] && echo "Output file output.gff does not exist" && exit 1 + +echo ">> Check if output is empty" +[ !
-s "$out_dir/output.gff" ] && echo "Output file output.gff is empty" && exit 1 + +echo ">> Check if output matches expected output" +diff "$out_dir/output.gff" "$test_dir/agat_convert_bed2gff_1.gff" +if [ $? -ne 0 ]; then + echo "Output file output.gff does not match expected output" + exit 1 +fi + +echo "> Test successful" \ No newline at end of file diff --git a/src/agat/agat_convert_bed2gff/test_data/agat_convert_bed2gff_1.gff b/src/agat/agat_convert_bed2gff/test_data/agat_convert_bed2gff_1.gff new file mode 100644 index 00000000..587e3d09 --- /dev/null +++ b/src/agat/agat_convert_bed2gff/test_data/agat_convert_bed2gff_1.gff @@ -0,0 +1,12 @@ +##gff-version 3 +scaffold625 data gene 337818 343277 . + . ID=1;Name=CLUHART00000008717;blockCount=4;blockSizes=154%2C109%2C111%2C1314;blockStarts=0%2C2915%2C3700%2C4146;itemRgb=255%2C0%2C0;thickEnd=343033;thickStart=337914 +scaffold625 data exon 337818 337971 . + . ID=exon1;Parent=1 +scaffold625 data exon 340733 340841 . + . ID=exon2;Parent=1 +scaffold625 data exon 341518 341628 . + . ID=exon3;Parent=1 +scaffold625 data exon 341964 343277 . + . ID=exon4;Parent=1 +scaffold625 data CDS 337915 337971 . + 0 ID=CDS1;Parent=1 +scaffold625 data CDS 340733 340841 . + 0 ID=CDS2;Parent=1 +scaffold625 data CDS 341518 341628 . + 2 ID=CDS3;Parent=1 +scaffold625 data CDS 341964 343033 . + 2 ID=CDS4;Parent=1 +scaffold625 data five_prime_UTR 337818 337914 . + . ID=five_prime_UTR1;Parent=1 +scaffold625 data three_prime_UTR 343034 343277 . + . ID=three_prime_UTR1;Parent=1 diff --git a/src/agat/agat_convert_bed2gff/test_data/script.sh b/src/agat/agat_convert_bed2gff/test_data/script.sh new file mode 100755 index 00000000..d1206a42 --- /dev/null +++ b/src/agat/agat_convert_bed2gff/test_data/script.sh @@ -0,0 +1,10 @@ +#!/bin/bash + +# clone repo +if [ ! 
-d /tmp/agat_source ]; then + git clone --depth 1 --single-branch --branch master https://github.com/NBISweden/AGAT /tmp/agat_source +fi + +# copy test data +cp -r /tmp/agat_source/t/scripts_output/in/test.bed src/agat/agat_convert_bed2gff/test_data/test.bed +cp -r /tmp/agat_source/t/scripts_output/out/agat_convert_bed2gff_1.gff src/agat/agat_convert_bed2gff/test_data/agat_convert_bed2gff_1.gff \ No newline at end of file diff --git a/src/agat/agat_convert_bed2gff/test_data/test.bed b/src/agat/agat_convert_bed2gff/test_data/test.bed new file mode 100644 index 00000000..bfeba3bb --- /dev/null +++ b/src/agat/agat_convert_bed2gff/test_data/test.bed @@ -0,0 +1 @@ +scaffold625 337817 343277 CLUHART00000008717 0 + 337914 343033 255,0,0 4 154,109,111,1314 0,2915,3700,4146 From ede5850f577cbfe8ca5edf8525703535b12b4a36 Mon Sep 17 00:00:00 2001 From: Leila011 Date: Sat, 10 Aug 2024 08:51:39 +0200 Subject: [PATCH 4/6] Add agat convert embl2gff (#99) * add config * add help * add test data and expected output * add script to get test data * add running script * add test script * update description * update changelog * cleanup * fix path to copy test data * pull the test data again * fix typo GTF => GFF * fix tests * fix output file: replace by generated output * fix test data: add --emblmygff3 * cleanup * config: add longer name to `-k` and `-d` --- CHANGELOG.md | 2 + .../agat_convert_embl2gff/config.vsh.yaml | 84 +++++++++++++++++++ src/agat/agat_convert_embl2gff/help.txt | 78 +++++++++++++++++ src/agat/agat_convert_embl2gff/script.sh | 23 +++++ src/agat/agat_convert_embl2gff/test.sh | 28 +++++++ .../test_data/agat_convert_embl2gff_1.embl | 51 +++++++++++ .../test_data/agat_convert_embl2gff_1.gff | 10 +++ .../agat_convert_embl2gff/test_data/script.sh | 10 +++ 8 files changed, 286 insertions(+) create mode 100644 src/agat/agat_convert_embl2gff/config.vsh.yaml create mode 100644 src/agat/agat_convert_embl2gff/help.txt create mode 100644 src/agat/agat_convert_embl2gff/script.sh 
create mode 100644 src/agat/agat_convert_embl2gff/test.sh create mode 100644 src/agat/agat_convert_embl2gff/test_data/agat_convert_embl2gff_1.embl create mode 100644 src/agat/agat_convert_embl2gff/test_data/agat_convert_embl2gff_1.gff create mode 100755 src/agat/agat_convert_embl2gff/test_data/script.sh diff --git a/CHANGELOG.md b/CHANGELOG.md index 9dd2389c..3c2f347a 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -28,6 +28,8 @@ - `bedtools/bedtools_sort`: Sorts a feature file (bed/gff/vcf) by chromosome and other criteria (PR #98). +* `agat/agat_convert_embl2gff`: convert an EMBL file into GFF format (PR #99). + ## MINOR CHANGES * `busco` components: update BUSCO to `5.7.1` (PR #72). diff --git a/src/agat/agat_convert_embl2gff/config.vsh.yaml b/src/agat/agat_convert_embl2gff/config.vsh.yaml new file mode 100644 index 00000000..99ceec46 --- /dev/null +++ b/src/agat/agat_convert_embl2gff/config.vsh.yaml @@ -0,0 +1,84 @@ +name: agat_convert_embl2gff +namespace: agat +description: | + The script takes an EMBL file as input, and will translate it in gff format. +keywords: [gene annotations, GFF conversion] +links: + homepage: https://github.com/NBISweden/AGAT + documentation: https://agat.readthedocs.io/en/latest/tools/agat_convert_embl2gff.html + issue_tracker: https://github.com/NBISweden/AGAT/issues + repository: https://github.com/NBISweden/AGAT +references: + doi: 10.5281/zenodo.3552717 +license: GPL-3.0 +authors: + - __merge__: /src/_authors/leila_paquay.yaml + roles: [ author, maintainer ] + +argument_groups: + - name: Inputs + arguments: + - name: --embl + description: Input EMBL file that will be read. + type: file + required: true + direction: input + example: input.embl + - name: Outputs + arguments: + - name: --output + alternatives: [-o, --out, --outfile, --gff] + description: Output GFF file. If no output file is specified, the output will be written to STDOUT. 
+ type: file + direction: output + required: false + example: output.gff + - name: Arguments + arguments: + - name: --emblmygff3 + description: | + Means that the EMBL flat file comes from the EMBLmyGFF3 software. This is an EMBL format dedicated for submission and contains particularity to deal with. This parameter is needed to get a proper sequence id in the GFF3 from an embl made with EMBLmyGFF3. + type: boolean_true + - name: --primary_tag + alternatives: [--pt, -t] + description: | + List of "primary tag". Useful to discard or keep specific features. Multiple tags must be comma-separated. + type: string + multiple: true + required: false + example: [tag1, tag2] + - name: --discard + alternatives: [-d] + description: | + Means that primary tags provided by the option "primary_tag" will be discarded. + type: boolean_true + - name: --keep + alternatives: [-k] + description: | + Means that only primary tags provided by the option "primary_tag" will be kept. + type: boolean_true + - name: --config + alternatives: [-c] + description: | + Input agat config file. By default AGAT takes as input agat_config.yaml file from the working directory if any, otherwise it takes the original agat_config.yaml shipped with AGAT. To get the agat_config.yaml locally type: "agat config --expose". The --config option gives you the possibility to use your own AGAT config file (located elsewhere or named differently). 
+ type: file + required: false + example: custom_agat_config.yaml +resources: + - type: bash_script + path: script.sh +test_resources: + - type: bash_script + path: test.sh + - type: file + path: test_data +engines: + - type: docker + image: quay.io/biocontainers/agat:1.4.0--pl5321hdfd78af_0 + setup: + - type: docker + run: | + agat --version | sed 's/AGAT\s\(.*\)/agat: "\1"/' > /var/software_versions.txt +runners: + - type: executable + - type: nextflow \ No newline at end of file diff --git a/src/agat/agat_convert_embl2gff/help.txt b/src/agat/agat_convert_embl2gff/help.txt new file mode 100644 index 00000000..5fce4939 --- /dev/null +++ b/src/agat/agat_convert_embl2gff/help.txt @@ -0,0 +1,78 @@ + ```sh +agat_convert_embl2gff.pl --help +``` + + ------------------------------------------------------------------------------ +| Another GFF Analysis Toolkit (AGAT) - Version: v1.4.0 | +| https://github.com/NBISweden/AGAT | +| National Bioinformatics Infrastructure Sweden (NBIS) - www.nbis.se | + ------------------------------------------------------------------------------ + + +Name: + agat_converter_embl2gff.pl + +Description: + The script takes an EMBL file as input, and will translate it in gff + format. + +Usage: + agat_converter_embl2gff.pl --embl infile.embl [ -o outfile ] + +Options: + --embl Input EMBL file that will be read + + --emblmygff3 + Bolean - Means that the EMBL flat file comes from the EMBLmyGFF3 + software. This is an EMBL format dedicated for submission and + contains particularity to deal with. This parameter is needed to + get a proper sequence id in the GFF3 from an embl made with + EMBLmyGFF3. + + --primary_tag, --pt, -t + List of "primary tag". Useful to discard or keep specific + features. Multiple tags must be coma-separated. + + -d Bolean - Means that primary tags provided by the option + "primary_tag" will be discarded. + + -k Bolean - Means that only primary tags provided by the option + "primary_tag" will be kept. 
+ + -o, --output, --out, --outfile or --gff + Output GFF file. If no output file is specified, the output will + be written to STDOUT. + + -c or --config + String - Input agat config file. By default AGAT takes as input + agat_config.yaml file from the working directory if any, + otherwise it takes the orignal agat_config.yaml shipped with + AGAT. To get the agat_config.yaml locally type: "agat config + --expose". The --config option gives you the possibility to use + your own AGAT config file (located elsewhere or named + differently). + + -h or --help + Display this helpful text. + +Feedback: + Did you find a bug?: + Do not hesitate to report bugs to help us keep track of the bugs and + their resolution. Please use the GitHub issue tracking system available + at this address: + + https://github.com/NBISweden/AGAT/issues + + Ensure that the bug was not already reported by searching under Issues. + If you're unable to find an (open) issue addressing the problem, open a new one. + Try as much as possible to include in the issue when relevant: + - a clear description, + - as much relevant information as possible, + - the command used, + - a data sample, + - an explanation of the expected behaviour that is not occurring. 
+ + Do you want to contribute?: + You are very welcome, visit this address for the Contributing + guidelines: + https://github.com/NBISweden/AGAT/blob/master/CONTRIBUTING.md diff --git a/src/agat/agat_convert_embl2gff/script.sh b/src/agat/agat_convert_embl2gff/script.sh new file mode 100644 index 00000000..63ab8df0 --- /dev/null +++ b/src/agat/agat_convert_embl2gff/script.sh @@ -0,0 +1,23 @@ +#!/bin/bash + +## VIASH START +## VIASH END + + +# unset flags +[[ "$par_emblmygff3" == "false" ]] && unset par_emblmygff3 +[[ "$par_discard" == "false" ]] && unset par_discard +[[ "$par_keep" == "false" ]] && unset par_keep + +# replace ';' with ',' +par_primary_tag=$(echo $par_primary_tag | tr ';' ',') + +# run agat_convert_embl2gff +agat_convert_embl2gff.pl \ + --embl "$par_embl" \ + -o "$par_output" \ + ${par_emblmygff3:+--emblmygff3} \ + ${par_primary_tag:+--primary_tag "${par_primary_tag}"} \ + ${par_discard:+-d} \ + ${par_keep:+-k} \ + ${par_config:+--config "${par_config}"} diff --git a/src/agat/agat_convert_embl2gff/test.sh b/src/agat/agat_convert_embl2gff/test.sh new file mode 100644 index 00000000..81d24aaa --- /dev/null +++ b/src/agat/agat_convert_embl2gff/test.sh @@ -0,0 +1,28 @@ +#!/bin/bash + +## VIASH START +## VIASH END + +test_dir="${meta_resources_dir}/test_data" +out_dir="${meta_resources_dir}/out_data" + +echo "> Run $meta_name with test data and --emblmygff3" +"$meta_executable" \ + --embl "$test_dir/agat_convert_embl2gff_1.embl" \ + --output "$out_dir/output.gff" \ + --emblmygff3 + +echo ">> Checking output" +[ ! -f "$out_dir/output.gff" ] && echo "Output file output.gff does not exist" && exit 1 + +echo ">> Check if output is empty" +[ ! -s "$out_dir/output.gff" ] && echo "Output file output.gff is empty" && exit 1 + +echo ">> Check if output matches expected output" +diff "$out_dir/output.gff" "$test_dir/agat_convert_embl2gff_1.gff" +if [ $? 
-ne 0 ]; then + echo "Output file output.gff does not match expected output" + exit 1 +fi + +echo "> Test successful" \ No newline at end of file diff --git a/src/agat/agat_convert_embl2gff/test_data/agat_convert_embl2gff_1.embl b/src/agat/agat_convert_embl2gff/test_data/agat_convert_embl2gff_1.embl new file mode 100644 index 00000000..aa4f50aa --- /dev/null +++ b/src/agat/agat_convert_embl2gff/test_data/agat_convert_embl2gff_1.embl @@ -0,0 +1,51 @@ +ID patatrac; SV 1; circular; genomic DNA; XXX; PRO; 317941 BP. +XX +AC XXX; +XX +AC * _ERS324955|SC|contig000001 +XX +PR Project:PRJEBNNNN; +XX +DE XXX +XX +RN [1] +RP 1-2149 +RA XXX; +RT ; +RL Submitted {(DD-MMM-YYYY)} to the INSDC. +XX +FH Key Location/Qualifiers +FH +FT source 1..588788 +FT /organism={"scientific organism name"} +FT /mol_type={"in vivo molecule type of sequence"} +XX +SQ Sequence 588788 BP; 101836 A; 193561 C; 192752 G; 100639 T; 0 other; + tgcgtactcg aagagacgcg cccagattat ataagggcgt cgtctcgagg ccgacggcgc 60 + gccggcgagt acgcgtgatc cacaacccga agcgaccgtc gggagaccga gggtcgtcga 120 + gggtggatac gttcctgcct tcgtgccggg aaacggccga agggaacgtg gcgacctgcg 180 +// +ID fdssf; SV 1; circular; genomic DNA; XXX; PRO; 317941 BP. +XX +AC XXX; +XX +AC * _ERS344554 +XX +PR Project:PRJEBNNNN; +XX +DE XXX +XX +RN [1] +RP 1-2149 +RA XXX; +RT ; +RL Submitted {(DD-MMM-YYYY)} to the INSDC. 
+XX +FH Key Location/Qualifiers +FH +FT source 1..588788 +FT /organism={"scientific organism name"} +FT /mol_type={"in vivo molecule type of sequence"} +XX +SQ Sequence 588788 BP; 101836 A; 193561 C; 192752 G; 100639 T; 0 other; + TTTTTTTTTT aagagacgcg cccagattat ataagggcgt cgtctcgagg ccgacggcgc 60 diff --git a/src/agat/agat_convert_embl2gff/test_data/agat_convert_embl2gff_1.gff b/src/agat/agat_convert_embl2gff/test_data/agat_convert_embl2gff_1.gff new file mode 100644 index 00000000..f6893022 --- /dev/null +++ b/src/agat/agat_convert_embl2gff/test_data/agat_convert_embl2gff_1.gff @@ -0,0 +1,10 @@ +##gff-version 3 +ERS324955|SC|contig000001 EMBL/GenBank/SwissProt source 1 588788 . + 1 mol_type={"in vivo molecule type of sequence"};organism={"scientific organism name"} +ERS344554 EMBL/GenBank/SwissProt source 1 588788 . + 1 mol_type={"in vivo molecule type of sequence"};organism={"scientific organism name"} +##FASTA +>ERS324955|SC|contig000001 XXX +TGCGTACTCGAAGAGACGCGCCCAGATTATATAAGGGCGTCGTCTCGAGGCCGACGGCGCGCCGGCGAGTACGCGTGATC +CACAACCCGAAGCGACCGTCGGGAGACCGAGGGTCGTCGAGGGTGGATACGTTCCTGCCTTCGTGCCGGGAAACGGCCGA +AGGGAACGTGGCGACCTGCG +>ERS344554 XXX +TTTTTTTTTTAAGAGACGCGCCCAGATTATATAAGGGCGTCGTCTCGAGGCCGACGGCGC diff --git a/src/agat/agat_convert_embl2gff/test_data/script.sh b/src/agat/agat_convert_embl2gff/test_data/script.sh new file mode 100755 index 00000000..7ddbce5b --- /dev/null +++ b/src/agat/agat_convert_embl2gff/test_data/script.sh @@ -0,0 +1,10 @@ +#!/bin/bash + +# clone repo +if [ ! 
-d /tmp/agat_source ]; then + git clone --depth 1 --single-branch --branch master https://github.com/NBISweden/AGAT /tmp/agat_source +fi + +# copy test data +cp -r /tmp/agat_source/t/scripts_output/in/agat_convert_embl2gff_1.embl src/agat/agat_convert_embl2gff/test_data/agat_convert_embl2gff_1.embl +cp -r /tmp/agat_source/t/scripts_output/out/agat_convert_embl2gff_1.gff src/agat/agat_convert_embl2gff/test_data/agat_convert_embl2gff_1.gff \ No newline at end of file From d5fc46b1d9ef8313c06e369bc881f6de75c53dd4 Mon Sep 17 00:00:00 2001 From: Dries Schaumont <5946712+DriesSchaumont@users.noreply.github.com> Date: Tue, 13 Aug 2024 10:14:59 +0200 Subject: [PATCH 5/6] Avoid duplicate code when unsetting multiple boolean arguments (#133) * Avoid duplicate code when unsetting multiple boolean arguments * Add CHANGELOG entry [ci skip] * Update CONTRIBUTING guide --- CHANGELOG.md | 2 + CONTRIBUTING.md | 25 ++++++ src/bedtools/bedtools_intersect/script.sh | 49 +++++++----- src/bedtools/bedtools_sort/script.sh | 21 +++-- src/busco/busco_run/script.sh | 28 ++++--- src/fastp/script.sh | 45 ++++++----- src/featurecounts/script.sh | 43 +++++----- src/gffread/script.sh | 97 ++++++++++++----------- src/lofreq/call/script.sh | 37 +++++---- src/multiqc/script.sh | 44 +++++----- src/salmon/salmon_index/script.sh | 19 +++-- src/salmon/salmon_quant/script.sh | 90 +++++++++++---------- src/samtools/samtools_fastq/script.sh | 17 ++-- src/samtools/samtools_sort/script.sh | 25 +++--- src/samtools/samtools_view/script.sh | 38 +++++---- src/umi_tools/umi_tools_dedup/script.sh | 33 +++++--- src/umi_tools/umi_tools_extract/script.sh | 19 +++-- 17 files changed, 380 insertions(+), 252 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 3c2f347a..5030894c 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -36,6 +36,8 @@ * Update CI to reusable workflow in `viash-io/viash-actions` (PR #86). 
+* Update several components in order to avoid duplicate code when using `unset` on boolean arguments (PR #133). + ## DOCUMENTATION * Extend the contributing guidelines (PR #82): diff --git a/CONTRIBUTING.md b/CONTRIBUTING.md index cee4249a..a32b680c 100644 --- a/CONTRIBUTING.md +++ b/CONTRIBUTING.md @@ -320,6 +320,31 @@ Notes: * If your tool allows for multiple inputs using a separator other than `;` (which is the default Viash multiple separator), you can substitute these values with a command like: `par_disable_filters=$(echo $par_disable_filters | tr ';' ',')`. +* If you have a lot of boolean variables that you would like to unset when the value is `false`, you can avoid duplicate code by using the following syntax: + +```bash +unset_if_false=( + par_argument_1 + par_argument_2 + par_argument_3 + par_argument_4 +) + +for par in ${unset_if_false[@]}; do + test_val="${!par}" + [[ "$test_val" == "false" ]] && unset $par +done +``` + +this code is equivalent to + +```bash +[[ "$par_argument_1" == "false" ]] && unset par_argument_1 +[[ "$par_argument_2" == "false" ]] && unset par_argument_2 +[[ "$par_argument_3" == "false" ]] && unset par_argument_3 +[[ "$par_argument_4" == "false" ]] && unset par_argument_4 +``` + ### Step 12: Create test script diff --git a/src/bedtools/bedtools_intersect/script.sh b/src/bedtools/bedtools_intersect/script.sh index 2141859d..04a8d854 100644 --- a/src/bedtools/bedtools_intersect/script.sh +++ b/src/bedtools/bedtools_intersect/script.sh @@ -3,27 +3,34 @@ ## VIASH START ## VIASH END -[[ "$par_write_a" == "false" ]] && unset par_write_a -[[ "$par_write_b" == "false" ]] && unset par_write_b -[[ "$par_left_outer_join" == "false" ]] && unset par_left_outer_join -[[ "$par_write_overlap" == "false" ]] && unset par_write_overlap -[[ "$par_write_overlap_plus" == "false" ]] && unset par_write_overlap_plus -[[ "$par_report_A_if_no_overlap" == "false" ]] && unset par_report_A_if_no_overlap -[[ "$par_number_of_overlaps_A" == "false" ]] && unset 
par_number_of_overlaps_A -[[ "$par_report_no_overlaps_A" == "false" ]] && unset par_report_no_overlaps_A -[[ "$par_uncompressed_bam" == "false" ]] && unset par_uncompressed_bam -[[ "$par_same_strand" == "false" ]] && unset par_same_strand -[[ "$par_opposite_strand" == "false" ]] && unset par_opposite_strand -[[ "$par_reciprocal_overlap" == "false" ]] && unset par_reciprocal_overlap -[[ "$par_either_overlap" == "false" ]] && unset par_either_overlap -[[ "$par_split" == "false" ]] && unset par_split -[[ "$par_nonamecheck" == "false" ]] && unset par_nonamecheck -[[ "$par_sorted" == "false" ]] && unset par_sorted -[[ "$par_filenames" == "false" ]] && unset par_filenames -[[ "$par_sortout" == "false" ]] && unset par_sortout -[[ "$par_bed" == "false" ]] && unset par_bed -[[ "$par_header" == "false" ]] && unset par_header -[[ "$par_no_buffer_output" == "false" ]] && unset par_no_buffer_output +unset_if_false=( + par_write_a + par_write_b + par_left_outer_join + par_write_overlap + par_write_overlap_plus + par_report_A_if_no_overlap + par_number_of_overlaps_A + par_report_no_overlaps_A + par_uncompressed_bam + par_same_strand + par_opposite_strand + par_reciprocal_overlap + par_either_overlap + par_split + par_nonamecheck + par_sorted + par_filenames + par_sortout + par_bed + par_no_buffer_output +) + +for par in ${unset_if_false[@]}; do + test_val="${!par}" + [[ "$test_val" == "false" ]] && unset $par +done + # Create input array IFS=";" read -ra input <<< $par_input_b diff --git a/src/bedtools/bedtools_sort/script.sh b/src/bedtools/bedtools_sort/script.sh index e7f712d7..0d0b9b54 100644 --- a/src/bedtools/bedtools_sort/script.sh +++ b/src/bedtools/bedtools_sort/script.sh @@ -4,13 +4,20 @@ ## VIASH END # Unset parameters -[[ "$par_sizeA" == "false" ]] && unset par_sizeA -[[ "$par_sizeD" == "false" ]] && unset par_sizeD -[[ "$par_chrThenSizeA" == "false" ]] && unset par_chrThenSizeA -[[ "$par_chrThenSizeD" == "false" ]] && unset par_chrThenSizeD -[[ "$par_chrThenScoreA" == 
"false" ]] && unset par_chrThenScoreA -[[ "$par_chrThenScoreD" == "false" ]] && unset par_chrThenScoreD -[[ "$par_header" == "false" ]] && unset par_header +unset_if_false=( + par_sizeA + par_sizeD + par_chrThenSizeA + par_chrThenSizeD + par_chrThenScoreA + par_chrThenScoreD + par_header +) + +for par in ${unset_if_false[@]}; do + test_val="${!par}" + [[ "$test_val" == "false" ]] && unset $par +done # Execute bedtools sort with the provided arguments bedtools sort \ diff --git a/src/busco/busco_run/script.sh b/src/busco/busco_run/script.sh index a0ef24de..673ccd0b 100644 --- a/src/busco/busco_run/script.sh +++ b/src/busco/busco_run/script.sh @@ -3,18 +3,24 @@ ## VIASH START ## VIASH END +unset_if_false=( + par_tar + par_force + par_quiet + par_restart + par_auto_lineage + par_auto_lineage_euk + par_auto_lineage_prok + par_augustus + par_long + par_scaffold_composition + par_miniprot +) -[[ "$par_tar" == "false" ]] && unset par_tar -[[ "$par_force" == "false" ]] && unset par_force -[[ "$par_quiet" == "false" ]] && unset par_quiet -[[ "$par_restart" == "false" ]] && unset par_restart -[[ "$par_auto_lineage" == "false" ]] && unset par_auto_lineage -[[ "$par_auto_lineage_euk" == "false" ]] && unset par_auto_lineage_euk -[[ "$par_auto_lineage_prok" == "false" ]] && unset par_auto_lineage_prok -[[ "$par_augustus" == "false" ]] && unset par_augustus -[[ "$par_long" == "false" ]] && unset par_long -[[ "$par_scaffold_composition" == "false" ]] && unset par_scaffold_composition -[[ "$par_miniprot" == "false" ]] && unset par_miniprot +for par in ${unset_if_false[@]}; do + test_val="${!par}" + [[ "$test_val" == "false" ]] && unset $par +done tmp_dir=$(mktemp -d -p "$meta_temp_dir" busco_XXXXXXXXX) prefix=$(openssl rand -hex 8) diff --git a/src/fastp/script.sh b/src/fastp/script.sh index 4bb37c87..557f7ac3 100644 --- a/src/fastp/script.sh +++ b/src/fastp/script.sh @@ -4,25 +4,32 @@ ## VIASH END # disable flags -[[ "$par_disable_adapter_trimming" == "false" ]] && unset 
par_disable_adapter_trimming -[[ "$par_detect_adapter_for_pe" == "false" ]] && unset par_detect_adapter_for_pe -[[ "$par_merge" == "false" ]] && unset par_merge -[[ "$par_include_unmerged" == "false" ]] && unset par_include_unmerged -[[ "$par_interleaved_in" == "false" ]] && unset par_interleaved_in -[[ "$par_fix_mgi_id" == "false" ]] && unset par_fix_mgi_id -[[ "$par_phred64" == "false" ]] && unset par_phred64 -[[ "$par_dont_overwrite" == "false" ]] && unset par_dont_overwrite -[[ "$par_verbose" == "false" ]] && unset par_verbose -[[ "$par_dedup" == "false" ]] && unset par_dedup -[[ "$par_dont_eval_duplication" == "false" ]] && unset par_dont_eval_duplication -[[ "$par_trim_poly_g" == "false" ]] && unset par_trim_poly_g -[[ "$par_disable_trim_poly_g" == "false" ]] && unset par_disable_trim_poly_g -[[ "$par_trim_poly_x" == "false" ]] && unset par_trim_poly_x -[[ "$par_disable_quality_filtering" == "false" ]] && unset par_disable_quality_filtering -[[ "$par_disable_length_filtering" == "false" ]] && unset par_disable_length_filtering -[[ "$par_low_complexity_filter" == "false" ]] && unset par_low_complexity_filter -[[ "$par_umi" == "false" ]] && unset par_umi -[[ "$par_overrepresentation_analysis" == "false" ]] && unset par_overrepresentation_analysis +unset_if_false=( + par_disable_adapter_trimming + par_detect_adapter_for_pe + par_merge + par_include_unmerged + par_interleaved_in + par_fix_mgi_id + par_phred64 + par_dont_overwrite + par_verbose + par_dedup + par_dont_eval_duplication + par_trim_poly_g + par_disable_trim_poly_g + par_trim_poly_x + par_disable_quality_filtering + par_disable_length_filtering + par_low_complexity_filter + par_umi + par_overrepresentation_analysis +) + +for par in ${unset_if_false[@]}; do + test_val="${!par}" + [[ "$test_val" == "false" ]] && unset $par +done # run command fastp \ diff --git a/src/featurecounts/script.sh b/src/featurecounts/script.sh index 2e54feb3..53f8c63f 100644 --- a/src/featurecounts/script.sh +++ 
b/src/featurecounts/script.sh @@ -19,24 +19,31 @@ par_feature_type=$(echo $par_feature_type | tr ',' ';') par_extra_attributes=$(echo $par_extra_attributes | tr ',' ';') # unset flag variables -[[ "$par_feature_level" == "false" ]] && unset par_feature_level -[[ "$par_overlapping" == "false" ]] && unset par_overlapping -[[ "$par_largest_overlap" == "false" ]] && unset par_largest_overlap -[[ "$par_multi_mapping" == "false" ]] && unset par_multi_mapping -[[ "$par_fraction" == "false" ]] && unset par_fraction -[[ "$par_split_only" == "false" ]] && unset par_split_only -[[ "$par_non_split_only" == "false" ]] && unset par_non_split_only -[[ "$par_primary" == "false" ]] && unset par_primary -[[ "$par_ignore_dup" == "false" ]] && unset par_ignore_dup -[[ "$par_paired" == "false" ]] && unset par_paired -[[ "$par_count_read_pairs" == "false" ]] && unset par_count_read_pairs -[[ "$par_both_aligned" == "false" ]] && unset par_both_aligned -[[ "$par_check_pe_dist" == "false" ]] && unset par_check_pe_dist -[[ "$par_same_strand" == "false" ]] && unset par_same_strand -[[ "$par_donotsort" == "false" ]] && unset par_donotsort -[[ "$par_by_read_group" == "false" ]] && unset par_by_read_group -[[ "$par_long_reads" == "false" ]] && unset par_long_reads -[[ "$par_verbose" == "false" ]] && unset par_verbose +unset_if_false=( + par_feature_level + par_overlapping + par_largest_overlap + par_multi_mapping + par_fraction + par_split_only + par_non_split_only + par_primary + par_ignore_dup + par_paired + par_count_read_pairs + par_both_aligned + par_check_pe_dist + par_same_strand + par_donotsort + par_by_read_group + par_long_reads + par_verbose +) + +for par in ${unset_if_false[@]}; do + test_val="${!par}" + [[ "$test_val" == "false" ]] && unset $par +done IFS=";" read -ra input <<< $par_input diff --git a/src/gffread/script.sh b/src/gffread/script.sh index cd4abf14..fab9e521 100644 --- a/src/gffread/script.sh +++ b/src/gffread/script.sh @@ -4,51 +4,58 @@ ## VIASH END # unset flags -[[ 
"$par_coding" == "false" ]] && unset par_coding -[[ "$par_strict_range" == "false" ]] && unset par_strict_range -[[ "$par_no_single_exon" == "false" ]] && unset par_no_single_exon -[[ "$par_no_exon_attrs" == "false" ]] && unset par_no_exon_attrs -[[ "$par_nc" == "false" ]] && unset par_nc -[[ "$par_ignore_locus" == "false" ]] && unset par_ignore_locus -[[ "$par_description" == "false" ]] && unset par_description -[[ "$par_sort_alpha" == "false" ]] && unset par_sort_alpha -[[ "$par_keep_genes" == "false" ]] && unset par_keep_genes -[[ "$par_keep_attrs" == "false" ]] && unset par_keep_attrs -[[ "$par_keep_exon_attrs" == "false" ]] && unset par_keep_exon_attrs -[[ "$par_keep_comments" == "false" ]] && unset par_keep_comments -[[ "$par_process_other" == "false" ]] && unset par_process_other -[[ "$par_rm_stop_codons" == "false" ]] && unset par_rm_stop_codons -[[ "$par_adj_cds_start" == "false" ]] && unset par_adj_cds_start -[[ "$par_opposite_strand" == "false" ]] && unset par_opposite_strand -[[ "$par_coding_status" == "false" ]] && unset par_coding_status -[[ "$par_add_hasCDS" == "false" ]] && unset par_add_hasCDS -[[ "$par_adj_stop" == "false" ]] && unset par_adj_stop -[[ "$par_rm_noncanon" == "false" ]] && unset par_rm_noncanon -[[ "$par_complete_cds" == "false" ]] && unset par_complete_cds -[[ "$par_no_pseudo" == "false" ]] && unset par_no_pseudo -[[ "$par_in_bed" == "false" ]] && unset par_in_bed -[[ "$par_in_tlf" == "false" ]] && unset par_in_tlf -[[ "$par_stream" == "false" ]] && unset par_stream -[[ "$par_merge" == "false" ]] && unset par_merge -[[ "$par_rm_redundant" == "false" ]] && unset par_rm_redundant -[[ "$par_no_boundary" == "false" ]] && unset par_no_boundary -[[ "$par_no_overlap" == "false" ]] && unset par_no_overlap -[[ "$par_force_exons" == "false" ]] && unset par_force_exons -[[ "$par_gene2exon" == "false" ]] && unset par_gene2exon -[[ "$par_t_adopt" == "false" ]] && unset par_t_adopt -[[ "$par_decode" == "false" ]] && unset par_decode -[[ 
"$par_merge_exons" == "false" ]] && unset par_merge_exons -[[ "$par_junctions" == "false" ]] && unset par_junctions -[[ "$par_w_nocds" == "false" ]] && unset par_w_nocds -[[ "$par_tr_cds" == "false" ]] && unset par_tr_cds -[[ "$par_w_coords" == "false" ]] && unset par_w_coords -[[ "$par_stop_dot" == "false" ]] && unset par_stop_dot -[[ "$par_id_version" == "false" ]] && unset par_id_version -[[ "$par_gtf_output" == "false" ]] && unset par_gtf_output -[[ "$par_bed" == "false" ]] && unset par_bed -[[ "$par_tlf" == "false" ]] && unset par_tlf -[[ "$par_expose_dups" == "false" ]] && unset par_expose_dups -[[ "$par_cluster_only" == "false" ]] && unset par_cluster_only +unset_if_false=( + par_coding + par_strict_range + par_no_single_exon + par_no_exon_attrs + par_nc + par_ignore_locus + par_description + par_sort_alpha + par_keep_genes + par_keep_attrs + par_keep_exon_attrs + par_keep_comments + par_process_other + par_rm_stop_codons + par_adj_cds_start + par_opposite_strand + par_coding_status + par_add_hasCDS + par_adj_stop + par_rm_noncanon + par_complete_cds + par_no_pseudo + par_in_bed + par_in_tlf + par_stream + par_merge + par_rm_redundant + par_no_boundary + par_no_overlap + par_force_exons + par_gene2exon + par_t_adopt + par_decode + par_merge_exons + par_junctions + par_w_nocds + par_tr_cds + par_w_coords + par_stop_dot + par_id_version + par_gtf_output + par_bed + par_tlf + par_expose_dups + par_cluster_only +) + +for par in ${unset_if_false[@]}; do + test_val="${!par}" + [[ "$test_val" == "false" ]] && unset $par +done # if par_table is not empty, replace ";" with "," par_table=$(echo "$par_table" | tr ';' ',') diff --git a/src/lofreq/call/script.sh b/src/lofreq/call/script.sh index 863fe986..ca229194 100644 --- a/src/lofreq/call/script.sh +++ b/src/lofreq/call/script.sh @@ -4,21 +4,28 @@ ## VIASH END # Unset all parameters that are set to "false" -[[ "$par_no_baq" == "false" ]] && unset par_no_baq -[[ "$par_no_idaq" == "false" ]] && unset par_no_idaq -[[ 
"$par_del_baq" == "false" ]] && unset par_del_baq -[[ "$par_no_ext_baq" == "false" ]] && unset par_no_ext_baq -[[ "$par_no_mq" == "false" ]] && unset par_no_mq -[[ "$par_call_indels" == "false" ]] && unset par_call_indels -[[ "$par_only_indels" == "false" ]] && unset par_only_indels -[[ "$par_src_qual" == "false" ]] && unset par_src_qual -[[ "$par_illumina_13" == "false" ]] && unset par_illumina_13 -[[ "$par_use_orphan" == "false" ]] && unset par_use_orphan -[[ "$par_plp_summary_only" == "false" ]] && unset par_plp_summary_only -[[ "$par_no_default_filter" == "false" ]] && unset par_no_default_filter -[[ "$par_force_overwrite" == "false" ]] && unset par_force_overwrite -[[ "$par_verbose" == "false" ]] && unset par_verbose -[[ "$par_debug" == "false" ]] && unset par_debug +unset_if_false=( + par_no_baq + par_no_idaq + par_del_baq + par_no_ext_baq + par_no_mq + par_call_indels + par_only_indels + par_src_qual + par_illumina_13 + par_use_orphan + par_plp_summary_only + par_no_default_filter + par_force_overwrite + par_verbose + par_debug +) + +for par in ${unset_if_false[@]}; do + test_val="${!par}" + [[ "$test_val" == "false" ]] && unset $par +done # Run lofreq call lofreq call \ diff --git a/src/multiqc/script.sh b/src/multiqc/script.sh index ad8c1c0c..5806fa1d 100755 --- a/src/multiqc/script.sh +++ b/src/multiqc/script.sh @@ -1,26 +1,32 @@ #!/bin/bash # disable flags -[[ "$par_ignore_symlinks" == "false" ]] && unset par_ignore_symlinks -[[ "$par_dirs" == "false" ]] && unset par_dirs -[[ "$par_full_names" == "false" ]] && unset par_full_names -[[ "$par_fn_as_s_name" == "false" ]] && unset par_fn_as_s_name -[[ "$par_profile_runtime" == "false" ]] && unset par_profile_runtime -[[ "$par_verbose" == "false" ]] && unset par_verbose -[[ "$par_quiet" == "false" ]] && unset par_quiet -[[ "$par_strict" == "false" ]] && unset par_strict -[[ "$par_development" == "false" ]] && unset par_development -[[ "$par_require_logs" == "false" ]] && unset par_require_logs -[[ 
"$par_no_megaqc_upload" == "false" ]] && unset par_no_megaqc_upload -[[ "$par_no_ansi" == "false" ]] && unset par_no_ansi -[[ "$par_flat" == "false" ]] && unset par_flat -[[ "$par_interactive" == "false" ]] && unset par_interactive -[[ "$par_static_plot_export" == "false" ]] && unset par_static_plot_export -[[ "$par_data_dir" == "false" ]] && unset par_data_dir -[[ "$par_no_data_dir" == "false" ]] && unset par_no_data_dir -[[ "$par_zip_data_dir" == "false" ]] && unset par_zip_data_dir -[[ "$par_pdf" == "false" ]] && unset par_pdf +unset_if_false=( + par_ignore_symlinks + par_dirs + par_full_names + par_fn_as_s_name + par_profile_runtime + par_verbose + par_quiet + par_strict + par_development + par_require_logs + par_no_megaqc_upload + par_no_ansi + par_flat + par_interactive + par_static_plot_export + par_data_dir + par_no_data_dir + par_zip_data_dir + par_pdf +) +for par in ${unset_if_false[@]}; do + test_val="${!par}" + [[ "$test_val" == "false" ]] && unset $par +done # handle inputs out_dir=$(dirname "$par_output_report") diff --git a/src/salmon/salmon_index/script.sh b/src/salmon/salmon_index/script.sh index c2b9e7a0..5b1c4d76 100644 --- a/src/salmon/salmon_index/script.sh +++ b/src/salmon/salmon_index/script.sh @@ -5,12 +5,19 @@ set -e ## VIASH START ## VIASH END -[[ "$par_gencode" == "false" ]] && unset par_gencode -[[ "$par_features" == "false" ]] && unset par_features -[[ "$par_keep_duplicates" == "false" ]] && unset par_keep_duplicates -[[ "$par_keep_fixed_fasta" == "false" ]] && unset par_keep_fixed_fasta -[[ "$par_sparse" == "false" ]] && unset par_sparse -[[ "$par_no_clip" == "false" ]] && unset par_no_clip +unset_if_false=( + par_gencode + par_features + par_keep_duplicates + par_keep_fixed_fasta + par_sparse + par_no_clip +) + +for par in ${unset_if_false[@]}; do + test_val="${!par}" + [[ "$test_val" == "false" ]] && unset $par +done tmp_dir=$(mktemp -d -p "$meta_temp_dir" "${meta_functionality_name}_XXXXXX") mkdir -p "$tmp_dir/temp" diff --git 
a/src/salmon/salmon_quant/script.sh b/src/salmon/salmon_quant/script.sh index 4c9f69d5..47cba1b9 100644 --- a/src/salmon/salmon_quant/script.sh +++ b/src/salmon/salmon_quant/script.sh @@ -4,49 +4,55 @@ set -e ## VIASH START ## VIASH END +unset_if_false=( + par_discard_orphans + par_ont + par_seq_bias + par_gc_bias + par_pos_bias + par_meta + par_discard_orphans_quasi + par_disable_chaining_heuristic + par_allow_dovetail + par_recover_orphans + par_mimicBT2 + par_mimic_strictBT2 + par_softclip + par_softclip_overhangs + par_full_length_alignment + par_hard_filter + par_write_mappings + par_write_qualities + par_alternative_init_mode + par_skip_quant + par_dump_eq + par_dump_eq_weights + par_reduce_GC_memory + par_init_uniform + par_no_length_correction + par_no_effective_length_correction + par_no_single_frag_prob + par_no_frag_length_dist + par_no_bias_length_threshold + par_useEM + par_useVBOpt + par_no_Gamma_draw + par_bootstrap_reproject + par_quiet + par_per_transcript_prior + par_per_nucleotide_prior + par_write_orphan_links + par_write_unmapped_names + par_no_error_model + par_sample_out + par_sample_unaligned + par_gencode +) -[[ "$par_discard_orphans" == "false" ]] && unset par_discard_orphans -[[ "$par_ont" == "false" ]] && unset par_ont -[[ "$par_seq_bias" == "false" ]] && unset par_seq_bias -[[ "$par_gc_bias" == "false" ]] && unset par_gc_bias -[[ "$par_pos_bias" == "false" ]] && unset par_pos_bias -[[ "$par_meta" == "false" ]] && unset par_meta -[[ "$par_discard_orphans_quasi" == "false" ]] && unset par_discard_orphans_quasi -[[ "$par_disable_chaining_heuristic" == "false" ]] && unset par_disable_chaining_heuristic -[[ "$par_allow_dovetail" == "false" ]] && unset par_allow_dovetail -[[ "$par_recover_orphans" == "false" ]] && unset par_recover_orphans -[[ "$par_mimicBT2" == "false" ]] && unset par_mimicBT2 -[[ "$par_mimic_strictBT2" == "false" ]] && unset par_mimic_strictBT2 -[[ "$par_softclip" == "false" ]] && unset par_softclip -[[ 
"$par_softclip_overhangs" == "false" ]] && unset par_softclip_overhangs -[[ "$par_full_length_alignment" == "false" ]] && unset par_full_length_alignment -[[ "$par_hard_filter" == "false" ]] && unset par_hard_filter -[[ "$par_write_mappings" == "false" ]] && unset par_write_mappings -[[ "$par_write_qualities" == "false" ]] && unset par_write_qualities -[[ "$par_alternative_init_mode" == "false" ]] && unset par_alternative_init_mode -[[ "$par_skip_quant" == "false" ]] && unset par_skip_quant -[[ "$par_dump_eq" == "false" ]] && unset par_dump_eq -[[ "$par_dump_eq_weights" == "false" ]] && unset par_dump_eq_weights -[[ "$par_reduce_GC_memory" == "false" ]] && unset par_reduce_GC_memory -[[ "$par_init_uniform" == "false" ]] && unset par_init_uniform -[[ "$par_no_length_correction" == "false" ]] && unset par_no_length_correction -[[ "$par_no_effective_length_correction" == "false" ]] && unset par_no_effective_length_correction -[[ "$par_no_single_frag_prob" == "false" ]] && unset par_no_single_frag_prob -[[ "$par_no_frag_length_dist" == "false" ]] && unset par_no_frag_length_dist -[[ "$par_no_bias_length_threshold" == "false" ]] && unset par_no_bias_length_threshold -[[ "$par_useEM" == "false" ]] && unset par_useEM -[[ "$par_useVBOpt" == "false" ]] && unset par_useVBOpt -[[ "$par_no_Gamma_draw" == "false" ]] && unset par_no_Gamma_draw -[[ "$par_bootstrap_reproject" == "false" ]] && unset par_bootstrap_reproject -[[ "$par_quiet" == "false" ]] && unset par_quiet -[[ "$par_per_transcript_prior" == "false" ]] && unset par_per_transcript_prior -[[ "$par_per_nucleotide_prior" == "false" ]] && unset par_per_nucleotide_prior -[[ "$par_write_orphan_links" == "false" ]] && unset par_write_orphan_links -[[ "$par_write_unmapped_names" == "false" ]] && unset par_write_unmapped_names -[[ "$par_no_error_model" == "false" ]] && unset par_no_error_model -[[ "$par_sample_out" == "false" ]] && unset par_sample_out -[[ "$par_sample_unaligned" == "false" ]] && unset par_sample_unaligned -[[ 
"$par_gencode" == "false" ]] && unset par_gencode +for par in ${unset_if_false[@]}; do + test_val="${!par}" + [[ "$test_val" == "false" ]] && unset $par +done IFS=";" read -ra unmated_reads <<< $par_unmated_reads IFS=";" read -ra mates1 <<< $par_mates1 diff --git a/src/samtools/samtools_fastq/script.sh b/src/samtools/samtools_fastq/script.sh index 0cad9cfe..e05da9b0 100644 --- a/src/samtools/samtools_fastq/script.sh +++ b/src/samtools/samtools_fastq/script.sh @@ -5,11 +5,18 @@ set -e -[[ "$par_no_suffix" == "false" ]] && unset par_no_suffix -[[ "$par_suffix" == "false" ]] && unset par_suffix -[[ "$par_use_oq" == "false" ]] && unset par_use_oq -[[ "$par_copy_tags" == "false" ]] && unset par_copy_tags -[[ "$par_casava" == "false" ]] && unset par_casava +unset_if_false=( + par_no_suffix + par_suffix + par_use_oq + par_copy_tags + par_casava +) + +for par in ${unset_if_false[@]}; do + test_val="${!par}" + [[ "$test_val" == "false" ]] && unset $par +done if [[ "$meta_name" == "samtools_fasta" ]]; then subcommand=fasta diff --git a/src/samtools/samtools_sort/script.sh b/src/samtools/samtools_sort/script.sh index 94836c18..a8b3ce0f 100644 --- a/src/samtools/samtools_sort/script.sh +++ b/src/samtools/samtools_sort/script.sh @@ -5,15 +5,22 @@ set -e -[[ "$par_uncompressed" == "false" ]] && unset par_uncompressed -[[ "$par_minimiser" == "false" ]] && unset par_minimiser -[[ "$par_not_reverse" == "false" ]] && unset par_not_reverse -[[ "$par_homopolymers" == "false" ]] && unset par_homopolymers -[[ "$par_natural_sort" == "false" ]] && unset par_natural_sort -[[ "$par_ascii_sort" == "false" ]] && unset par_ascii_sort -[[ "$par_template_coordinate" == "false" ]] && unset par_template_coordinate -[[ "$par_write_index" == "false" ]] && unset par_write_index -[[ "$par_no_PG" == "false" ]] && unset par_no_PG +unset_if_false=( + par_uncompressed + par_minimiser + par_not_reverse + par_homopolymers + par_natural_sort + par_ascii_sort + par_template_coordinate + par_write_index + 
par_no_PG +) + +for par in ${unset_if_false[@]}; do + test_val="${!par}" + [[ "$test_val" == "false" ]] && unset $par +done samtools sort \ diff --git a/src/samtools/samtools_view/script.sh b/src/samtools/samtools_view/script.sh index c3911b48..7608844b 100644 --- a/src/samtools/samtools_view/script.sh +++ b/src/samtools/samtools_view/script.sh @@ -5,21 +5,29 @@ set -e -[[ "$par_bam" == "false" ]] && unset par_bam -[[ "$par_cram" == "false" ]] && unset par_cram -[[ "$par_fast" == "false" ]] && unset par_fast -[[ "$par_uncompressed" == "false" ]] && unset par_uncompressed -[[ "$par_with_header" == "false" ]] && unset par_with_header -[[ "$par_header_only" == "false" ]] && unset par_header_only -[[ "$par_no_header" == "false" ]] && unset par_no_header -[[ "$par_count" == "false" ]] && unset par_count -[[ "$par_unmap" == "false" ]] && unset par_unmap -[[ "$par_use_index" == "false" ]] && unset par_use_index -[[ "$par_fetch_pairs" == "false" ]] && unset par_fetch_pairs -[[ "$par_customized_index" == "false" ]] && unset par_customized_index -[[ "$par_no_PG" == "false" ]] && unset par_no_PG -[[ "$par_write_index" == "false" ]] && unset par_write_index -[[ "$par_remove_B" == "false" ]] && unset par_remove_B +unset_if_false=( + par_bam + par_cram + par_fast + par_uncompressed + par_with_header + par_header_only + par_no_header + par_count + par_unmap + par_use_index + par_fetch_pairs + par_customized_index + par_no_PG + par_write_index + par_remove_B +) + +for par in ${unset_if_false[@]}; do + test_val="${!par}" + [[ "$test_val" == "false" ]] && unset $par +done + samtools view \ ${par_bam:+-b} \ diff --git a/src/umi_tools/umi_tools_dedup/script.sh b/src/umi_tools/umi_tools_dedup/script.sh index d57a5e76..3f3bdc89 100644 --- a/src/umi_tools/umi_tools_dedup/script.sh +++ b/src/umi_tools/umi_tools_dedup/script.sh @@ -7,19 +7,26 @@ set -e test_dir="${metal_executable}/test_data" -[[ "$par_paired" == "false" ]] && unset par_paired -[[ "$par_in_sam" == "false" ]] && unset 
par_in_sam -[[ "$par_out_sam" == "false" ]] && unset par_out_sam -[[ "$par_spliced_is_unique" == "false" ]] && unset par_spliced_is_unique -[[ "$par_per_gene" == "false" ]] && unset par_per_gene -[[ "$par_per_contig" == "false" ]] && unset par_per_contig -[[ "$par_per_cell" == "false" ]] && unset par_per_cell -[[ "$par_no_sort_output" == "false" ]] && unset par_no_sort_output -[[ "$par_buffer_whole_contig" == "false" ]] && unset par_buffer_whole_contig -[[ "$par_ignore_umi" == "false" ]] && unset par_ignore_umi -[[ "$par_subset" == "false" ]] && unset par_subset -[[ "$par_log2stderr" == "false" ]] && unset par_log2stderr -[[ "$par_read_length" == "false" ]] && unset par_read_length +unset_if_false=( + par_paired + par_in_sam + par_out_sam + par_spliced_is_unique + par_per_gene + par_per_contig + par_per_cell + par_no_sort_output + par_buffer_whole_contig + par_ignore_umi + par_subset + par_log2stderr + par_read_length +) + +for par in ${unset_if_false[@]}; do + test_val="${!par}" + [[ "$test_val" == "false" ]] && unset $par +done umi_tools dedup \ --stdin "$par_input" \ diff --git a/src/umi_tools/umi_tools_extract/script.sh b/src/umi_tools/umi_tools_extract/script.sh index 5e41865d..4514860e 100644 --- a/src/umi_tools/umi_tools_extract/script.sh +++ b/src/umi_tools/umi_tools_extract/script.sh @@ -5,14 +5,19 @@ set -exo pipefail -test_dir="${metal_executable}/test_data" +unset_if_false=( + par_error_correct_cell + par_reconcile_pairs + par_three_prime + par_ignore_read_pair_suffixes + par_timeit_header + par_log2stderr +) -[[ "$par_error_correct_cell" == "false" ]] && unset par_error_correct_cell -[[ "$par_reconcile_pairs" == "false" ]] && unset par_reconcile_pairs -[[ "$par_three_prime" == "false" ]] && unset par_three_prime -[[ "$par_ignore_read_pair_suffixes" == "false" ]] && unset par_ignore_read_pair_suffixes -[[ "$par_timeit_header" == "false" ]] && unset par_timeit_header -[[ "$par_log2stderr" == "false" ]] && unset par_log2stderr +for par in 
${unset_if_false[@]}; do + test_val="${!par}" + [[ "$test_val" == "false" ]] && unset $par +done # Check if we have the correct number of input files and patterns for paired-end or single-end reads From 9fc07f6c05879f8efff441767ec489bb24fdce7d Mon Sep 17 00:00:00 2001 From: Dries Schaumont <5946712+DriesSchaumont@users.noreply.github.com> Date: Tue, 13 Aug 2024 17:19:06 +0200 Subject: [PATCH 6/6] Bump Viash to 0.9.0-RC7 (#134) * Bump viash to 0.9.0-RC7 * Update CHANGELOG --- CHANGELOG.md | 2 ++ _viash.yaml | 2 +- 2 files changed, 3 insertions(+), 1 deletion(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 5030894c..d51fcf12 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -38,6 +38,8 @@ * Update several components in order to avoid duplicate code when using `unset` on boolean arguments (PR #133). +* Bump viash to `0.9.0-RC7` (PR #134) + ## DOCUMENTATION * Extend the contributing guidelines (PR #82): diff --git a/_viash.yaml b/_viash.yaml index 9a240c24..ab4f3828 100644 --- a/_viash.yaml +++ b/_viash.yaml @@ -7,7 +7,7 @@ links: issue_tracker: https://github.com/viash-hub/biobox/issues repository: https://github.com/viash-hub/biobox -viash_version: 0.9.0-RC6 +viash_version: 0.9.0-RC7 config_mods: | .requirements.commands := ['ps']