From 0ac91d7dca2b157f2996729a2e79b8f92edc3ae1 Mon Sep 17 00:00:00 2001 From: tgaspe Date: Tue, 6 Aug 2024 10:48:02 -0300 Subject: [PATCH 01/16] Initial Commit --- src/bedtools/bedtools_groupby/config.vsh.yaml | 67 +++++++++++++ src/bedtools/bedtools_groupby/help.txt | 93 +++++++++++++++++++ src/bedtools/bedtools_groupby/script.sh | 10 ++ src/bedtools/bedtools_groupby/test.sh | 57 ++++++++++++ 4 files changed, 227 insertions(+) create mode 100644 src/bedtools/bedtools_groupby/config.vsh.yaml create mode 100644 src/bedtools/bedtools_groupby/help.txt create mode 100644 src/bedtools/bedtools_groupby/script.sh create mode 100644 src/bedtools/bedtools_groupby/test.sh diff --git a/src/bedtools/bedtools_groupby/config.vsh.yaml b/src/bedtools/bedtools_groupby/config.vsh.yaml new file mode 100644 index 00000000..4111b48b --- /dev/null +++ b/src/bedtools/bedtools_groupby/config.vsh.yaml @@ -0,0 +1,67 @@ +name: bedtools_groupby +namespace: bedtools +description: | + +keywords: [feature intersection, BAM, BED, GFF, VCF] +links: + documentation: https://bedtools.readthedocs.io/en/latest/content/tools/groupby.html + repository: https://github.com/arq5x/bedtools2 +references: + doi: 10.1093/bioinformatics/btq033 +license: GPL-2.0, MIT +requirements: + commands: [bedtools] +authors: + - __merge__: /src/_authors/theodoro_gasperin.yaml + roles: [ author, maintainer ] + +argument_groups: + - name: Inputs + arguments: + - name: --input + alternatives: -i + type: file + direction: input + description: | + The input file (BED/GFF/VCF/BAM) to be used as the -a file. + required: true + example: input_a.bed + + - name: Outputs + arguments: + - name: --output + type: file + direction: output + description: | + The output BED file. 
+ required: true + example: output.bed + + - name: Options + arguments: + - name: + alternatives: + type: boolean_true + description: + +resources: + - type: bash_script + path: script.sh + +test_resources: + - type: bash_script + path: test.sh + +engines: + - type: docker + image: debian:stable-slim + setup: + - type: apt + packages: [bedtools, procps] + - type: docker + run: | + echo "bedtools: \"$(bedtools --version | sed -n 's/^bedtools //p')\"" > /var/software_versions.txt + +runners: + - type: executable + - type: nextflow diff --git a/src/bedtools/bedtools_groupby/help.txt b/src/bedtools/bedtools_groupby/help.txt new file mode 100644 index 00000000..a631b4b1 --- /dev/null +++ b/src/bedtools/bedtools_groupby/help.txt @@ -0,0 +1,93 @@ +```bash +bedtools groupby +``` + +Tool: bedtools groupby +Version: v2.30.0 +Summary: Summarizes a dataset column based upon + common column groupings. Akin to the SQL "group by" command. + +Usage: bedtools groupby -g [group_column(s)] -c [op_column(s)] -o [ops] + cat [FILE] | bedtools groupby -g [group_column(s)] -c [op_column(s)] -o [ops] + +Options: + -i Input file. Assumes "stdin" if omitted. + + -g -grp Specify the columns (1-based) for the grouping. + The columns must be comma separated. + - Default: 1,2,3 + + -c -opCols Specify the column (1-based) that should be summarized. + - Required. + + -o -ops Specify the operation that should be applied to opCol. + Valid operations: + sum, count, count_distinct, min, max, + mean, median, mode, antimode, + stdev, sstdev (sample standard dev.), + collapse (i.e., print a comma separated list (duplicates allowed)), + distinct (i.e., print a comma separated list (NO duplicates allowed)), + distinct_sort_num (as distinct, but sorted numerically, ascending), + distinct_sort_num_desc (as distinct, but sorted numerically, descending), + concat (i.e., merge values into a single, non-delimited string), + freqdesc (i.e., print desc. list of values:freq) + freqasc (i.e., print asc. 
list of values:freq) + first (i.e., print first value) + last (i.e., print last value) + - Default: sum + + If there is only column, but multiple operations, all operations will be + applied on that column. Likewise, if there is only one operation, but + multiple columns, that operation will be applied to all columns. + Otherwise, the number of columns must match the the number of operations, + and will be applied in respective order. + E.g., "-c 5,4,6 -o sum,mean,count" will give the sum of column 5, + the mean of column 4, and the count of column 6. + The order of output columns will match the ordering given in the command. + + + -full Print all columns from input file. The first line in the group is used. + Default: print only grouped columns. + + -inheader Input file has a header line - the first line will be ignored. + + -outheader Print header line in the output, detailing the column names. + If the input file has headers (-inheader), the output file + will use the input's column names. + If the input file has no headers, the output file + will use "col_1", "col_2", etc. as the column names. + + -header same as '-inheader -outheader' + + -ignorecase Group values regardless of upper/lower case. + + -prec Sets the decimal precision for output (Default: 5) + + -delim Specify a custom delimiter for the collapse operations. + - Example: -delim "|" + - Default: ",". + +Examples: + $ cat ex1.out + chr1 10 20 A chr1 15 25 B.1 1000 ATAT + chr1 10 20 A chr1 25 35 B.2 10000 CGCG + + $ groupBy -i ex1.out -g 1,2,3,4 -c 9 -o sum + chr1 10 20 A 11000 + + $ groupBy -i ex1.out -grp 1,2,3,4 -opCols 9,9 -ops sum,max + chr1 10 20 A 11000 10000 + + $ groupBy -i ex1.out -g 1,2,3,4 -c 8,9 -o collapse,mean + chr1 10 20 A B.1,B.2, 5500 + + $ cat ex1.out | groupBy -g 1,2,3,4 -c 8,9 -o collapse,mean + chr1 10 20 A B.1,B.2, 5500 + + $ cat ex1.out | groupBy -g 1,2,3,4 -c 10 -o concat + chr1 10 20 A ATATCGCG + +Notes: + (1) The input file/stream should be sorted/grouped by the -grp. 
columns + (2) If -i is unspecified, input is assumed to come from stdin. + diff --git a/src/bedtools/bedtools_groupby/script.sh b/src/bedtools/bedtools_groupby/script.sh new file mode 100644 index 00000000..28c1aa9e --- /dev/null +++ b/src/bedtools/bedtools_groupby/script.sh @@ -0,0 +1,10 @@ +#!/bin/bash + +## VIASH START +## VIASH END + + +bedtools groupby \ + -i "$par_input" \ + > "$par_output" + \ No newline at end of file diff --git a/src/bedtools/bedtools_groupby/test.sh b/src/bedtools/bedtools_groupby/test.sh new file mode 100644 index 00000000..6d92959a --- /dev/null +++ b/src/bedtools/bedtools_groupby/test.sh @@ -0,0 +1,57 @@ +#!/bin/bash + +# exit on error +set -e + +## VIASH START +meta_executable="target/executable/bedtools/bedtools_groupby/bedtools_groupby" +meta_resources_dir="src/bedtools/bedtools_groupby" +## VIASH END + +############################################# +# helper functions +assert_file_exists() { + [ -f "$1" ] || { echo "File '$1' does not exist" && exit 1; } +} +assert_file_not_empty() { + [ -s "$1" ] || { echo "File '$1' is empty but shouldn't be" && exit 1; } +} +assert_file_contains() { + grep -q "$2" "$1" || { echo "File '$1' does not contain '$2'" && exit 1; } +} +assert_identical_content() { + diff -a "$2" "$1" \ + || (echo "Files are not identical!" && exit 1) +} +############################################# + +# Create directories for tests +echo "Creating Test Data..." 
+mkdir -p test_data + +# Create and populate featuresA.bed + +# Create and populate expected output files for different tests + +# Test 1: Default intersect +mkdir test1 +cd test1 + +# echo "> Run bedtools_intersect on BED files with default intersect" +# "$meta_executable" \ +# --input_a "../test_data/featuresA.bed" \ +# --input_b "../test_data/featuresB.bed" \ +# --output "output.bed" + +# checks +assert_file_exists "output.bed" +assert_file_not_empty "output.bed" +assert_identical_content "output.bed" "../test_data/expected_default.bed" +echo "- test1 succeeded -" + +cd .. + + + +echo "---- All tests succeeded! ----" +exit 0 From 38c816e5c2c18795e319f4240d7f3911b244aa14 Mon Sep 17 00:00:00 2001 From: tgaspe Date: Tue, 6 Aug 2024 10:55:43 -0300 Subject: [PATCH 02/16] Update config.vsh.yaml --- src/bedtools/bedtools_groupby/config.vsh.yaml | 23 ++++++++++++++----- 1 file changed, 17 insertions(+), 6 deletions(-) diff --git a/src/bedtools/bedtools_groupby/config.vsh.yaml b/src/bedtools/bedtools_groupby/config.vsh.yaml index 4111b48b..c7c98af8 100644 --- a/src/bedtools/bedtools_groupby/config.vsh.yaml +++ b/src/bedtools/bedtools_groupby/config.vsh.yaml @@ -23,7 +23,7 @@ argument_groups: type: file direction: input description: | - The input file (BED/GFF/VCF/BAM) to be used as the -a file. + The input BED file to be used. required: true example: input_a.bed @@ -33,16 +33,27 @@ argument_groups: type: file direction: output description: | - The output BED file. + The output groupby BED file. required: true example: output.bed - name: Options arguments: - - name: - alternatives: - type: boolean_true - description: + - name: --groupby + alternatives: -g + type: string + description: | + Specify the columns (1-based) for the grouping. + The columns must be comma separated. + - Default: 1,2,3 + + # - name: --groupby + # alternatives: -g + # type: string + # description: | + # Specify the columns (1-based) for the grouping. + # The columns must be comma separated. 
+ # - Default: 1,2,3 resources: - type: bash_script From 6451570efcff20d267fd5dbef631743cc048e8fc Mon Sep 17 00:00:00 2001 From: tgaspe Date: Tue, 6 Aug 2024 11:49:36 -0300 Subject: [PATCH 03/16] config file --- src/bedtools/bedtools_groupby/config.vsh.yaml | 97 +++++++++++++++++-- 1 file changed, 87 insertions(+), 10 deletions(-) diff --git a/src/bedtools/bedtools_groupby/config.vsh.yaml b/src/bedtools/bedtools_groupby/config.vsh.yaml index c7c98af8..c380e8ce 100644 --- a/src/bedtools/bedtools_groupby/config.vsh.yaml +++ b/src/bedtools/bedtools_groupby/config.vsh.yaml @@ -1,8 +1,8 @@ name: bedtools_groupby namespace: bedtools description: | - -keywords: [feature intersection, BAM, BED, GFF, VCF] + Summarizes a dataset column based upon common column groupings. Akin to the SQL "group by" command. +keywords: [groupby, BED] links: documentation: https://bedtools.readthedocs.io/en/latest/content/tools/groupby.html repository: https://github.com/arq5x/bedtools2 @@ -40,20 +40,97 @@ argument_groups: - name: Options arguments: - name: --groupby - alternatives: -g + alternatives: [-g, -grp] type: string description: | Specify the columns (1-based) for the grouping. The columns must be comma separated. - Default: 1,2,3 - # - name: --groupby - # alternatives: -g - # type: string - # description: | - # Specify the columns (1-based) for the grouping. - # The columns must be comma separated. - # - Default: 1,2,3 + - name: --column + alternatives: [-c, -opCols] + type: string + description: | + Specify the column (1-based) that should be summarized. + - Required. + + - name: --operation + alternatives: [-0, -ops] + type: string + description: | + Specify the operation that should be applied to opCol. 
+ Valid operations: + sum, count, count_distinct, min, max, + mean, median, mode, antimode, + stdev, sstdev (sample standard dev.), + collapse (i.e., print a comma separated list (duplicates allowed)), + distinct (i.e., print a comma separated list (NO duplicates allowed)), + distinct_sort_num (as distinct, but sorted numerically, ascending), + distinct_sort_num_desc (as distinct, but sorted numerically, descending), + concat (i.e., merge values into a single, non-delimited string), + freqdesc (i.e., print desc. list of values:freq) + freqasc (i.e., print asc. list of values:freq) + first (i.e., print first value) + last (i.e., print last value) + - Default: sum + + If there is only one column, but multiple operations, all operations will be + applied on that column. Likewise, if there is only one operation, but + multiple columns, that operation will be applied to all columns. + Otherwise, the number of columns must match the number of operations, + and will be applied in respective order. + E.g., "-c 5,4,6 -o sum,mean,count" will give the sum of column 5, + the mean of column 4, and the count of column 6. + The order of output columns will match the ordering given in the command. + + - name: --full + type: boolean_true + description: | + Print all columns from input file. The first line in the group is used. + Default: print only grouped columns. + + - name: --inheader + type: boolean_true + description: | + Input file has a header line - the first line will be ignored. + + - name: --outheader + type: boolean_true + description: | + Print header line in the output, detailing the column names. + If the input file has headers (-inheader), the output file + will use the input's column names. + If the input file has no headers, the output file + will use "col_1", "col_2", etc. as the column names. + + - name: --header + type: boolean_true + description: same as '-inheader -outheader'. 
+ + - name: --ignorecase + type: boolean_true + description: | + Group values regardless of upper/lower case. + + - name: --ignorecase + type: boolean_true + description: | + Group values regardless of upper/lower case. + + - name: --precision + alternatives: -prec + type: integer + description: | + Sets the decimal precision for output. + default: 5 + + - name: --delimiter + alternatives: -delim + type: string + description: | + Specify a custom delimiter for the collapse operations. + example: "|" + default: "," resources: - type: bash_script From 3f8a45f67bbbf97184b237f462838226967fae3d Mon Sep 17 00:00:00 2001 From: tgaspe Date: Tue, 6 Aug 2024 12:00:29 -0300 Subject: [PATCH 04/16] script.sh --- src/bedtools/bedtools_groupby/config.vsh.yaml | 15 ++++++--------- src/bedtools/bedtools_groupby/script.sh | 16 ++++++++++++++++ 2 files changed, 22 insertions(+), 9 deletions(-) diff --git a/src/bedtools/bedtools_groupby/config.vsh.yaml b/src/bedtools/bedtools_groupby/config.vsh.yaml index c380e8ce..5c00972a 100644 --- a/src/bedtools/bedtools_groupby/config.vsh.yaml +++ b/src/bedtools/bedtools_groupby/config.vsh.yaml @@ -45,17 +45,18 @@ argument_groups: description: | Specify the columns (1-based) for the grouping. The columns must be comma separated. - - Default: 1,2,3 + - Default: 1,2,3 + required: true - name: --column alternatives: [-c, -opCols] type: string description: | Specify the column (1-based) that should be summarized. - - Required. + required: true - name: --operation - alternatives: [-0, -ops] + alternatives: [-o, -ops] type: string description: | Specify the operation that should be applied to opCol. @@ -82,6 +83,7 @@ argument_groups: E.g., "-c 5,4,6 -o sum,mean,count" will give the sum of column 5, the mean of column 4, and the count of column 6. The order of output columns will match the ordering given in the command. 
+ required: true - name: --full type: boolean_true @@ -111,12 +113,7 @@ argument_groups: type: boolean_true description: | Group values regardless of upper/lower case. - - - name: --ignorecase - type: boolean_true - description: | - Group values regardless of upper/lower case. - + - name: --precision alternatives: -prec type: integer diff --git a/src/bedtools/bedtools_groupby/script.sh b/src/bedtools/bedtools_groupby/script.sh index 28c1aa9e..7ca8982c 100644 --- a/src/bedtools/bedtools_groupby/script.sh +++ b/src/bedtools/bedtools_groupby/script.sh @@ -3,8 +3,24 @@ ## VIASH START ## VIASH END +# Unset parameters +[[ "$par_full" == "false" ]] && unset par_full +[[ "$par_inheader" == "false" ]] && unset par_inheader +[[ "$par_outheader" == "false" ]] && unset par_outheader +[[ "$par_header" == "false" ]] && unset par_header +[[ "$par_ignorecase" == "false" ]] && unset par_ignorecase bedtools groupby \ + ${par_full:+-full} \ + ${par_inheader:+-inheader} \ + ${par_outheader:+-outheader} \ + ${par_header:+-header} \ + ${par_ignorecase:+-ignorecase} \ + ${par_precision:+-prec "$par_precision"} \ + ${par_delimiter:+-delim "$par_delimiter"} \ -i "$par_input" \ + -g "$par_groupby" \ + -c "$par_column" \ + -o "$par_operation" \ > "$par_output" \ No newline at end of file From cdeb8e63e9f1d79f748b77d9f80596852e75212b Mon Sep 17 00:00:00 2001 From: tgaspe Date: Tue, 6 Aug 2024 15:08:04 -0300 Subject: [PATCH 05/16] adding some tests --- src/bedtools/bedtools_groupby/config.vsh.yaml | 1 - src/bedtools/bedtools_groupby/script.sh | 2 +- src/bedtools/bedtools_groupby/test.sh | 69 ++++++++++++++++--- .../bedtools_groupby/test_data/example.bed | 15 ++++ .../bedtools_groupby/test_data/expected.bed | 3 + 5 files changed, 80 insertions(+), 10 deletions(-) create mode 100644 src/bedtools/bedtools_groupby/test_data/example.bed create mode 100644 src/bedtools/bedtools_groupby/test_data/expected.bed diff --git a/src/bedtools/bedtools_groupby/config.vsh.yaml 
b/src/bedtools/bedtools_groupby/config.vsh.yaml index 5c00972a..c4f1a8f9 100644 --- a/src/bedtools/bedtools_groupby/config.vsh.yaml +++ b/src/bedtools/bedtools_groupby/config.vsh.yaml @@ -83,7 +83,6 @@ argument_groups: E.g., "-c 5,4,6 -o sum,mean,count" will give the sum of column 5, the mean of column 4, and the count of column 6. The order of output columns will match the ordering given in the command. - required: true - name: --full type: boolean_true diff --git a/src/bedtools/bedtools_groupby/script.sh b/src/bedtools/bedtools_groupby/script.sh index 7ca8982c..ae8547af 100644 --- a/src/bedtools/bedtools_groupby/script.sh +++ b/src/bedtools/bedtools_groupby/script.sh @@ -21,6 +21,6 @@ bedtools groupby \ -i "$par_input" \ -g "$par_groupby" \ -c "$par_column" \ - -o "$par_operation" \ + ${par_operation:+-o "$par_operation"} \ > "$par_output" \ No newline at end of file diff --git a/src/bedtools/bedtools_groupby/test.sh b/src/bedtools/bedtools_groupby/test.sh index 6d92959a..c78da8a7 100644 --- a/src/bedtools/bedtools_groupby/test.sh +++ b/src/bedtools/bedtools_groupby/test.sh @@ -29,28 +29,81 @@ assert_identical_content() { echo "Creating Test Data..." 
mkdir -p test_data -# Create and populate featuresA.bed +# Create and populate example.bed +cat << EOF > test_data/example.bed +# Header +chr21 9719758 9729320 variant1 chr21 9719768 9721892 ALR/Alpha 1004 + +chr21 9719758 9729320 variant1 chr21 9721905 9725582 ALR/Alpha 1010 + +chr21 9719758 9729320 variant1 chr21 9725582 9725977 L1PA3 3288 + +chr21 9719758 9729320 variant1 chr21 9726021 9729309 ALR/Alpha 1051 + +chr21 9729310 9757478 variant2 chr21 9729320 9729809 L1PA3 3897 - +chr21 9729310 9757478 variant2 chr21 9729809 9730866 L1P1 8367 + +chr21 9729310 9757478 variant2 chr21 9730866 9734026 ALR/Alpha 1036 - +chr21 9729310 9757478 variant2 chr21 9734037 9757471 ALR/Alpha 1182 - +chr21 9795588 9796685 variant3 chr21 9795589 9795713 (GAATG)n 308 + +chr21 9795588 9796685 variant3 chr21 9795736 9795894 (GAATG)n 683 + +chr21 9795588 9796685 variant3 chr21 9795911 9796007 (GAATG)n 345 + +chr21 9795588 9796685 variant3 chr21 9796028 9796187 (GAATG)n 756 + +chr21 9795588 9796685 variant3 chr21 9796202 9796615 (GAATG)n 891 + +chr21 9795588 9796685 variant3 chr21 9796637 9796824 (GAATG)n 621 + +EOF # Create and populate expected output files for different tests +cat << EOF > test_data/expected.bed +chr21 9719758 9729320 6353 +chr21 9729310 9757478 14482 +chr21 9795588 9796685 3604 +EOF +cat << EOF > test_data/expected_max.bed +chr21 9719758 9729320 variant1 3288 +chr21 9729310 9757478 variant2 8367 +chr21 9795588 9796685 variant3 891 +EOF -# Test 1: Default intersect +# Test 1: without operation option, default operation is sum mkdir test1 cd test1 -# echo "> Run bedtools_intersect on BED files with default intersect" -# "$meta_executable" \ -# --input_a "../test_data/featuresA.bed" \ -# --input_b "../test_data/featuresB.bed" \ -# --output "output.bed" +echo "> Run bedtools groupby on BED file" +"$meta_executable" \ + --input "../test_data/example.bed" \ + --groupby "1,2,3" \ + --column "9" \ + --output "output.bed" # checks assert_file_exists "output.bed" 
assert_file_not_empty "output.bed" -assert_identical_content "output.bed" "../test_data/expected_default.bed" +assert_identical_content "output.bed" "../test_data/expected.bed" echo "- test1 succeeded -" cd .. +# Test 2: with operation option +mkdir test2 +cd test2 + +echo "> Run bedtools groupby on BED file with max operation" +"$meta_executable" \ + --input "../test_data/example.bed" \ + --groupby "1-4" \ + --column "9" \ + --operation "max" \ + --output "output.bed" + +# checks +assert_file_exists "output.bed" +assert_file_not_empty "output.bed" +assert_identical_content "output.bed" "../test_data/expected_max.bed" +echo "- test2 succeeded -" + +cd .. + +# Test 3: + + + + echo "---- All tests succeeded! ----" diff --git a/src/bedtools/bedtools_groupby/test_data/example.bed b/src/bedtools/bedtools_groupby/test_data/example.bed new file mode 100644 index 00000000..d86e15dd --- /dev/null +++ b/src/bedtools/bedtools_groupby/test_data/example.bed @@ -0,0 +1,15 @@ +# Header +chr21 9719758 9729320 variant1 chr21 9719768 9721892 ALR/Alpha 1004 + +chr21 9719758 9729320 variant1 chr21 9721905 9725582 ALR/Alpha 1010 + +chr21 9719758 9729320 variant1 chr21 9725582 9725977 L1PA3 3288 + +chr21 9719758 9729320 variant1 chr21 9726021 9729309 ALR/Alpha 1051 + +chr21 9729310 9757478 variant2 chr21 9729320 9729809 L1PA3 3897 - +chr21 9729310 9757478 variant2 chr21 9729809 9730866 L1P1 8367 + +chr21 9729310 9757478 variant2 chr21 9730866 9734026 ALR/Alpha 1036 - +chr21 9729310 9757478 variant2 chr21 9734037 9757471 ALR/Alpha 1182 - +chr21 9795588 9796685 variant3 chr21 9795589 9795713 (GAATG)n 308 + +chr21 9795588 9796685 variant3 chr21 9795736 9795894 (GAATG)n 683 + +chr21 9795588 9796685 variant3 chr21 9795911 9796007 (GAATG)n 345 + +chr21 9795588 9796685 variant3 chr21 9796028 9796187 (GAATG)n 756 + +chr21 9795588 9796685 variant3 chr21 9796202 9796615 (GAATG)n 891 + +chr21 9795588 9796685 variant3 chr21 9796637 9796824 (GAATG)n 621 + diff --git 
a/src/bedtools/bedtools_groupby/test_data/expected.bed b/src/bedtools/bedtools_groupby/test_data/expected.bed new file mode 100644 index 00000000..94f90dc9 --- /dev/null +++ b/src/bedtools/bedtools_groupby/test_data/expected.bed @@ -0,0 +1,3 @@ +chr21 9719758 9729320 6353 +chr21 9729310 9757478 14482 +chr21 9795588 9796685 3604 From 4cbcdf15cfa08791336810e007b2242a1484728f Mon Sep 17 00:00:00 2001 From: tgaspe Date: Tue, 6 Aug 2024 15:23:16 -0300 Subject: [PATCH 06/16] more test --- src/bedtools/bedtools_groupby/test.sh | 95 ++++++++++++++++++++++++++- 1 file changed, 93 insertions(+), 2 deletions(-) diff --git a/src/bedtools/bedtools_groupby/test.sh b/src/bedtools/bedtools_groupby/test.sh index c78da8a7..1435facd 100644 --- a/src/bedtools/bedtools_groupby/test.sh +++ b/src/bedtools/bedtools_groupby/test.sh @@ -59,6 +59,21 @@ chr21 9719758 9729320 variant1 3288 chr21 9729310 9757478 variant2 8367 chr21 9795588 9796685 variant3 891 EOF +cat << EOF > test_data/expected_full.bed +chr21 9719758 9729320 variant1 chr21 9719768 9721892 ALR/Alpha 1004 + 6353 +chr21 9729310 9757478 variant2 chr21 9729320 9729809 L1PA3 3897 - 14482 +chr21 9795588 9796685 variant3 chr21 9795589 9795713 (GAATG)n 308 + 3604 +EOF +cat << EOF > test_data/expected_delimited.bed +chr21 9719758 9729320 variant1 1004;1010;3288;1051 +chr21 9729310 9757478 variant2 3897;8367;1036;1182 +chr21 9795588 9796685 variant3 308;683;345;756;891;621 +EOF +cat << EOF > test_data/expected_precision.bed +chr21 9719758 9729320 variant1 1.6e+03 +chr21 9729310 9757478 variant2 3.6e+03 +chr21 9795588 9796685 variant3 6e+02 +EOF # Test 1: without operation option, default operation is sum mkdir test1 @@ -79,7 +94,7 @@ echo "- test1 succeeded -" cd .. -# Test 2: with operation option +# Test 2: with operation max option mkdir test2 cd test2 @@ -99,12 +114,88 @@ echo "- test2 succeeded -" cd .. 
-# Test 3: +# Test 3: full option +mkdir test3 +cd test3 +echo "> Run bedtools groupby on BED file with full option" +"$meta_executable" \ + --input "../test_data/example.bed" \ + --groupby "1-4" \ + --column "9" \ + --full \ + --output "output.bed" + +# checks +assert_file_exists "output.bed" +assert_file_not_empty "output.bed" +assert_identical_content "output.bed" "../test_data/expected_full.bed" +echo "- test3 succeeded -" +cd .. +# Test 4: header option +mkdir test4 +cd test4 + +echo "> Run bedtools groupby on BED file with header option" +"$meta_executable" \ + --input "../test_data/example.bed" \ + --groupby "1-4" \ + --column "9" \ + --header \ + --output "output.bed" + +# checks +assert_file_exists "output.bed" +assert_file_not_empty "output.bed" +assert_file_contains "output.bed" "# Header" +echo "- test4 succeeded -" + +cd .. +# Test 5: Delimiter and collapse +mkdir test5 +cd test5 +echo "> Run bedtools groupby on BED file with delimiter and collapse options" +"$meta_executable" \ + --input "../test_data/example.bed" \ + --groupby "1-4" \ + --column "9" \ + --operation "collapse" \ + --collapse \ + --delimiter ";" \ + --output "output.bed" + +# checks +assert_file_exists "output.bed" +assert_file_not_empty "output.bed" +assert_identical_content "output.bed" "../test_data/expected_delimited.bed" +echo "- test5 succeeded -" + +cd .. + +# Test 6: precision option +mkdir test6 +cd test6 + +echo "> Run bedtools groupby on BED file with precision option" +"$meta_executable" \ + --input "../test_data/example.bed" \ + --groupby "1-4" \ + --column "9" \ + --operation "mean" \ + --precision 2 \ + --output "output.bed" + +# checks +assert_file_exists "output.bed" +assert_file_not_empty "output.bed" +assert_identical_content "output.bed" "../test_data/expected_precision.bed" +echo "- test6 succeeded -" + +cd .. echo "---- All tests succeeded! 
----" exit 0 From df0282e683b863faa243eb83c2ad480f007ba1d4 Mon Sep 17 00:00:00 2001 From: tgaspe Date: Tue, 6 Aug 2024 15:24:50 -0300 Subject: [PATCH 07/16] Update CHANGELOG.md --- CHANGELOG.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 9dd2389c..56d6dfc9 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -26,7 +26,7 @@ * `bedtools`: - `bedtools/bedtools_intersect`: Allows one to screen for overlaps between two sets of genomic features (PR #94). - `bedtools/bedtools_sort`: Sorts a feature file (bed/gff/vcf) by chromosome and other criteria (PR #98). - + - `bedtools/bedtools_groupby`: Summarizes a dataset column based upon common column groupings. Akin to the SQL "group by" command (PR #123). ## MINOR CHANGES From ad9926595d8ffc962ed8c6704f2d4f6c16fde94a Mon Sep 17 00:00:00 2001 From: tgaspe Date: Tue, 6 Aug 2024 15:25:22 -0300 Subject: [PATCH 08/16] deleted test_data --- .../bedtools_groupby/test_data/example.bed | 15 --------------- .../bedtools_groupby/test_data/expected.bed | 3 --- 2 files changed, 18 deletions(-) delete mode 100644 src/bedtools/bedtools_groupby/test_data/example.bed delete mode 100644 src/bedtools/bedtools_groupby/test_data/expected.bed diff --git a/src/bedtools/bedtools_groupby/test_data/example.bed b/src/bedtools/bedtools_groupby/test_data/example.bed deleted file mode 100644 index d86e15dd..00000000 --- a/src/bedtools/bedtools_groupby/test_data/example.bed +++ /dev/null @@ -1,15 +0,0 @@ -# Header -chr21 9719758 9729320 variant1 chr21 9719768 9721892 ALR/Alpha 1004 + -chr21 9719758 9729320 variant1 chr21 9721905 9725582 ALR/Alpha 1010 + -chr21 9719758 9729320 variant1 chr21 9725582 9725977 L1PA3 3288 + -chr21 9719758 9729320 variant1 chr21 9726021 9729309 ALR/Alpha 1051 + -chr21 9729310 9757478 variant2 chr21 9729320 9729809 L1PA3 3897 - -chr21 9729310 9757478 variant2 chr21 9729809 9730866 L1P1 8367 + -chr21 9729310 9757478 variant2 chr21 9730866 9734026 ALR/Alpha 1036 - -chr21 9729310 
9757478 variant2 chr21 9734037 9757471 ALR/Alpha 1182 - -chr21 9795588 9796685 variant3 chr21 9795589 9795713 (GAATG)n 308 + -chr21 9795588 9796685 variant3 chr21 9795736 9795894 (GAATG)n 683 + -chr21 9795588 9796685 variant3 chr21 9795911 9796007 (GAATG)n 345 + -chr21 9795588 9796685 variant3 chr21 9796028 9796187 (GAATG)n 756 + -chr21 9795588 9796685 variant3 chr21 9796202 9796615 (GAATG)n 891 + -chr21 9795588 9796685 variant3 chr21 9796637 9796824 (GAATG)n 621 + diff --git a/src/bedtools/bedtools_groupby/test_data/expected.bed b/src/bedtools/bedtools_groupby/test_data/expected.bed deleted file mode 100644 index 94f90dc9..00000000 --- a/src/bedtools/bedtools_groupby/test_data/expected.bed +++ /dev/null @@ -1,3 +0,0 @@ -chr21 9719758 9729320 6353 -chr21 9729310 9757478 14482 -chr21 9795588 9796685 3604 From 34330996bf0ac065e3b8505a548ba4dcb86652c0 Mon Sep 17 00:00:00 2001 From: tgaspe Date: Tue, 6 Aug 2024 15:34:26 -0300 Subject: [PATCH 09/16] bug fix --- src/bedtools/bedtools_groupby/test.sh | 1 - 1 file changed, 1 deletion(-) diff --git a/src/bedtools/bedtools_groupby/test.sh b/src/bedtools/bedtools_groupby/test.sh index 1435facd..65f513a9 100644 --- a/src/bedtools/bedtools_groupby/test.sh +++ b/src/bedtools/bedtools_groupby/test.sh @@ -164,7 +164,6 @@ echo "> Run bedtools groupby on BED file with delimiter and collapse options" --groupby "1-4" \ --column "9" \ --operation "collapse" \ - --collapse \ --delimiter ";" \ --output "output.bed" From f88bfadfb0a7e5b392da064c5745d336809aa8f9 Mon Sep 17 00:00:00 2001 From: tgaspe Date: Mon, 12 Aug 2024 00:16:42 +0200 Subject: [PATCH 10/16] Update config.vsh.yaml --- src/bedtools/bedtools_groupby/config.vsh.yaml | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/src/bedtools/bedtools_groupby/config.vsh.yaml b/src/bedtools/bedtools_groupby/config.vsh.yaml index c4f1a8f9..5d2ce22b 100644 --- a/src/bedtools/bedtools_groupby/config.vsh.yaml +++ b/src/bedtools/bedtools_groupby/config.vsh.yaml @@ -1,14 
+1,15 @@ name: bedtools_groupby namespace: bedtools description: | - Summarizes a dataset column based upon common column groupings. Akin to the SQL "group by" command. + Summarizes a dataset column based upon common column groupings. + Akin to the SQL "group by" command. keywords: [groupby, BED] links: documentation: https://bedtools.readthedocs.io/en/latest/content/tools/groupby.html repository: https://github.com/arq5x/bedtools2 references: doi: 10.1093/bioinformatics/btq033 -license: GPL-2.0, MIT +license: MIT requirements: commands: [bedtools] authors: @@ -87,7 +88,7 @@ argument_groups: - name: --full type: boolean_true description: | - Print all columns from input file. The first line in the group is used. + Print all columns from input file. The first line in the group is used. Default: print only grouped columns. - name: --inheader From 9c9942d69d637941ff787e95b8a34c1361b6298b Mon Sep 17 00:00:00 2001 From: tgaspe Date: Mon, 12 Aug 2024 21:28:16 +0200 Subject: [PATCH 11/16] adding more links --- src/bedtools/bedtools_groupby/config.vsh.yaml | 2 ++ 1 file changed, 2 insertions(+) diff --git a/src/bedtools/bedtools_groupby/config.vsh.yaml b/src/bedtools/bedtools_groupby/config.vsh.yaml index 5d2ce22b..ecf46dca 100644 --- a/src/bedtools/bedtools_groupby/config.vsh.yaml +++ b/src/bedtools/bedtools_groupby/config.vsh.yaml @@ -7,6 +7,8 @@ keywords: [groupby, BED] links: documentation: https://bedtools.readthedocs.io/en/latest/content/tools/groupby.html repository: https://github.com/arq5x/bedtools2 + homepage: https://bedtools.readthedocs.io/en/latest/# + issue_tracker: https://github.com/arq5x/bedtools2/issues references: doi: 10.1093/bioinformatics/btq033 license: MIT From d9cc263f2ab57d364ae67c18ee3e004148862372 Mon Sep 17 00:00:00 2001 From: tgaspe Date: Mon, 12 Aug 2024 22:13:56 +0200 Subject: [PATCH 12/16] exit on error --- src/bedtools/bedtools_groupby/script.sh | 3 +++ src/bedtools/bedtools_groupby/test.sh | 2 +- 2 files changed, 4 insertions(+), 1 
deletion(-) diff --git a/src/bedtools/bedtools_groupby/script.sh b/src/bedtools/bedtools_groupby/script.sh index ae8547af..cd97d272 100644 --- a/src/bedtools/bedtools_groupby/script.sh +++ b/src/bedtools/bedtools_groupby/script.sh @@ -3,6 +3,9 @@ ## VIASH START ## VIASH END +# Exit on error +set -eo pipefail + # Unset parameters [[ "$par_full" == "false" ]] && unset par_full [[ "$par_inheader" == "false" ]] && unset par_inheader diff --git a/src/bedtools/bedtools_groupby/test.sh b/src/bedtools/bedtools_groupby/test.sh index 65f513a9..2eab8fc0 100644 --- a/src/bedtools/bedtools_groupby/test.sh +++ b/src/bedtools/bedtools_groupby/test.sh @@ -1,7 +1,7 @@ #!/bin/bash # exit on error -set -e +set -eo pipefail ## VIASH START meta_executable="target/executable/bedtools/bedtools_groupby/bedtools_groupby" From a21037ef88d9292931325e3652659fa5c112a6eb Mon Sep 17 00:00:00 2001 From: tgaspe Date: Wed, 14 Aug 2024 23:23:57 +0200 Subject: [PATCH 13/16] $TMPDIR --- src/bedtools/bedtools_groupby/test.sh | 70 +++++++++++++-------------- 1 file changed, 34 insertions(+), 36 deletions(-) diff --git a/src/bedtools/bedtools_groupby/test.sh b/src/bedtools/bedtools_groupby/test.sh index 2eab8fc0..ce99a1ec 100644 --- a/src/bedtools/bedtools_groupby/test.sh +++ b/src/bedtools/bedtools_groupby/test.sh @@ -27,10 +27,14 @@ assert_identical_content() { # Create directories for tests echo "Creating Test Data..." 
-mkdir -p test_data +TMPDIR=$(mktemp -d "$meta_temp_dir/XXXXXX") +function clean_up { + [[ -d "$TMPDIR" ]] && rm -r "$TMPDIR" +} +trap clean_up EXIT # Create and populate example.bed -cat << EOF > test_data/example.bed +cat << EOF > $TMPDIR/example.bed # Header chr21 9719758 9729320 variant1 chr21 9719768 9721892 ALR/Alpha 1004 + chr21 9719758 9729320 variant1 chr21 9721905 9725582 ALR/Alpha 1010 + @@ -49,39 +53,38 @@ chr21 9795588 9796685 variant3 chr21 9796637 9796824 (GAATG)n 621 + EOF # Create and populate expected output files for different tests -cat << EOF > test_data/expected.bed +cat << EOF > $TMPDIR/expected.bed chr21 9719758 9729320 6353 chr21 9729310 9757478 14482 chr21 9795588 9796685 3604 EOF -cat << EOF > test_data/expected_max.bed +cat << EOF > $TMPDIR/expected_max.bed chr21 9719758 9729320 variant1 3288 chr21 9729310 9757478 variant2 8367 chr21 9795588 9796685 variant3 891 EOF -cat << EOF > test_data/expected_full.bed +cat << EOF > $TMPDIR/expected_full.bed chr21 9719758 9729320 variant1 chr21 9719768 9721892 ALR/Alpha 1004 + 6353 chr21 9729310 9757478 variant2 chr21 9729320 9729809 L1PA3 3897 - 14482 chr21 9795588 9796685 variant3 chr21 9795589 9795713 (GAATG)n 308 + 3604 EOF -cat << EOF > test_data/expected_delimited.bed +cat << EOF > $TMPDIR/expected_delimited.bed chr21 9719758 9729320 variant1 1004;1010;3288;1051 chr21 9729310 9757478 variant2 3897;8367;1036;1182 chr21 9795588 9796685 variant3 308;683;345;756;891;621 EOF -cat << EOF > test_data/expected_precision.bed +cat << EOF > $TMPDIR/expected_precision.bed chr21 9719758 9729320 variant1 1.6e+03 chr21 9729310 9757478 variant2 3.6e+03 chr21 9795588 9796685 variant3 6e+02 EOF # Test 1: without operation option, default operation is sum -mkdir test1 -cd test1 +mkdir "$TMPDIR/test1" && pushd "$TMPDIR/test1" > /dev/null echo "> Run bedtools groupby on BED file" "$meta_executable" \ - --input "../test_data/example.bed" \ + --input "../example.bed" \ --groupby "1,2,3" \ --column "9" \ --output 
"output.bed" @@ -89,18 +92,17 @@ echo "> Run bedtools groupby on BED file" # checks assert_file_exists "output.bed" assert_file_not_empty "output.bed" -assert_identical_content "output.bed" "../test_data/expected.bed" +assert_identical_content "output.bed" "../expected.bed" echo "- test1 succeeded -" -cd .. +popd > /dev/null # Test 2: with operation max option -mkdir test2 -cd test2 +mkdir "$TMPDIR/test2" && pushd "$TMPDIR/test2" > /dev/null echo "> Run bedtools groupby on BED file with max operation" "$meta_executable" \ - --input "../test_data/example.bed" \ + --input "../example.bed" \ --groupby "1-4" \ --column "9" \ --operation "max" \ @@ -109,18 +111,17 @@ echo "> Run bedtools groupby on BED file with max operation" # checks assert_file_exists "output.bed" assert_file_not_empty "output.bed" -assert_identical_content "output.bed" "../test_data/expected_max.bed" +assert_identical_content "output.bed" "../expected_max.bed" echo "- test2 succeeded -" -cd .. +popd > /dev/null # Test 3: full option -mkdir test3 -cd test3 +mkdir "$TMPDIR/test3" && pushd "$TMPDIR/test3" > /dev/null echo "> Run bedtools groupby on BED file with full option" "$meta_executable" \ - --input "../test_data/example.bed" \ + --input "../example.bed" \ --groupby "1-4" \ --column "9" \ --full \ @@ -129,18 +130,17 @@ echo "> Run bedtools groupby on BED file with full option" # checks assert_file_exists "output.bed" assert_file_not_empty "output.bed" -assert_identical_content "output.bed" "../test_data/expected_full.bed" +assert_identical_content "output.bed" "../expected_full.bed" echo "- test3 succeeded -" -cd .. 
+popd > /dev/null # Test 4: header option -mkdir test4 -cd test4 +mkdir "$TMPDIR/test4" && pushd "$TMPDIR/test4" > /dev/null echo "> Run bedtools groupby on BED file with header option" "$meta_executable" \ - --input "../test_data/example.bed" \ + --input "../example.bed" \ --groupby "1-4" \ --column "9" \ --header \ @@ -152,15 +152,14 @@ assert_file_not_empty "output.bed" assert_file_contains "output.bed" "# Header" echo "- test4 succeeded -" -cd .. +popd > /dev/null # Test 5: Delimiter and collapse -mkdir test5 -cd test5 +mkdir "$TMPDIR/test5" && pushd "$TMPDIR/test5" > /dev/null echo "> Run bedtools groupby on BED file with delimiter and collapse options" "$meta_executable" \ - --input "../test_data/example.bed" \ + --input "../example.bed" \ --groupby "1-4" \ --column "9" \ --operation "collapse" \ @@ -170,18 +169,17 @@ echo "> Run bedtools groupby on BED file with delimiter and collapse options" # checks assert_file_exists "output.bed" assert_file_not_empty "output.bed" -assert_identical_content "output.bed" "../test_data/expected_delimited.bed" +assert_identical_content "output.bed" "../expected_delimited.bed" echo "- test5 succeeded -" -cd .. +popd > /dev/null # Test 6: precision option -mkdir test6 -cd test6 +mkdir "$TMPDIR/test6" && pushd "$TMPDIR/test6" > /dev/null echo "> Run bedtools groupby on BED file with precision option" "$meta_executable" \ - --input "../test_data/example.bed" \ + --input "../example.bed" \ --groupby "1-4" \ --column "9" \ --operation "mean" \ @@ -191,10 +189,10 @@ echo "> Run bedtools groupby on BED file with precision option" # checks assert_file_exists "output.bed" assert_file_not_empty "output.bed" -assert_identical_content "output.bed" "../test_data/expected_precision.bed" +assert_identical_content "output.bed" "../expected_precision.bed" echo "- test6 succeeded -" -cd .. +popd > /dev/null echo "---- All tests succeeded! 
----" exit 0 From e657b95fd7fb14ffb91de53fa74c283b55d7ff61 Mon Sep 17 00:00:00 2001 From: tgaspe Date: Wed, 14 Aug 2024 23:27:51 +0200 Subject: [PATCH 14/16] Update script.sh --- src/bedtools/bedtools_groupby/script.sh | 17 ++++++++++++----- 1 file changed, 12 insertions(+), 5 deletions(-) diff --git a/src/bedtools/bedtools_groupby/script.sh b/src/bedtools/bedtools_groupby/script.sh index cd97d272..b8a40cdc 100644 --- a/src/bedtools/bedtools_groupby/script.sh +++ b/src/bedtools/bedtools_groupby/script.sh @@ -7,11 +7,18 @@ set -eo pipefail # Unset parameters -[[ "$par_full" == "false" ]] && unset par_full -[[ "$par_inheader" == "false" ]] && unset par_inheader -[[ "$par_outheader" == "false" ]] && unset par_outheader -[[ "$par_header" == "false" ]] && unset par_header -[[ "$par_ignorecase" == "false" ]] && unset par_ignorecase +unset_if_false=( + par_full + par_inheader + par_outheader + par_header + par_ignorecase +) + +for par in ${unset_if_false[@]}; do + test_val="${!par}" + [[ "$test_val" == "false" ]] && unset $par +done bedtools groupby \ ${par_full:+-full} \ From ebacbfcd3b9dc69ca4e15b2d03efa11fd068b968 Mon Sep 17 00:00:00 2001 From: tgaspe Date: Tue, 20 Aug 2024 20:06:29 +0200 Subject: [PATCH 15/16] Update config.vsh.yaml --- src/bedtools/bedtools_groupby/config.vsh.yaml | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/src/bedtools/bedtools_groupby/config.vsh.yaml b/src/bedtools/bedtools_groupby/config.vsh.yaml index ecf46dca..514e06df 100644 --- a/src/bedtools/bedtools_groupby/config.vsh.yaml +++ b/src/bedtools/bedtools_groupby/config.vsh.yaml @@ -76,7 +76,8 @@ argument_groups: freqasc (i.e., print asc. list of values:freq) first (i.e., print first value) last (i.e., print last value) - - Default: sum + + Default value: sum If there is only column, but multiple operations, all operations will be applied on that column. 
Likewise, if there is only one operation, but From 1d0abca5892eaa20a1c1f76ac51425a9faef708c Mon Sep 17 00:00:00 2001 From: tgaspe Date: Wed, 28 Aug 2024 22:59:33 +0200 Subject: [PATCH 16/16] Suggested change on column option --- src/bedtools/bedtools_groupby/config.vsh.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/bedtools/bedtools_groupby/config.vsh.yaml b/src/bedtools/bedtools_groupby/config.vsh.yaml index 514e06df..89c4845b 100644 --- a/src/bedtools/bedtools_groupby/config.vsh.yaml +++ b/src/bedtools/bedtools_groupby/config.vsh.yaml @@ -53,7 +53,7 @@ argument_groups: - name: --column alternatives: [-c, -opCols] - type: string + type: integer description: | Specify the column (1-based) that should be summarized. required: true