diff --git a/CHANGELOG.md b/CHANGELOG.md index 6dda7ab4..29fb8cfa 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -29,6 +29,7 @@ * `bedtools`: - `bedtools/bedtools_intersect`: Allows one to screen for overlaps between two sets of genomic features (PR #94). - `bedtools/bedtools_sort`: Sorts a feature file (bed/gff/vcf) by chromosome and other criteria (PR #98). + - `bedtools/bedtools_groupby`: Summarizes a dataset column based upon common column groupings. Akin to the SQL "group by" command (PR #123). - `bedtools/bedtools_merge`: Merges overlapping BED/GFF/VCF entries into a single interval (PR #118). - `bedtools/bedtools_bamtofastq`: Convert BAM alignments to FASTQ files (PR #101). - `bedtools/bedtools_bedtobam`: Converts genomic feature records (bed/gff/vcf) to BAM format (PR #111). @@ -38,6 +39,7 @@ * `rsem/rsem_prepare_reference`: Prepare transcript references for RSEM (PR #89). + ## MINOR CHANGES * `busco` components: update BUSCO to `5.7.1` (PR #72). diff --git a/src/bedtools/bedtools_groupby/config.vsh.yaml b/src/bedtools/bedtools_groupby/config.vsh.yaml new file mode 100644 index 00000000..89c4845b --- /dev/null +++ b/src/bedtools/bedtools_groupby/config.vsh.yaml @@ -0,0 +1,155 @@ +name: bedtools_groupby +namespace: bedtools +description: | + Summarizes a dataset column based upon common column groupings. + Akin to the SQL "group by" command. +keywords: [groupby, BED] +links: + documentation: https://bedtools.readthedocs.io/en/latest/content/tools/groupby.html + repository: https://github.com/arq5x/bedtools2 + homepage: https://bedtools.readthedocs.io/en/latest/# + issue_tracker: https://github.com/arq5x/bedtools2/issues +references: + doi: 10.1093/bioinformatics/btq033 +license: MIT +requirements: + commands: [bedtools] +authors: + - __merge__: /src/_authors/theodoro_gasperin.yaml + roles: [ author, maintainer ] + +argument_groups: + - name: Inputs + arguments: + - name: --input + alternatives: -i + type: file + direction: input + description: | + The input BED file to be used. + required: true + example: input_a.bed + + - name: Outputs + arguments: + - name: --output + type: file + direction: output + description: | + The output groupby BED file. + required: true + example: output.bed + + - name: Options + arguments: + - name: --groupby + alternatives: [-g, -grp] + type: string + description: | + Specify the columns (1-based) for the grouping. + The columns must be comma separated. + - Default: 1,2,3 + required: true + + - name: --column + alternatives: [-c, -opCols] + type: integer + description: | + Specify the column (1-based) that should be summarized. + required: true + + - name: --operation + alternatives: [-o, -ops] + type: string + description: | + Specify the operation that should be applied to opCol. + Valid operations: + sum, count, count_distinct, min, max, + mean, median, mode, antimode, + stdev, sstdev (sample standard dev.), + collapse (i.e., print a comma separated list (duplicates allowed)), + distinct (i.e., print a comma separated list (NO duplicates allowed)), + distinct_sort_num (as distinct, but sorted numerically, ascending), + distinct_sort_num_desc (as distinct, but sorted numerically, descending), + concat (i.e., merge values into a single, non-delimited string), + freqdesc (i.e., print desc. list of values:freq) + freqasc (i.e., print asc. list of values:freq) + first (i.e., print first value) + last (i.e., print last value) + + Default value: sum + + If there is only column, but multiple operations, all operations will be + applied on that column. Likewise, if there is only one operation, but + multiple columns, that operation will be applied to all columns. + Otherwise, the number of columns must match the the number of operations, + and will be applied in respective order. + E.g., "-c 5,4,6 -o sum,mean,count" will give the sum of column 5, + the mean of column 4, and the count of column 6. + The order of output columns will match the ordering given in the command. + + - name: --full + type: boolean_true + description: | + Print all columns from input file. The first line in the group is used. + Default: print only grouped columns. + + - name: --inheader + type: boolean_true + description: | + Input file has a header line - the first line will be ignored. + + - name: --outheader + type: boolean_true + description: | + Print header line in the output, detailing the column names. + If the input file has headers (-inheader), the output file + will use the input's column names. + If the input file has no headers, the output file + will use "col_1", "col_2", etc. as the column names. + + - name: --header + type: boolean_true + description: same as '-inheader -outheader'. + + - name: --ignorecase + type: boolean_true + description: | + Group values regardless of upper/lower case. + + - name: --precision + alternatives: -prec + type: integer + description: | + Sets the decimal precision for output. + default: 5 + + - name: --delimiter + alternatives: -delim + type: string + description: | + Specify a custom delimiter for the collapse operations. + example: "|" + default: "," + +resources: + - type: bash_script + path: script.sh + +test_resources: + - type: bash_script + path: test.sh + +engines: + - type: docker + image: debian:stable-slim + setup: + - type: apt + packages: [bedtools, procps] + - type: docker + run: | + echo "bedtools: \"$(bedtools --version | sed -n 's/^bedtools //p')\"" > /var/software_versions.txt + +runners: + - type: executable + - type: nextflow diff --git a/src/bedtools/bedtools_groupby/help.txt b/src/bedtools/bedtools_groupby/help.txt new file mode 100644 index 00000000..a631b4b1 --- /dev/null +++ b/src/bedtools/bedtools_groupby/help.txt @@ -0,0 +1,93 @@ +```bash +bedtools groupby +``` + +Tool: bedtools groupby +Version: v2.30.0 +Summary: Summarizes a dataset column based upon + common column groupings. Akin to the SQL "group by" command. + +Usage: bedtools groupby -g [group_column(s)] -c [op_column(s)] -o [ops] + cat [FILE] | bedtools groupby -g [group_column(s)] -c [op_column(s)] -o [ops] + +Options: + -i Input file. Assumes "stdin" if omitted. + + -g -grp Specify the columns (1-based) for the grouping. + The columns must be comma separated. + - Default: 1,2,3 + + -c -opCols Specify the column (1-based) that should be summarized. + - Required. + + -o -ops Specify the operation that should be applied to opCol. + Valid operations: + sum, count, count_distinct, min, max, + mean, median, mode, antimode, + stdev, sstdev (sample standard dev.), + collapse (i.e., print a comma separated list (duplicates allowed)), + distinct (i.e., print a comma separated list (NO duplicates allowed)), + distinct_sort_num (as distinct, but sorted numerically, ascending), + distinct_sort_num_desc (as distinct, but sorted numerically, descending), + concat (i.e., merge values into a single, non-delimited string), + freqdesc (i.e., print desc. list of values:freq) + freqasc (i.e., print asc. list of values:freq) + first (i.e., print first value) + last (i.e., print last value) + - Default: sum + + If there is only column, but multiple operations, all operations will be + applied on that column. Likewise, if there is only one operation, but + multiple columns, that operation will be applied to all columns. + Otherwise, the number of columns must match the the number of operations, + and will be applied in respective order. + E.g., "-c 5,4,6 -o sum,mean,count" will give the sum of column 5, + the mean of column 4, and the count of column 6. + The order of output columns will match the ordering given in the command. + + + -full Print all columns from input file. The first line in the group is used. + Default: print only grouped columns. + + -inheader Input file has a header line - the first line will be ignored. + + -outheader Print header line in the output, detailing the column names. + If the input file has headers (-inheader), the output file + will use the input's column names. + If the input file has no headers, the output file + will use "col_1", "col_2", etc. as the column names. + + -header same as '-inheader -outheader' + + -ignorecase Group values regardless of upper/lower case. + + -prec Sets the decimal precision for output (Default: 5) + + -delim Specify a custom delimiter for the collapse operations. + - Example: -delim "|" + - Default: ",". + +Examples: + $ cat ex1.out + chr1 10 20 A chr1 15 25 B.1 1000 ATAT + chr1 10 20 A chr1 25 35 B.2 10000 CGCG + + $ groupBy -i ex1.out -g 1,2,3,4 -c 9 -o sum + chr1 10 20 A 11000 + + $ groupBy -i ex1.out -grp 1,2,3,4 -opCols 9,9 -ops sum,max + chr1 10 20 A 11000 10000 + + $ groupBy -i ex1.out -g 1,2,3,4 -c 8,9 -o collapse,mean + chr1 10 20 A B.1,B.2, 5500 + + $ cat ex1.out | groupBy -g 1,2,3,4 -c 8,9 -o collapse,mean + chr1 10 20 A B.1,B.2, 5500 + + $ cat ex1.out | groupBy -g 1,2,3,4 -c 10 -o concat + chr1 10 20 A ATATCGCG + +Notes: + (1) The input file/stream should be sorted/grouped by the -grp. columns + (2) If -i is unspecified, input is assumed to come from stdin. + diff --git a/src/bedtools/bedtools_groupby/script.sh b/src/bedtools/bedtools_groupby/script.sh new file mode 100644 index 00000000..b8a40cdc --- /dev/null +++ b/src/bedtools/bedtools_groupby/script.sh @@ -0,0 +1,36 @@ +#!/bin/bash + +## VIASH START +## VIASH END + +# Exit on error +set -eo pipefail + +# Unset parameters +unset_if_false=( + par_full + par_inheader + par_outheader + par_header + par_ignorecase +) + +for par in ${unset_if_false[@]}; do + test_val="${!par}" + [[ "$test_val" == "false" ]] && unset $par +done + +bedtools groupby \ + ${par_full:+-full} \ + ${par_inheader:+-inheader} \ + ${par_outheader:+-outheader} \ + ${par_header:+-header} \ + ${par_ignorecase:+-ignorecase} \ + ${par_precision:+-prec "$par_precision"} \ + ${par_delimiter:+-delim "$par_delimiter"} \ + -i "$par_input" \ + -g "$par_groupby" \ + -c "$par_column" \ + ${par_operation:+-o "$par_operation"} \ + > "$par_output" + \ No newline at end of file diff --git a/src/bedtools/bedtools_groupby/test.sh b/src/bedtools/bedtools_groupby/test.sh new file mode 100644 index 00000000..ce99a1ec --- /dev/null +++ b/src/bedtools/bedtools_groupby/test.sh @@ -0,0 +1,198 @@ +#!/bin/bash + +# exit on error +set -eo pipefail + +## VIASH START +meta_executable="target/executable/bedtools/bedtools_groupby/bedtools_groupby" +meta_resources_dir="src/bedtools/bedtools_groupby" +## VIASH END + +############################################# +# helper functions +assert_file_exists() { + [ -f "$1" ] || { echo "File '$1' does not exist" && exit 1; } +} +assert_file_not_empty() { + [ -s "$1" ] || { echo "File '$1' is empty but shouldn't be" && exit 1; } +} +assert_file_contains() { + grep -q "$2" "$1" || { echo "File '$1' does not contain '$2'" && exit 1; } +} +assert_identical_content() { + diff -a "$2" "$1" \ + || (echo "Files are not identical!" && exit 1) +} +############################################# + +# Create directories for tests +echo "Creating Test Data..." +TMPDIR=$(mktemp -d "$meta_temp_dir/XXXXXX") +function clean_up { + [[ -d "$TMPDIR" ]] && rm -r "$TMPDIR" +} +trap clean_up EXIT + +# Create and populate example.bed +cat << EOF > $TMPDIR/example.bed +# Header +chr21 9719758 9729320 variant1 chr21 9719768 9721892 ALR/Alpha 1004 + +chr21 9719758 9729320 variant1 chr21 9721905 9725582 ALR/Alpha 1010 + +chr21 9719758 9729320 variant1 chr21 9725582 9725977 L1PA3 3288 + +chr21 9719758 9729320 variant1 chr21 9726021 9729309 ALR/Alpha 1051 + +chr21 9729310 9757478 variant2 chr21 9729320 9729809 L1PA3 3897 - +chr21 9729310 9757478 variant2 chr21 9729809 9730866 L1P1 8367 + +chr21 9729310 9757478 variant2 chr21 9730866 9734026 ALR/Alpha 1036 - +chr21 9729310 9757478 variant2 chr21 9734037 9757471 ALR/Alpha 1182 - +chr21 9795588 9796685 variant3 chr21 9795589 9795713 (GAATG)n 308 + +chr21 9795588 9796685 variant3 chr21 9795736 9795894 (GAATG)n 683 + +chr21 9795588 9796685 variant3 chr21 9795911 9796007 (GAATG)n 345 + +chr21 9795588 9796685 variant3 chr21 9796028 9796187 (GAATG)n 756 + +chr21 9795588 9796685 variant3 chr21 9796202 9796615 (GAATG)n 891 + +chr21 9795588 9796685 variant3 chr21 9796637 9796824 (GAATG)n 621 + +EOF + +# Create and populate expected output files for different tests +cat << EOF > $TMPDIR/expected.bed +chr21 9719758 9729320 6353 +chr21 9729310 9757478 14482 +chr21 9795588 9796685 3604 +EOF +cat << EOF > $TMPDIR/expected_max.bed +chr21 9719758 9729320 variant1 3288 +chr21 9729310 9757478 variant2 8367 +chr21 9795588 9796685 variant3 891 +EOF +cat << EOF > $TMPDIR/expected_full.bed +chr21 9719758 9729320 variant1 chr21 9719768 9721892 ALR/Alpha 1004 + 6353 +chr21 9729310 9757478 variant2 chr21 9729320 9729809 L1PA3 3897 - 14482 +chr21 9795588 9796685 variant3 chr21 9795589 9795713 (GAATG)n 308 + 3604 +EOF +cat << EOF > $TMPDIR/expected_delimited.bed +chr21 9719758 9729320 variant1 1004;1010;3288;1051 +chr21 9729310 9757478 variant2 3897;8367;1036;1182 +chr21 9795588 9796685 variant3 308;683;345;756;891;621 +EOF +cat << EOF > $TMPDIR/expected_precision.bed +chr21 9719758 9729320 variant1 1.6e+03 +chr21 9729310 9757478 variant2 3.6e+03 +chr21 9795588 9796685 variant3 6e+02 +EOF + +# Test 1: without operation option, default operation is sum +mkdir "$TMPDIR/test1" && pushd "$TMPDIR/test1" > /dev/null + +echo "> Run bedtools groupby on BED file" +"$meta_executable" \ + --input "../example.bed" \ + --groupby "1,2,3" \ + --column "9" \ + --output "output.bed" + +# checks +assert_file_exists "output.bed" +assert_file_not_empty "output.bed" +assert_identical_content "output.bed" "../expected.bed" +echo "- test1 succeeded -" + +popd > /dev/null + +# Test 2: with operation max option +mkdir "$TMPDIR/test2" && pushd "$TMPDIR/test2" > /dev/null + +echo "> Run bedtools groupby on BED file with max operation" +"$meta_executable" \ + --input "../example.bed" \ + --groupby "1-4" \ + --column "9" \ + --operation "max" \ + --output "output.bed" + +# checks +assert_file_exists "output.bed" +assert_file_not_empty "output.bed" +assert_identical_content "output.bed" "../expected_max.bed" +echo "- test2 succeeded -" + +popd > /dev/null + +# Test 3: full option +mkdir "$TMPDIR/test3" && pushd "$TMPDIR/test3" > /dev/null + +echo "> Run bedtools groupby on BED file with full option" +"$meta_executable" \ + --input "../example.bed" \ + --groupby "1-4" \ + --column "9" \ + --full \ + --output "output.bed" + +# checks +assert_file_exists "output.bed" +assert_file_not_empty "output.bed" +assert_identical_content "output.bed" "../expected_full.bed" +echo "- test3 succeeded -" + +popd > /dev/null + +# Test 4: header option +mkdir "$TMPDIR/test4" && pushd "$TMPDIR/test4" > /dev/null + +echo "> Run bedtools groupby on BED file with header option" +"$meta_executable" \ + --input "../example.bed" \ + --groupby "1-4" \ + --column "9" \ + --header \ + --output "output.bed" + +# checks +assert_file_exists "output.bed" +assert_file_not_empty "output.bed" +assert_file_contains "output.bed" "# Header" +echo "- test4 succeeded -" + +popd > /dev/null + +# Test 5: Delimiter and collapse +mkdir "$TMPDIR/test5" && pushd "$TMPDIR/test5" > /dev/null + +echo "> Run bedtools groupby on BED file with delimiter and collapse options" +"$meta_executable" \ + --input "../example.bed" \ + --groupby "1-4" \ + --column "9" \ + --operation "collapse" \ + --delimiter ";" \ + --output "output.bed" + +# checks +assert_file_exists "output.bed" +assert_file_not_empty "output.bed" +assert_identical_content "output.bed" "../expected_delimited.bed" +echo "- test5 succeeded -" + +popd > /dev/null + +# Test 6: precision option +mkdir "$TMPDIR/test6" && pushd "$TMPDIR/test6" > /dev/null + +echo "> Run bedtools groupby on BED file with precision option" +"$meta_executable" \ + --input "../example.bed" \ + --groupby "1-4" \ + --column "9" \ + --operation "mean" \ + --precision 2 \ + --output "output.bed" + +# checks +assert_file_exists "output.bed" +assert_file_not_empty "output.bed" +assert_identical_content "output.bed" "../expected_precision.bed" +echo "- test6 succeeded -" + +popd > /dev/null + +echo "---- All tests succeeded! ----" +exit 0