From b68f1edd7ae4774e1971cefcd18e3fc9832fbe18 Mon Sep 17 00:00:00 2001 From: Dries Schaumont <5946712+DriesSchaumont@users.noreply.github.com> Date: Fri, 21 Jun 2024 22:30:34 +0200 Subject: [PATCH] FEAT: add bedtools getfasta. (#59) * FEAT: add bedtools getfasta. * Add PR number to CHANGELOG --- CHANGELOG.md | 3 + .../bedtools_getfasta/config.vsh.yaml | 103 +++++++++++++++ src/bedtools/bedtools_getfasta/script.sh | 22 ++++ src/bedtools/bedtools_getfasta/test.sh | 119 ++++++++++++++++++ 4 files changed, 247 insertions(+) create mode 100644 src/bedtools/bedtools_getfasta/config.vsh.yaml create mode 100644 src/bedtools/bedtools_getfasta/script.sh create mode 100644 src/bedtools/bedtools_getfasta/test.sh diff --git a/CHANGELOG.md b/CHANGELOG.md index a71db3b4..a3e3fa4d 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -52,6 +52,9 @@ * `falco`: A C++ drop-in replacement of FastQC to assess the quality of sequence read data (PR #43). +* `bedtools`: + - `bedtools_getfasta`: extract sequences from a FASTA file for each of the + intervals defined in a BED/GFF/VCF file (PR #59). ## MINOR CHANGES diff --git a/src/bedtools/bedtools_getfasta/config.vsh.yaml b/src/bedtools/bedtools_getfasta/config.vsh.yaml new file mode 100644 index 00000000..f1f49a87 --- /dev/null +++ b/src/bedtools/bedtools_getfasta/config.vsh.yaml @@ -0,0 +1,103 @@ +name: bedtools_getfasta +namespace: bedtools +description: Extract sequences from a FASTA file for each of the intervals defined in a BED/GFF/VCF file. +keywords: [sequencing, fasta, BED, GFF, VCF] +links: + documentation: https://bedtools.readthedocs.io/en/latest/content/tools/getfasta.html + repository: https://github.com/arq5x/bedtools2 +references: + doi: 10.1093/bioinformatics/btq033 +license: GPL-2.0 +requirements: + commands: [bedtools] + +argument_groups: + - name: Input arguments + arguments: + - name: --input_fasta + type: file + description: | + FASTA file containing sequences for each interval specified in the input BED file. 
+ The headers in the input FASTA file must exactly match the chromosome column in the BED file. + - name: "--input_bed" + type: file + description: | + BED file containing intervals to extract from the FASTA file. + BED files containing a single region require a newline character + at the end of the line, otherwise a blank output file is produced. + - name: --rna + type: boolean_true + description: | + The FASTA is RNA, not DNA. Reverse complementation is handled accordingly. + + - name: Run arguments + arguments: + - name: "--strandedness" + type: boolean_true + alternatives: ["-s"] + description: | + Force strandedness. If the feature occupies the antisense strand, the output sequence will + be reverse complemented. By default strandedness is not taken into account. + + - name: Output arguments + arguments: + - name: --output + alternatives: [-o] + required: true + type: file + direction: output + description: | + Output file where the output from the 'bedtools getfasta' command will + be written to. + - name: --tab + type: boolean_true + description: | + Report extracted sequences in a tab-delimited format instead of in FASTA format. + - name: --bed_out + type: boolean_true + description: | + Report extracted sequences in a tab-delimited BED format instead of in FASTA format. + - name: "--name" + type: boolean_true + description: | + Set the FASTA header for each extracted sequence to be the "name" and coordinate columns from the BED feature. + - name: "--name_only" + type: boolean_true + description: | + Set the FASTA header for each extracted sequence to be the "name" column from the BED feature. + - name: "--split" + type: boolean_true + description: | + When --input_bed is in BED12 format, create a separate fasta entry for each block in a BED12 record, + blocks being described in the 11th and 12th columns of the BED. + - name: "--full_header" + type: boolean_true + description: | + Use full fasta header. By default, only the word before the first space or tab is used. 
+ +# Arguments not taken into account: +# +# -fo: Specify an output file name. Not exposed, because script.sh redirects stdout to --output. +# + +resources: + - type: bash_script + path: script.sh + +test_resources: + - type: bash_script + path: test.sh + +engines: + - type: docker + image: debian:stable-slim + setup: + - type: apt + packages: [bedtools, procps] + - type: docker + run: | + echo "bedtools: \"$(bedtools --version | sed -n 's/^bedtools //p')\"" > /var/software_versions.txt + +runners: + - type: executable + - type: nextflow diff --git a/src/bedtools/bedtools_getfasta/script.sh b/src/bedtools/bedtools_getfasta/script.sh new file mode 100644 index 00000000..8e88b318 --- /dev/null +++ b/src/bedtools/bedtools_getfasta/script.sh @@ -0,0 +1,22 @@ +#!/usr/bin/env bash +set -eo pipefail + +unset_if_false=( par_rna par_strandedness par_tab par_bed_out par_name par_name_only par_split par_full_header ) + +for par in ${unset_if_false[@]}; do + test_val="${!par}" + [[ "$test_val" == "false" ]] && unset $par +done + +bedtools getfasta \ + -fi "$par_input_fasta" \ + -bed "$par_input_bed" \ + ${par_rna:+-rna} \ + ${par_name:+-name} \ + ${par_name_only:+-nameOnly} \ + ${par_tab:+-tab} \ + ${par_bed_out:+-bedOut} \ + ${par_strandedness:+-s} \ + ${par_split:+-split} \ + ${par_full_header:+-fullHeader} > "$par_output" + diff --git a/src/bedtools/bedtools_getfasta/test.sh b/src/bedtools/bedtools_getfasta/test.sh new file mode 100644 index 00000000..a28e3a7e --- /dev/null +++ b/src/bedtools/bedtools_getfasta/test.sh @@ -0,0 +1,119 @@ +#!/usr/bin/env bash +set -eo pipefail + +TMPDIR=$(mktemp -d) +function clean_up { + [[ -d "$TMPDIR" ]] && rm -r "$TMPDIR" +} +trap clean_up EXIT + +# Create dummy test fasta file +cat > "$TMPDIR/test.fa" <chr1 +AAAAAAAACCCCCCCCCCCCCGCTACTGGGGGGGGGGGGGGGGGG +EOF + +TAB="$(printf '\t')" + +# Create dummy bed file +cat > "$TMPDIR/test.bed" < "$TMPDIR/expected.fasta" <chr1:5-10 +AAACC +EOF + +"$meta_executable" \ + --input_bed "$TMPDIR/test.bed" \ + --input_fasta 
"$TMPDIR/test.fa" \ + --output "$TMPDIR/output.fasta" + +cmp --silent "$TMPDIR/output.fasta" "$TMPDIR/expected.fasta" || { echo "files are different:"; exit 1; } + + +# Create expected bed file for --name +cat > "$TMPDIR/expected_with_name.fasta" <myseq::chr1:5-10 +AAACC +EOF + +"$meta_executable" \ + --input_bed "$TMPDIR/test.bed" \ + --input_fasta "$TMPDIR/test.fa" \ + --name \ + --output "$TMPDIR/output_with_name.fasta" + + +cmp --silent "$TMPDIR/output_with_name.fasta" "$TMPDIR/expected_with_name.fasta" || { echo "Files when using --name are different."; exit 1; } + +# Create expected bed file for --name_only +cat > "$TMPDIR/expected_with_name_only.fasta" <myseq +AAACC +EOF + +"$meta_executable" \ + --input_bed "$TMPDIR/test.bed" \ + --input_fasta "$TMPDIR/test.fa" \ + --name_only \ + --output "$TMPDIR/output_with_name_only.fasta" + +cmp --silent "$TMPDIR/output_with_name_only.fasta" "$TMPDIR/expected_with_name_only.fasta" || { echo "Files when using --name_only are different."; exit 1; } + + +# Create expected tab-delimited file for --tab +cat > "$TMPDIR/expected_tab.out" < "$TMPDIR/expected.bed" < "$TMPDIR/test_strandedness.bed" < "$TMPDIR/expected_strandedness.fasta" <forward(+) +CGCTA +>reverse(-) +TAGCG +EOF + +"$meta_executable" \ + --input_bed "$TMPDIR/test_strandedness.bed" \ + --input_fasta "$TMPDIR/test.fa" \ + -s \ + --name_only \ + --output "$TMPDIR/output_strandedness.fasta" + + +cmp --silent "$TMPDIR/expected_strandedness.fasta" "$TMPDIR/output_strandedness.fasta" || { echo "Files when using -s are different."; exit 1; } +