From b68f1edd7ae4774e1971cefcd18e3fc9832fbe18 Mon Sep 17 00:00:00 2001
From: Dries Schaumont <5946712+DriesSchaumont@users.noreply.github.com>
Date: Fri, 21 Jun 2024 22:30:34 +0200
Subject: [PATCH] FEAT: add bedtools getfasta. (#59)

* FEAT: add bedtools getfasta.

* Add PR number to CHANGELOG
---
 CHANGELOG.md                                  |   3 +
 .../bedtools_getfasta/config.vsh.yaml         | 103 +++++++++++++++
 src/bedtools/bedtools_getfasta/script.sh      |  22 ++++
 src/bedtools/bedtools_getfasta/test.sh        | 119 ++++++++++++++++++
 4 files changed, 247 insertions(+)
 create mode 100644 src/bedtools/bedtools_getfasta/config.vsh.yaml
 create mode 100644 src/bedtools/bedtools_getfasta/script.sh
 create mode 100644 src/bedtools/bedtools_getfasta/test.sh

diff --git a/CHANGELOG.md b/CHANGELOG.md
index a71db3b4..a3e3fa4d 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -52,6 +52,9 @@
 
 * `falco`: A C++ drop-in replacement of FastQC to assess the quality of sequence read data (PR #43).
 
+* `bedtools`:
+    - `bedtools_getfasta`: extract sequences from a FASTA file for each of the
+                           intervals defined in a BED/GFF/VCF file (PR #59).
 
 ## MINOR CHANGES
 
diff --git a/src/bedtools/bedtools_getfasta/config.vsh.yaml b/src/bedtools/bedtools_getfasta/config.vsh.yaml
new file mode 100644
index 00000000..f1f49a87
--- /dev/null
+++ b/src/bedtools/bedtools_getfasta/config.vsh.yaml
@@ -0,0 +1,103 @@
+name: bedtools_getfasta
+namespace: bedtools
+description: Extract sequences from a FASTA file for each of the intervals defined in a BED/GFF/VCF file.
+keywords: [sequencing, fasta, BED, GFF, VCF]
+links:
+  documentation: https://bedtools.readthedocs.io/en/latest/content/tools/getfasta.html
+  repository: https://github.com/arq5x/bedtools2
+references:
+  doi: 10.1093/bioinformatics/btq033
+license: GPL-2.0
+requirements:
+  commands: [bedtools]
+
+argument_groups:
+  - name: Input arguments
+    arguments:
+      - name: --input_fasta
+        type: file
+        description: |
+          FASTA file containing sequences for each interval specified in the input BED file.
+          The headers in the input FASTA file must exactly match the chromosome column in the BED file.
+      - name: "--input_bed"
+        type: file
+        description: |
+          BED file containing intervals to extract from the FASTA file.
+          BED files containing a single region require a newline character
+          at the end of the line, otherwise a blank output file is produced.
+      - name: --rna
+        type: boolean_true
+        description: |
+          The FASTA is RNA not DNA. Reverse complementation handled accordingly.
+    
+  - name: Run arguments
+    arguments:
+        - name: "--strandedness"
+          type: boolean_true
+          alternatives: ["-s"]
+          description: |
+            Force strandedness. If the feature occupies the antisense strand, the output sequence will
+            be reverse complemented. By default strandedness is not taken into account.
+
+  - name: Output arguments
+    arguments:
+      - name: --output
+        alternatives: [-o]
+        required: true
+        type: file
+        direction: output
+        description: |
+          Output file where the output from the 'bedtools getfasta' commend will
+          be written to.
+      - name: --tab
+        type: boolean_true
+        description: |
+          Report extract sequences in a tab-delimited format instead of in FASTA format.
+      - name: --bed_out
+        type: boolean_true
+        description: |
+          Report extract sequences in a tab-delimited BED format instead of in FASTA format.
+      - name: "--name"
+        type: boolean_true
+        description: |
+          Set the FASTA header for each extracted sequence to be the "name" and coordinate columns from the BED feature.
+      - name: "--name_only"
+        type: boolean_true
+        description: |
+          Set the FASTA header for each extracted sequence to be the "name" columns from the BED feature.
+      - name: "--split"
+        type: boolean_true
+        description: |
+          When --input is in BED12 format, create a separate fasta entry for each block in a BED12 record,
+          blocks being described in the 11th and 12th column of the BED.
+      - name: "--full_header"
+        type: boolean_true
+        description: |
+          Use full fasta header. By default, only the word before the first space or tab is used.
+        
+# Arguments not taken into account:
+#
+#       -fo           [Specify an output file name. By default, output goes to stdout.
+#                            
+
+resources:
+  - type: bash_script
+    path: script.sh
+
+test_resources:
+  - type: bash_script
+    path: test.sh
+
+engines:
+  - type: docker
+    image: debian:stable-slim
+    setup:
+      - type: apt
+        packages: [bedtools, procps]
+      - type: docker
+        run: |
+          echo "bedtools: \"$(bedtools --version | sed -n 's/^bedtools //p')\"" > /var/software_versions.txt
+
+runners:
+  - type: executable
+  - type: nextflow
diff --git a/src/bedtools/bedtools_getfasta/script.sh b/src/bedtools/bedtools_getfasta/script.sh
new file mode 100644
index 00000000..8e88b318
--- /dev/null
+++ b/src/bedtools/bedtools_getfasta/script.sh
@@ -0,0 +1,22 @@
+#!/usr/bin/env bash
+set -eo pipefail
+
+unset_if_false=( par_rna par_strandedness par_tab par_bed_out par_name par_name_only par_split par_full_header )
+
+for par in ${unset_if_false[@]}; do
+    test_val="${!par}"
+    [[ "$test_val" == "false" ]] && unset $par
+done
+
+bedtools getfasta \
+    -fi "$par_input_fasta" \
+    -bed "$par_input_bed" \
+    ${par_rna:+-rna} \
+    ${par_name:+-name} \
+    ${par_name_only:+-nameOnly} \
+    ${par_tab:+-tab} \
+    ${par_bed_out:+-bedOut} \
+    ${par_strandedness:+-s} \
+    ${par_split:+-split} \
+    ${par_full_header:+-fullHeader} > "$par_output"
+
diff --git a/src/bedtools/bedtools_getfasta/test.sh b/src/bedtools/bedtools_getfasta/test.sh
new file mode 100644
index 00000000..a28e3a7e
--- /dev/null
+++ b/src/bedtools/bedtools_getfasta/test.sh
@@ -0,0 +1,119 @@
+#!/usr/bin/env bash
+set -eo pipefail
+
+TMPDIR=$(mktemp -d)
+function clean_up {
+  [[ -d "$TMPDIR" ]] && rm -r "$TMPDIR"
+}
+trap clean_up EXIT
+
+# Create dummy test fasta file
+cat > "$TMPDIR/test.fa" <<EOF
+>chr1
+AAAAAAAACCCCCCCCCCCCCGCTACTGGGGGGGGGGGGGGGGGG
+EOF
+
+TAB="$(printf '\t')"
+
+# Create dummy bed file
+cat > "$TMPDIR/test.bed" <<EOF
+chr1${TAB}5${TAB}10${TAB}myseq
+EOF
+
+# Create expected bed file
+cat > "$TMPDIR/expected.fasta" <<EOF
+>chr1:5-10
+AAACC
+EOF
+
+"$meta_executable" \
+  --input_bed "$TMPDIR/test.bed" \
+  --input_fasta "$TMPDIR/test.fa" \
+  --output "$TMPDIR/output.fasta"
+
+cmp --silent "$TMPDIR/output.fasta" "$TMPDIR/expected.fasta" || { echo "files are different:"; exit 1; }
+
+
+# Create expected bed file for --name
+cat > "$TMPDIR/expected_with_name.fasta" <<EOF
+>myseq::chr1:5-10
+AAACC
+EOF
+
+"$meta_executable" \
+  --input_bed "$TMPDIR/test.bed" \
+  --input_fasta "$TMPDIR/test.fa" \
+  --name \
+  --output "$TMPDIR/output_with_name.fasta"
+
+
+cmp --silent "$TMPDIR/output_with_name.fasta" "$TMPDIR/expected_with_name.fasta" || { echo "Files when using --name are different."; exit 1; }
+
+# Create expected bed file for --name_only
+cat > "$TMPDIR/expected_with_name_only.fasta" <<EOF
+>myseq
+AAACC
+EOF
+
+"$meta_executable" \
+  --input_bed "$TMPDIR/test.bed" \
+  --input_fasta "$TMPDIR/test.fa" \
+  --name_only \
+  --output "$TMPDIR/output_with_name_only.fasta"
+
+cmp --silent "$TMPDIR/output_with_name_only.fasta" "$TMPDIR/expected_with_name_only.fasta" || { echo "Files when using --name_only are different."; exit 1; }
+
+
+# Create expected tab-delimited file for --tab
+cat > "$TMPDIR/expected_tab.out" <<EOF
+myseq${TAB}AAACC
+EOF
+
+"$meta_executable" \
+  --input_bed "$TMPDIR/test.bed" \
+  --input_fasta "$TMPDIR/test.fa" \
+  --name_only \
+  --tab \
+  --output "$TMPDIR/tab.out"
+
+cmp --silent "$TMPDIR/expected_tab.out" "$TMPDIR/tab.out" || { echo "Files when using --tab are different."; exit 1; }
+
+
+# Create expected tab-delimited file for --bed_out
+cat > "$TMPDIR/expected.bed" <<EOF
+chr1${TAB}5${TAB}10${TAB}myseq${TAB}AAACC
+EOF
+
+"$meta_executable" \
+  --input_bed "$TMPDIR/test.bed" \
+  --input_fasta "$TMPDIR/test.fa" \
+  --bed_out \
+  --output "$TMPDIR/output.bed"
+
+
+cmp --silent "$TMPDIR/expected.bed" "$TMPDIR/output.bed" || { echo "Files when using --bed_out are different."; exit 1; }
+
+# Create dummy bed file for strandedness
+cat > "$TMPDIR/test_strandedness.bed" <<EOF
+chr1${TAB}20${TAB}25${TAB}forward${TAB}1${TAB}+
+chr1${TAB}20${TAB}25${TAB}reverse${TAB}1${TAB}-
+EOF
+
+# Create expected tab-delimited file for --bed_out
+cat > "$TMPDIR/expected_strandedness.fasta" <<EOF
+>forward(+)
+CGCTA
+>reverse(-)
+TAGCG
+EOF
+
+"$meta_executable" \
+  --input_bed "$TMPDIR/test_strandedness.bed" \
+  --input_fasta "$TMPDIR/test.fa" \
+  -s \
+  --name_only \
+  --output "$TMPDIR/output_strandedness.fasta"
+
+
+cmp --silent "$TMPDIR/expected_strandedness.fasta" "$TMPDIR/output_strandedness.fasta" || { echo "Files when using -s are different."; exit 1; }
+