Skip to content

Commit

Permalink
FEAT: add bedtools getfasta. (#59)
Browse files Browse the repository at this point in the history
* FEAT: add bedtools getfasta.

* Add PR number to CHANGELOG
  • Loading branch information
DriesSchaumont authored Jun 21, 2024
1 parent 9d74c3b commit b68f1ed
Show file tree
Hide file tree
Showing 4 changed files with 247 additions and 0 deletions.
3 changes: 3 additions & 0 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -52,6 +52,9 @@

* `falco`: A C++ drop-in replacement of FastQC to assess the quality of sequence read data (PR #43).

* `bedtools`:
- `bedtools_getfasta`: extract sequences from a FASTA file for each of the
intervals defined in a BED/GFF/VCF file (PR #59).

## MINOR CHANGES

Expand Down
103 changes: 103 additions & 0 deletions src/bedtools/bedtools_getfasta/config.vsh.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,103 @@
name: bedtools_getfasta
namespace: bedtools
description: Extract sequences from a FASTA file for each of the intervals defined in a BED/GFF/VCF file.
keywords: [sequencing, fasta, BED, GFF, VCF]
links:
documentation: https://bedtools.readthedocs.io/en/latest/content/tools/getfasta.html
repository: https://github.com/arq5x/bedtools2
references:
doi: 10.1093/bioinformatics/btq033
license: GPL-2.0
requirements:
commands: [bedtools]

argument_groups:
- name: Input arguments
arguments:
- name: --input_fasta
type: file
description: |
FASTA file containing sequences for each interval specified in the input BED file.
The headers in the input FASTA file must exactly match the chromosome column in the BED file.
- name: "--input_bed"
type: file
description: |
BED file containing intervals to extract from the FASTA file.
BED files containing a single region require a newline character
at the end of the line, otherwise a blank output file is produced.
- name: --rna
type: boolean_true
description: |
The FASTA is RNA not DNA. Reverse complementation handled accordingly.
- name: Run arguments
arguments:
- name: "--strandedness"
type: boolean_true
alternatives: ["-s"]
description: |
Force strandedness. If the feature occupies the antisense strand, the output sequence will
be reverse complemented. By default strandedness is not taken into account.
- name: Output arguments
arguments:
- name: --output
alternatives: [-o]
required: true
type: file
direction: output
description: |
Output file where the output from the 'bedtools getfasta' commend will
be written to.
- name: --tab
type: boolean_true
description: |
Report extract sequences in a tab-delimited format instead of in FASTA format.
- name: --bed_out
type: boolean_true
description: |
Report extract sequences in a tab-delimited BED format instead of in FASTA format.
- name: "--name"
type: boolean_true
description: |
Set the FASTA header for each extracted sequence to be the "name" and coordinate columns from the BED feature.
- name: "--name_only"
type: boolean_true
description: |
Set the FASTA header for each extracted sequence to be the "name" columns from the BED feature.
- name: "--split"
type: boolean_true
description: |
When --input is in BED12 format, create a separate fasta entry for each block in a BED12 record,
blocks being described in the 11th and 12th column of the BED.
- name: "--full_header"
type: boolean_true
description: |
Use full fasta header. By default, only the word before the first space or tab is used.
# Arguments not taken into account:
#
# -fo [Specify an output file name. By default, output goes to stdout.
#

resources:
- type: bash_script
path: script.sh

test_resources:
- type: bash_script
path: test.sh

engines:
- type: docker
image: debian:stable-slim
setup:
- type: apt
packages: [bedtools, procps]
- type: docker
run: |
echo "bedtools: \"$(bedtools --version | sed -n 's/^bedtools //p')\"" > /var/software_versions.txt
runners:
- type: executable
- type: nextflow
22 changes: 22 additions & 0 deletions src/bedtools/bedtools_getfasta/script.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,22 @@
#!/usr/bin/env bash
set -eo pipefail

unset_if_false=( par_rna par_strandedness par_tab par_bed_out par_name par_name_only par_split par_full_header )

for par in ${unset_if_false[@]}; do
test_val="${!par}"
[[ "$test_val" == "false" ]] && unset $par
done

bedtools getfasta \
-fi "$par_input_fasta" \
-bed "$par_input_bed" \
${par_rna:+-rna} \
${par_name:+-name} \
${par_name_only:+-nameOnly} \
${par_tab:+-tab} \
${par_bed_out:+-bedOut} \
${par_strandedness:+-s} \
${par_split:+-split} \
${par_full_header:+-fullHeader} > "$par_output"

119 changes: 119 additions & 0 deletions src/bedtools/bedtools_getfasta/test.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,119 @@
#!/usr/bin/env bash
set -eo pipefail

TMPDIR=$(mktemp -d)
function clean_up {
[[ -d "$TMPDIR" ]] && rm -r "$TMPDIR"
}
trap clean_up EXIT

# Create dummy test fasta file
cat > "$TMPDIR/test.fa" <<EOF
>chr1
AAAAAAAACCCCCCCCCCCCCGCTACTGGGGGGGGGGGGGGGGGG
EOF

TAB="$(printf '\t')"

# Create dummy bed file
cat > "$TMPDIR/test.bed" <<EOF
chr1${TAB}5${TAB}10${TAB}myseq
EOF

# Create expected bed file
cat > "$TMPDIR/expected.fasta" <<EOF
>chr1:5-10
AAACC
EOF

"$meta_executable" \
--input_bed "$TMPDIR/test.bed" \
--input_fasta "$TMPDIR/test.fa" \
--output "$TMPDIR/output.fasta"

cmp --silent "$TMPDIR/output.fasta" "$TMPDIR/expected.fasta" || { echo "files are different:"; exit 1; }


# Create expected bed file for --name
cat > "$TMPDIR/expected_with_name.fasta" <<EOF
>myseq::chr1:5-10
AAACC
EOF

"$meta_executable" \
--input_bed "$TMPDIR/test.bed" \
--input_fasta "$TMPDIR/test.fa" \
--name \
--output "$TMPDIR/output_with_name.fasta"


cmp --silent "$TMPDIR/output_with_name.fasta" "$TMPDIR/expected_with_name.fasta" || { echo "Files when using --name are different."; exit 1; }

# Create expected bed file for --name_only
cat > "$TMPDIR/expected_with_name_only.fasta" <<EOF
>myseq
AAACC
EOF

"$meta_executable" \
--input_bed "$TMPDIR/test.bed" \
--input_fasta "$TMPDIR/test.fa" \
--name_only \
--output "$TMPDIR/output_with_name_only.fasta"

cmp --silent "$TMPDIR/output_with_name_only.fasta" "$TMPDIR/expected_with_name_only.fasta" || { echo "Files when using --name_only are different."; exit 1; }


# Create expected tab-delimited file for --tab
cat > "$TMPDIR/expected_tab.out" <<EOF
myseq${TAB}AAACC
EOF

"$meta_executable" \
--input_bed "$TMPDIR/test.bed" \
--input_fasta "$TMPDIR/test.fa" \
--name_only \
--tab \
--output "$TMPDIR/tab.out"

cmp --silent "$TMPDIR/expected_tab.out" "$TMPDIR/tab.out" || { echo "Files when using --tab are different."; exit 1; }


# Create expected tab-delimited file for --bed_out
cat > "$TMPDIR/expected.bed" <<EOF
chr1${TAB}5${TAB}10${TAB}myseq${TAB}AAACC
EOF

"$meta_executable" \
--input_bed "$TMPDIR/test.bed" \
--input_fasta "$TMPDIR/test.fa" \
--bed_out \
--output "$TMPDIR/output.bed"


cmp --silent "$TMPDIR/expected.bed" "$TMPDIR/output.bed" || { echo "Files when using --bed_out are different."; exit 1; }

# Create dummy bed file for strandedness
cat > "$TMPDIR/test_strandedness.bed" <<EOF
chr1${TAB}20${TAB}25${TAB}forward${TAB}1${TAB}+
chr1${TAB}20${TAB}25${TAB}reverse${TAB}1${TAB}-
EOF

# Create expected tab-delimited file for --bed_out
cat > "$TMPDIR/expected_strandedness.fasta" <<EOF
>forward(+)
CGCTA
>reverse(-)
TAGCG
EOF

"$meta_executable" \
--input_bed "$TMPDIR/test_strandedness.bed" \
--input_fasta "$TMPDIR/test.fa" \
-s \
--name_only \
--output "$TMPDIR/output_strandedness.fasta"


cmp --silent "$TMPDIR/expected_strandedness.fasta" "$TMPDIR/output_strandedness.fasta" || { echo "Files when using -s are different."; exit 1; }

0 comments on commit b68f1ed

Please sign in to comment.