-
Notifications
You must be signed in to change notification settings - Fork 6
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
* FEAT: add bedtools getfasta. * Add PR number to CHANGELOG
- Loading branch information
1 parent
9d74c3b
commit b68f1ed
Showing
4 changed files
with
247 additions
and
0 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,103 @@ | ||
name: bedtools_getfasta | ||
namespace: bedtools | ||
description: Extract sequences from a FASTA file for each of the intervals defined in a BED/GFF/VCF file. | ||
keywords: [sequencing, fasta, BED, GFF, VCF] | ||
links: | ||
documentation: https://bedtools.readthedocs.io/en/latest/content/tools/getfasta.html | ||
repository: https://github.com/arq5x/bedtools2 | ||
references: | ||
doi: 10.1093/bioinformatics/btq033 | ||
license: GPL-2.0 | ||
requirements: | ||
commands: [bedtools] | ||
|
||
argument_groups: | ||
- name: Input arguments | ||
arguments: | ||
- name: --input_fasta | ||
type: file | ||
description: | | ||
FASTA file containing sequences for each interval specified in the input BED file. | ||
The headers in the input FASTA file must exactly match the chromosome column in the BED file. | ||
- name: "--input_bed" | ||
type: file | ||
description: | | ||
BED file containing intervals to extract from the FASTA file. | ||
BED files containing a single region require a newline character | ||
at the end of the line, otherwise a blank output file is produced. | ||
- name: --rna | ||
type: boolean_true | ||
description: | | ||
The FASTA is RNA not DNA. Reverse complementation handled accordingly. | ||
- name: Run arguments | ||
arguments: | ||
- name: "--strandedness" | ||
type: boolean_true | ||
alternatives: ["-s"] | ||
description: | | ||
Force strandedness. If the feature occupies the antisense strand, the output sequence will | ||
be reverse complemented. By default strandedness is not taken into account. | ||
- name: Output arguments | ||
arguments: | ||
- name: --output | ||
alternatives: [-o] | ||
required: true | ||
type: file | ||
direction: output | ||
description: | | ||
Output file where the output from the 'bedtools getfasta' commend will | ||
be written to. | ||
- name: --tab | ||
type: boolean_true | ||
description: | | ||
Report extract sequences in a tab-delimited format instead of in FASTA format. | ||
- name: --bed_out | ||
type: boolean_true | ||
description: | | ||
Report extract sequences in a tab-delimited BED format instead of in FASTA format. | ||
- name: "--name" | ||
type: boolean_true | ||
description: | | ||
Set the FASTA header for each extracted sequence to be the "name" and coordinate columns from the BED feature. | ||
- name: "--name_only" | ||
type: boolean_true | ||
description: | | ||
Set the FASTA header for each extracted sequence to be the "name" columns from the BED feature. | ||
- name: "--split" | ||
type: boolean_true | ||
description: | | ||
When --input is in BED12 format, create a separate fasta entry for each block in a BED12 record, | ||
blocks being described in the 11th and 12th column of the BED. | ||
- name: "--full_header" | ||
type: boolean_true | ||
description: | | ||
Use full fasta header. By default, only the word before the first space or tab is used. | ||
# Arguments not taken into account: | ||
# | ||
# -fo [Specify an output file name. By default, output goes to stdout. | ||
# | ||
|
||
resources: | ||
- type: bash_script | ||
path: script.sh | ||
|
||
test_resources: | ||
- type: bash_script | ||
path: test.sh | ||
|
||
engines: | ||
- type: docker | ||
image: debian:stable-slim | ||
setup: | ||
- type: apt | ||
packages: [bedtools, procps] | ||
- type: docker | ||
run: | | ||
echo "bedtools: \"$(bedtools --version | sed -n 's/^bedtools //p')\"" > /var/software_versions.txt | ||
runners: | ||
- type: executable | ||
- type: nextflow |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,22 @@ | ||
#!/usr/bin/env bash | ||
set -eo pipefail | ||
|
||
unset_if_false=( par_rna par_strandedness par_tab par_bed_out par_name par_name_only par_split par_full_header ) | ||
|
||
for par in ${unset_if_false[@]}; do | ||
test_val="${!par}" | ||
[[ "$test_val" == "false" ]] && unset $par | ||
done | ||
|
||
bedtools getfasta \ | ||
-fi "$par_input_fasta" \ | ||
-bed "$par_input_bed" \ | ||
${par_rna:+-rna} \ | ||
${par_name:+-name} \ | ||
${par_name_only:+-nameOnly} \ | ||
${par_tab:+-tab} \ | ||
${par_bed_out:+-bedOut} \ | ||
${par_strandedness:+-s} \ | ||
${par_split:+-split} \ | ||
${par_full_header:+-fullHeader} > "$par_output" | ||
|
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,119 @@ | ||
#!/usr/bin/env bash | ||
set -eo pipefail | ||
|
||
TMPDIR=$(mktemp -d) | ||
function clean_up { | ||
[[ -d "$TMPDIR" ]] && rm -r "$TMPDIR" | ||
} | ||
trap clean_up EXIT | ||
|
||
# Create dummy test fasta file | ||
cat > "$TMPDIR/test.fa" <<EOF | ||
>chr1 | ||
AAAAAAAACCCCCCCCCCCCCGCTACTGGGGGGGGGGGGGGGGGG | ||
EOF | ||
|
||
TAB="$(printf '\t')" | ||
|
||
# Create dummy bed file | ||
cat > "$TMPDIR/test.bed" <<EOF | ||
chr1${TAB}5${TAB}10${TAB}myseq | ||
EOF | ||
|
||
# Create expected bed file | ||
cat > "$TMPDIR/expected.fasta" <<EOF | ||
>chr1:5-10 | ||
AAACC | ||
EOF | ||
|
||
"$meta_executable" \ | ||
--input_bed "$TMPDIR/test.bed" \ | ||
--input_fasta "$TMPDIR/test.fa" \ | ||
--output "$TMPDIR/output.fasta" | ||
|
||
cmp --silent "$TMPDIR/output.fasta" "$TMPDIR/expected.fasta" || { echo "files are different:"; exit 1; } | ||
|
||
|
||
# Create expected bed file for --name | ||
cat > "$TMPDIR/expected_with_name.fasta" <<EOF | ||
>myseq::chr1:5-10 | ||
AAACC | ||
EOF | ||
|
||
"$meta_executable" \ | ||
--input_bed "$TMPDIR/test.bed" \ | ||
--input_fasta "$TMPDIR/test.fa" \ | ||
--name \ | ||
--output "$TMPDIR/output_with_name.fasta" | ||
|
||
|
||
cmp --silent "$TMPDIR/output_with_name.fasta" "$TMPDIR/expected_with_name.fasta" || { echo "Files when using --name are different."; exit 1; } | ||
|
||
# Create expected bed file for --name_only | ||
cat > "$TMPDIR/expected_with_name_only.fasta" <<EOF | ||
>myseq | ||
AAACC | ||
EOF | ||
|
||
"$meta_executable" \ | ||
--input_bed "$TMPDIR/test.bed" \ | ||
--input_fasta "$TMPDIR/test.fa" \ | ||
--name_only \ | ||
--output "$TMPDIR/output_with_name_only.fasta" | ||
|
||
cmp --silent "$TMPDIR/output_with_name_only.fasta" "$TMPDIR/expected_with_name_only.fasta" || { echo "Files when using --name_only are different."; exit 1; } | ||
|
||
|
||
# Create expected tab-delimited file for --tab | ||
cat > "$TMPDIR/expected_tab.out" <<EOF | ||
myseq${TAB}AAACC | ||
EOF | ||
|
||
"$meta_executable" \ | ||
--input_bed "$TMPDIR/test.bed" \ | ||
--input_fasta "$TMPDIR/test.fa" \ | ||
--name_only \ | ||
--tab \ | ||
--output "$TMPDIR/tab.out" | ||
|
||
cmp --silent "$TMPDIR/expected_tab.out" "$TMPDIR/tab.out" || { echo "Files when using --tab are different."; exit 1; } | ||
|
||
|
||
# Create expected tab-delimited file for --bed_out | ||
cat > "$TMPDIR/expected.bed" <<EOF | ||
chr1${TAB}5${TAB}10${TAB}myseq${TAB}AAACC | ||
EOF | ||
|
||
"$meta_executable" \ | ||
--input_bed "$TMPDIR/test.bed" \ | ||
--input_fasta "$TMPDIR/test.fa" \ | ||
--bed_out \ | ||
--output "$TMPDIR/output.bed" | ||
|
||
|
||
cmp --silent "$TMPDIR/expected.bed" "$TMPDIR/output.bed" || { echo "Files when using --bed_out are different."; exit 1; } | ||
|
||
# Create dummy bed file for strandedness | ||
cat > "$TMPDIR/test_strandedness.bed" <<EOF | ||
chr1${TAB}20${TAB}25${TAB}forward${TAB}1${TAB}+ | ||
chr1${TAB}20${TAB}25${TAB}reverse${TAB}1${TAB}- | ||
EOF | ||
|
||
# Create expected tab-delimited file for --bed_out | ||
cat > "$TMPDIR/expected_strandedness.fasta" <<EOF | ||
>forward(+) | ||
CGCTA | ||
>reverse(-) | ||
TAGCG | ||
EOF | ||
|
||
"$meta_executable" \ | ||
--input_bed "$TMPDIR/test_strandedness.bed" \ | ||
--input_fasta "$TMPDIR/test.fa" \ | ||
-s \ | ||
--name_only \ | ||
--output "$TMPDIR/output_strandedness.fasta" | ||
|
||
|
||
cmp --silent "$TMPDIR/expected_strandedness.fasta" "$TMPDIR/output_strandedness.fasta" || { echo "Files when using -s are different."; exit 1; } | ||
|