-
Notifications
You must be signed in to change notification settings - Fork 6
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
* Initial Commit * config and help.txt * script.sh * test template * More tests and debugging * test 5 and 6 * test 7, 8, 9 * Update test.sh * fixing bug on config * Changelog * Update config.vsh.yaml * Requested changes * Bug fixing --------- Co-authored-by: Jakub Majercik <[email protected]>
- Loading branch information
1 parent
c3ba4a7
commit dc7b33d
Showing
5 changed files
with
516 additions
and
0 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,194 @@ | ||
name: bcftools_norm | ||
namespace: bcftools | ||
description: | | ||
Left-align and normalize indels, check if REF alleles match the reference, split multiallelic sites into multiple rows; | ||
recover multiallelics from multiple rows. | ||
keywords: [Normalize, VCF, BCF] | ||
links: | ||
homepage: https://samtools.github.io/bcftools/ | ||
documentation: https://samtools.github.io/bcftools/bcftools.html#norm | ||
repository: https://github.com/samtools/bcftools | ||
issue_tracker: https://github.com/samtools/bcftools/issues | ||
references: | ||
doi: https://doi.org/10.1093/gigascience/giab008 | ||
license: MIT/Expat, GNU | ||
requirements: | ||
commands: [bcftools] | ||
authors: | ||
- __merge__: /src/_authors/theodoro_gasperin.yaml | ||
roles: [author] | ||
|
||
argument_groups: | ||
- name: Inputs | ||
arguments: | ||
- name: --input | ||
alternatives: -i | ||
type: file | ||
description: Input VCF/BCF file. | ||
required: true | ||
|
||
- name: Outputs | ||
arguments: | ||
- name: --output | ||
alternatives: -o | ||
direction: output | ||
type: file | ||
description: Output normalized VCF/BCF file. | ||
required: true | ||
|
||
- name: Options | ||
arguments: | ||
|
||
- name: --atomize | ||
alternatives: -a | ||
type: boolean_true | ||
description: | | ||
Decompose complex variants (e.g., MNVs become consecutive SNVs). | ||
- name: --atom_overlaps | ||
type: string | ||
choices: [".", "*"] | ||
description: | | ||
Use the star allele (*) for overlapping alleles or set to missing (.). | ||
- name: --check_ref | ||
alternatives: -c | ||
type: string | ||
choices: ['e', 'w', 'x', 's'] | ||
description: | | ||
Check REF alleles and exit (e), warn (w), exclude (x), or set (s) bad sites. | ||
- name: --remove_duplicates | ||
alternatives: -d | ||
type: string | ||
choices: ['snps', 'indels', 'both', 'all', 'exact', 'none'] | ||
description: Remove duplicate snps, indels, both, all, exact matches, or none (old -D option). | ||
|
||
- name: --fasta_ref | ||
alternatives: -f | ||
type: file | ||
description: Reference fasta sequence file. | ||
|
||
- name: --force | ||
type: boolean_true | ||
description: | | ||
Try to proceed even if malformed tags are encountered. | ||
Experimental, use at your own risk. | ||
- name: --keep_sum | ||
type: string | ||
description: | | ||
Keep vector sum constant when splitting multiallelics (see github issue #360). | ||
- name: --multiallelics | ||
alternatives: -m | ||
type: string | ||
choices: ['+snps', '+indels', '+both', '+any', '-snps', '-indels', '-both', '-any'] | ||
description: | | ||
Split multiallelics (-) or join biallelics (+), type: snps, indels, both, any [default: both]. | ||
- name: --no_version | ||
type: boolean_true | ||
description: Do not append version and command line information to the header. | ||
|
||
- name: --do_not_normalize | ||
alternatives: -N | ||
type: boolean_true | ||
description: Do not normalize indels (with -m or -c s). | ||
|
||
- name: --output_type | ||
alternatives: -O | ||
type: string | ||
choices: ['u', 'z', 'b', 'v'] | ||
description: | | ||
Output type: | ||
u: uncompressed BCF | ||
z: compressed VCF | ||
b: compressed BCF | ||
v: uncompressed VCF | ||
- name: --old_rec_tag | ||
type: string | ||
description: Annotate modified records with INFO/STR indicating the original variant. | ||
|
||
- name: --regions | ||
alternatives: -r | ||
type: string | ||
description: | | ||
Restrict to comma-separated list of regions. | ||
Following formats are supported: chr|chr:pos|chr:beg-end|chr:beg-[,…]. | ||
example: '20:1000000-2000000' | ||
|
||
- name: --regions_file | ||
alternatives: -R | ||
type: file | ||
description: | | ||
Restrict to regions listed in a file. | ||
Regions can be specified either on a VCF, BED, or tab-delimited file (the default). | ||
For more information check manual. | ||
- name: --regions_overlap | ||
type: string | ||
choices: ['pos', 'record', 'variant', '0', '1', '2'] | ||
description: | | ||
This option controls how overlapping records are determined: | ||
set to 'pos' or '0' if the VCF record has to have POS inside a region (this corresponds to the default behavior of -t/-T); | ||
set to 'record' or '1' if also overlapping records with POS outside a region should be included (this is the default behavior of -r/-R, | ||
and includes indels with POS at the end of a region, which are technically outside the region); | ||
or set to 'variant' or '2' to include only true overlapping variation (compare the full VCF representation "TA>T-" vs the true sequence variation "A>-"). | ||
- name: --site_win | ||
alternatives: -w | ||
type: integer | ||
description: | | ||
Buffer for sorting lines that changed position during realignment. | ||
- name: --strict_filter | ||
alternatives: -s | ||
type: boolean_true | ||
description: When merging (-m+), merged site is PASS only if all sites being merged PASS. | ||
|
||
- name: --targets | ||
alternatives: -t | ||
type: string | ||
description: Similar to --regions but streams rather than index-jumps. | ||
example: '20:1000000-2000000' | ||
|
||
- name: --targets_file | ||
alternatives: -T | ||
type: file | ||
description: Similar to --regions_file but streams rather than index-jumps. | ||
|
||
- name: --targets_overlap | ||
type: string | ||
choices: ['pos', 'record', 'variant', '0', '1', '2'] | ||
description: | | ||
Include if POS in the region (0), record overlaps (1), variant overlaps (2). | ||
Similar to --regions_overlap. | ||
resources: | ||
- type: bash_script | ||
path: script.sh | ||
|
||
test_resources: | ||
- type: bash_script | ||
path: test.sh | ||
|
||
engines: | ||
- type: docker | ||
image: debian:stable-slim | ||
setup: | ||
- type: apt | ||
packages: [bcftools, procps] | ||
- type: docker | ||
run: | | ||
echo "bcftools: \"$(bcftools --version | grep 'bcftools' | sed -n 's/^bcftools //p')\"" > /var/software_versions.txt | ||
test_setup: | ||
- type: apt | ||
packages: [tabix] | ||
|
||
runners: | ||
- type: executable | ||
- type: nextflow | ||
|
||
|
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,41 @@ | ||
``` | ||
bcftools norm -h | ||
``` | ||
|
||
About: Left-align and normalize indels; check if REF alleles match the reference; | ||
split multiallelic sites into multiple rows; recover multiallelics from | ||
multiple rows. | ||
Usage: bcftools norm [options] <in.vcf.gz> | ||
|
||
Options: | ||
-a, --atomize Decompose complex variants (e.g. MNVs become consecutive SNVs) | ||
--atom-overlaps '*'|. Use the star allele (*) for overlapping alleles or set to missing (.) [*] | ||
-c, --check-ref e|w|x|s Check REF alleles and exit (e), warn (w), exclude (x), or set (s) bad sites [e] | ||
-D, --remove-duplicates Remove duplicate lines of the same type. | ||
-d, --rm-dup TYPE Remove duplicate snps|indels|both|all|exact | ||
-f, --fasta-ref FILE Reference sequence | ||
--force Try to proceed even if malformed tags are encountered. Experimental, use at your own risk | ||
--keep-sum TAG,.. Keep vector sum constant when splitting multiallelics (see github issue #360) | ||
-m, --multiallelics -|+TYPE Split multiallelics (-) or join biallelics (+), type: snps|indels|both|any [both] | ||
--no-version Do not append version and command line to the header | ||
-N, --do-not-normalize Do not normalize indels (with -m or -c s) | ||
--old-rec-tag STR Annotate modified records with INFO/STR indicating the original variant | ||
-o, --output FILE Write output to a file [standard output] | ||
-O, --output-type u|b|v|z[0-9] u/b: un/compressed BCF, v/z: un/compressed VCF, 0-9: compression level [v] | ||
-r, --regions REGION Restrict to comma-separated list of regions | ||
-R, --regions-file FILE Restrict to regions listed in a file | ||
--regions-overlap 0|1|2 Include if POS in the region (0), record overlaps (1), variant overlaps (2) [1] | ||
-s, --strict-filter When merging (-m+), merged site is PASS only if all sites being merged PASS | ||
-t, --targets REGION Similar to -r but streams rather than index-jumps | ||
-T, --targets-file FILE Similar to -R but streams rather than index-jumps | ||
--targets-overlap 0|1|2 Include if POS in the region (0), record overlaps (1), variant overlaps (2) [0] | ||
--threads INT Use multithreading with <int> worker threads [0] | ||
-w, --site-win INT Buffer for sorting lines which changed position during realignment [1000] | ||
|
||
Examples: | ||
# normalize and left-align indels | ||
bcftools norm -f ref.fa in.vcf | ||
|
||
# split multi-allelic sites | ||
bcftools norm -m- in.vcf | ||
|
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,49 @@ | ||
#!/bin/bash

## VIASH START
## VIASH END

# Wrapper around `bcftools norm`.
# Reads its settings from `par_*` variables (plus `meta_cpus`) and translates
# every variable that is set and non-empty into the matching bcftools option.
# NOTE(review): the variables are presumably injected by Viash between the
# VIASH START/END markers above -- confirm against the component config.

# Exit on error, including failures inside pipelines.
set -eo pipefail

# Flag-style parameters whose value is the literal string "false" are unset,
# so that the `${var:+...}` expansions below emit the flag only when it was
# actually requested.
unset_if_false=(
  par_atomize
  par_force
  par_no_version
  par_do_not_normalize
  par_strict_filter
)

for par in "${unset_if_false[@]}"; do
  test_val="${!par}"
  if [[ "$test_val" == "false" ]]; then
    unset "$par"
  fi
done

# Execute bcftools norm with the provided arguments.
# Each `${var:+opt "$var"}` expands to nothing when the variable is unset or
# empty; the inner quotes keep the value as a single word (and stop `*` from
# being glob-expanded). Output and input paths are quoted so that paths
# containing spaces or glob characters are passed through intact.
bcftools norm \
  ${par_atomize:+--atomize} \
  ${par_atom_overlaps:+--atom-overlaps "$par_atom_overlaps"} \
  ${par_check_ref:+-c "$par_check_ref"} \
  ${par_remove_duplicates:+-d "$par_remove_duplicates"} \
  ${par_fasta_ref:+-f "$par_fasta_ref"} \
  ${par_force:+--force} \
  ${par_keep_sum:+--keep-sum "$par_keep_sum"} \
  ${par_multiallelics:+-m "$par_multiallelics"} \
  ${par_no_version:+--no-version} \
  ${par_do_not_normalize:+-N} \
  ${par_old_rec_tag:+--old-rec-tag "$par_old_rec_tag"} \
  ${par_regions:+-r "$par_regions"} \
  ${par_regions_file:+-R "$par_regions_file"} \
  ${par_regions_overlap:+--regions-overlap "$par_regions_overlap"} \
  ${par_site_win:+-w "$par_site_win"} \
  ${par_strict_filter:+-s} \
  ${par_targets:+-t "$par_targets"} \
  ${par_targets_file:+-T "$par_targets_file"} \
  ${par_targets_overlap:+--targets-overlap "$par_targets_overlap"} \
  ${meta_cpus:+--threads "$meta_cpus"} \
  ${par_output_type:+-O "$par_output_type"} \
  -o "$par_output" \
  "$par_input"
|
Oops, something went wrong.