-
Notifications
You must be signed in to change notification settings - Fork 6
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
- Loading branch information
Showing
1 changed file
with
93 additions
and
217 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -1,217 +1,93 @@ | ||
name: bcftools_stats | ||
namespace: bcftools | ||
description: | | ||
Parses VCF or BCF and produces a txt stats file which can be plotted using plot-vcfstats. | ||
When two files are given, the program generates separate stats for intersection | ||
and the complements. By default only sites are compared, -s/-S must be given to include | ||
also sample columns. | ||
keywords: [Stats, VCF, BCF] | ||
links: | ||
homepage: https://samtools.github.io/bcftools/ | ||
documentation: https://samtools.github.io/bcftools/bcftools.html#stats | ||
repository: https://github.com/samtools/bcftools | ||
issue_tracker: https://github.com/samtools/bcftools/issues | ||
references: | ||
doi: https://doi.org/10.1093/gigascience/giab008 | ||
license: MIT/Expat, GNU | ||
requirements: | ||
commands: [bcftools] | ||
authors: | ||
- __merge__: /src/_authors/theodoro_gasperin.yaml | ||
roles: [ author, maintainer ] | ||
|
||
argument_groups: | ||
- name: Inputs | ||
arguments: | ||
- name: --input | ||
alternatives: -i | ||
type: file | ||
multiple: true | ||
description: Input VCF/BCF file. Maximum of two files. | ||
required: true | ||
|
||
- name: Outputs | ||
arguments: | ||
- name: --output | ||
alternatives: -o | ||
direction: output | ||
type: file | ||
description: Output txt statistics file. | ||
required: true | ||
|
||
- name: Options | ||
arguments: | ||
|
||
- name: --allele_frequency_bins | ||
alternatives: --af_bins | ||
type: string | ||
description: | | ||
Allele frequency bins, a list (0.1,0.5,1) or a file (0.1\n0.5\n1). | ||
|
||
- name: --allele_frequency_tag | ||
alternatives: --af_tag | ||
type: string | ||
description: | | ||
Allele frequency tag to use, by default estimated from AN,AC or GT. | ||
|
||
- name: --first_allele_only | ||
alternatives: --first_only | ||
type: boolean_true | ||
description: | | ||
Include only 1st allele at multiallelic sites | ||
|
||
- name: --collapse | ||
alternatives: --c | ||
type: string | ||
choices: [ snps, indels, both, all, some, none ] | ||
description: | | ||
Treat as identical records with <snps|indels|both|all|some|none>. | ||
See https://samtools.github.io/bcftools/bcftools.html#common_options for details. | ||
|
||
- name: --depth | ||
alternatives: --d | ||
type: string | ||
description: | | ||
Depth distribution: min,max,bin size [0,500,1] | ||
example: 0,500,1 | ||
|
||
- name: --exclude | ||
alternatives: --e | ||
type: string | ||
description: | | ||
Exclude sites for which the expression is true. | ||
See https://samtools.github.io/bcftools/bcftools.html#expressions for details. | ||
|
||
- name: --exons | ||
alternatives: --E | ||
type: file | ||
description: | | ||
Tab-delimited file with exons for indel frameshift statistics. | ||
The columns of the file are CHR, FROM, TO, with 1-based, inclusive, positions. | ||
The file is BGZF-compressed and indexed with tabix | ||
e.g. | ||
tabix -s1 -b2 -e3 file.gz | ||
|
||
- name: --apply_filters | ||
alternatives: --f | ||
type: string | ||
description: | | ||
Require at least one of the listed FILTER strings (e.g. "PASS,.") | ||
|
||
- name: --fasta_reference | ||
alternatives: --F | ||
type: file | ||
description: | | ||
Faidx indexed reference sequence file to determine INDEL context | ||
|
||
- name: --include | ||
alternatives: --i | ||
type: string | ||
description: | | ||
Select sites for which the expression is true. | ||
See https://samtools.github.io/bcftools/bcftools.html#expressions for details. | ||
|
||
- name: --split_by_ID | ||
alternatives: --I | ||
type: boolean_true | ||
description: | | ||
Collect stats for sites with ID separately (known vs novel) | ||
|
||
- name: --regions | ||
alternatives: --r | ||
type: string | ||
description: | | ||
Restrict to comma-separated list of regions | ||
|
||
- name: --regions_file | ||
alternatives: --R | ||
type: file | ||
description: | | ||
Restrict to regions listed in a file. | ||
|
||
- name: --regions_overlap | ||
type: string | ||
choices: ['pos', 'record', 'variant', '0', '1', '2'] | ||
description: | | ||
This option controls how overlapping records are determined: | ||
set to 'pos' or '0' if the VCF record has to have POS inside a region (this corresponds to the default behavior of -t/-T); | ||
set to 'record' or '1' if also overlapping records with POS outside a region should be included (this is the default behavior of -r/-R, | ||
and includes indels with POS at the end of a region, which are technically outside the region); | ||
or set to 'variant' or '2' to include only true overlapping variation (compare the full VCF representation "TA>T-" vs the true sequence variation "A>-"). | ||
|
||
- name: --samples | ||
alternatives: --s | ||
type: string | ||
description: | | ||
List of samples for sample stats, "-" to include all samples. | ||
|
||
- name: --samples_file | ||
alternatives: --S | ||
type: file | ||
description: | | ||
File of samples to include. | ||
|
||
- name: --targets | ||
alternatives: --t | ||
type: string | ||
description: | | ||
Similar to -r, --regions, but the next position is accessed by streaming the whole VCF/BCF | ||
rather than using the tbi/csi index. Both -r and -t options can be applied simultaneously: -r uses the | ||
index to jump to a region and -t discards positions which are not in the targets. Unlike -r, targets | ||
can be prefixed with "^" to request logical complement. For example, "^X,Y,MT" indicates that | ||
sequences X, Y and MT should be skipped. Yet another difference between the -t/-T and -r/-R is | ||
that -r/-R checks for proper overlaps and considers both POS and the end position of an indel, | ||
while -t/-T considers the POS coordinate only (by default; see also --regions-overlap and --targets-overlap). | ||
Note that -t cannot be used in combination with -T. | ||
|
||
- name: --targets_file | ||
alternatives: --T | ||
type: file | ||
description: | | ||
Similar to -R but streams rather than index-jumps. | ||
|
||
- name: --targets_overlaps | ||
type: string | ||
choices: ['pos', 'record', 'variant', '0', '1', '2'] | ||
description: | | ||
Include if POS in the region (0), record overlaps (1), variant overlaps (2). | ||
|
||
- name: --user_tstv | ||
alternatives: --u | ||
type: string | ||
description: | | ||
Collect Ts/Tv stats for any tag using the given binning [0:1:100]. | ||
A subfield can be selected as e.g. 'PV4[0]', here the first value of the PV4 tag. | ||
|
||
- name: --verbose | ||
alternatives: --v | ||
type: boolean_true | ||
description: | | ||
Produce verbose per-site and per-sample output. | ||
|
||
resources: | ||
- type: bash_script | ||
path: script.sh | ||
|
||
test_resources: | ||
- type: bash_script | ||
path: test.sh | ||
|
||
engines: | ||
- type: docker | ||
image: debian:stable-slim | ||
setup: | ||
- type: apt | ||
packages: [bcftools, procps] | ||
- type: docker | ||
run: | | ||
echo "bcftools: \"$(bcftools --version | grep 'bcftools' | sed -n 's/^bcftools //p')\"" > /var/software_versions.txt | ||
test_setup: | ||
- type: apt | ||
packages: [tabix] | ||
|
||
runners: | ||
- type: executable | ||
- type: nextflow | ||
|
||
|
||
#!/bin/bash | ||
|
||
## VIASH START | ||
## VIASH END | ||
|
||
# Exit on error | ||
set -eo pipefail | ||
|
||
#test_data="$meta_resources_dir/test_data" | ||
|
||
############################################# | ||
# helper functions | ||
assert_file_exists() { | ||
[ -f "$1" ] || { echo "File '$1' does not exist" && exit 1; } | ||
} | ||
assert_file_not_empty() { | ||
[ -s "$1" ] || { echo "File '$1' is empty but shouldn't be" && exit 1; } | ||
} | ||
assert_file_contains() { | ||
grep -q "$2" "$1" || { echo "File '$1' does not contain '$2'" && exit 1; } | ||
} | ||
# Assert that two files have identical content.
#   $1, $2 - paths of the two files to compare (invoked as: diff -a "$2" "$1")
# When the files differ, diff's output is printed, followed by
# "Files are not identical!", and the calling shell exits with status 1.
assert_identical_content() {
  # Use a brace group rather than a subshell: `exit 1` inside ( ... ) would
  # only leave the subshell, so the script would keep running unless `set -e`
  # happened to be active. This matches the other assert_* helpers.
  diff -a "$2" "$1" \
    || { echo "Files are not identical!" && exit 1; }
}
############################################# | ||
|
||
# Create directories for tests | ||
echo "Creating Test Data..." | ||
TMPDIR=$(mktemp -d "$meta_temp_dir/XXXXXX") | ||
function clean_up { | ||
[[ -d "$TMPDIR" ]] && rm -r "$TMPDIR" | ||
} | ||
trap clean_up EXIT | ||
|
||
# Create test data | ||
cat <<EOF > "$TMPDIR/example.vcf" | ||
##fileformat=VCFv4.0 | ||
##fileDate=20090805 | ||
##source=myImputationProgramV3.1 | ||
##reference=1000GenomesPilot-NCBI36 | ||
##contig=<ID=19,length=58617616> | ||
##contig=<ID=20,length=58617616> | ||
##phasing=partial | ||
##INFO=<ID=NS,Number=1,Type=Integer,Description="Number of Samples With Data"> | ||
##INFO=<ID=AN,Number=1,Type=Integer,Description="Total number of alleles in called genotypes"> | ||
##INFO=<ID=AC,Number=.,Type=Integer,Description="Allele count in genotypes, for each ALT allele, in the same order as listed"> | ||
##INFO=<ID=DP,Number=1,Type=Integer,Description="Total Depth"> | ||
##INFO=<ID=AF,Number=.,Type=Float,Description="Allele Frequency"> | ||
##INFO=<ID=AA,Number=1,Type=String,Description="Ancestral Allele"> | ||
##INFO=<ID=DB,Number=0,Type=Flag,Description="dbSNP membership, build 129"> | ||
##INFO=<ID=H2,Number=0,Type=Flag,Description="HapMap2 membership"> | ||
##FILTER=<ID=q10,Description="Quality below 10"> | ||
##FILTER=<ID=s50,Description="Less than 50% of samples have data"> | ||
##FORMAT=<ID=GT,Number=1,Type=String,Description="Genotype"> | ||
##FORMAT=<ID=GQ,Number=1,Type=Integer,Description="Genotype Quality"> | ||
##FORMAT=<ID=DP,Number=1,Type=Integer,Description="Read Depth"> | ||
##FORMAT=<ID=HQ,Number=2,Type=Integer,Description="Haplotype Quality"> | ||
##ALT=<ID=DEL:ME:ALU,Description="Deletion of ALU element"> | ||
##ALT=<ID=CNV,Description="Copy number variable region"> | ||
#CHROM POS ID REF ALT QUAL FILTER INFO FORMAT NA00001 NA00002 NA00003 | ||
19 111 . A C 9.6 . . GT:HQ 0|0:10,10 0|0:10,10 0/1:3,3 | ||
19 112 . A G 10 . . GT:HQ 0|0:10,10 0|0:10,10 0/1:3,3 | ||
20 14370 rs6054257 G A 29 PASS NS=3;DP=14;AF=0.5;DB;H2 GT:GQ:DP:HQ 0|0:48:1:51,51 1|0:48:8:51,51 1/1:43:5:.,. | ||
20 17330 . T A 3 q10 NS=3;DP=11;AF=0.017 GT:GQ:DP:HQ 0|0:49:3:58,50 0|1:3:5:65,3 0/0:41:3:.,. | ||
20 1110696 rs6040355 A G,T 67 PASS NS=2;DP=10;AF=0.333,0.667;AA=T;DB GT:GQ:DP:HQ 1|2:21:6:23,27 2|1:2:0:18,2 2/2:35:4:.,. | ||
20 1230237 . T . 47 PASS NS=3;DP=13;AA=T GT:GQ:DP:HQ 0|0:54:.:56,60 0|0:48:4:51,51 0/0:61:2:.,. | ||
20 1234567 microsat1 G GA,GAC 50 PASS NS=3;DP=9;AA=G;AN=6;AC=3,1 GT:GQ:DP 0/1:.:4 0/2:17:2 1/1:40:3 | ||
20 1235237 . T . . . . GT 0/0 0|0 ./. | ||
EOF | ||
|
||
|
||
|
||
# Test 1: Default Use | ||
mkdir "$TMPDIR/test1" && pushd "$TMPDIR/test1" > /dev/null | ||
|
||
echo "> Run bcftools_stats on VCF file" | ||
"$meta_executable" \ | ||
--input "../example.vcf" \ | ||
--output "stats.txt" \ | ||
|
||
# checks | ||
assert_file_exists "stats.txt" | ||
assert_file_not_empty "stats.txt" | ||
assert_file_contains "stats.txt" "bcftools stats ../example.vcf" | ||
echo "- test1 succeeded -" | ||
|
||
popd > /dev/null | ||
|
||
|
||
|
||
echo "---- All tests succeeded! ----" | ||
exit 0 |