Skip to content

Commit

Permalink
Merge pull request #4 from viash-hub/add_bgzip
Browse files Browse the repository at this point in the history
Add bgzip
  • Loading branch information
Grifs authored Jan 30, 2024
2 parents ac13cf2 + 4a4ed79 commit 83ecebf
Show file tree
Hide file tree
Showing 4 changed files with 182 additions and 0 deletions.
124 changes: 124 additions & 0 deletions src/bgzip/config.vsh.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,124 @@
# Viash component wrapping htslib's `bgzip` block compression utility.
# The component always runs `bgzip -c`, so the input file is left untouched
# and the (de)compressed stream is redirected to `--output`.
functionality:
  name: bgzip
  description: Block compression/decompression utility
  info:
    homepage: https://www.htslib.org/
    documentation: https://www.htslib.org/doc/bgzip.html
    repository: https://github.com/samtools/htslib
    licence: MIT
  requirements:
    cpus: 1
    commands: [ bgzip ]
  argument_groups:
    - name: Inputs
      arguments:
        - name: --input
          type: file
          direction: input
          description: file to be compressed or decompressed
          required: true
    - name: Outputs
      arguments:
        - name: --output
          type: file
          direction: output
          description: compressed or decompressed output
          required: true
        - name: --index_name
          alternatives: -I
          type: file
          direction: output
          description: name of BGZF index file [file.gz.gzi]
    - name: Arguments
      arguments:
        # Named option (with leading dashes) so it matches bgzip's
        # `-b, --offset` flag instead of being a positional argument.
        - name: --offset
          alternatives: -b
          type: integer
          description: decompress at virtual file pointer (0-based uncompressed offset)
        - name: --decompress
          alternatives: -d
          type: boolean_true
          description: decompress the input file
        - name: --rebgzip
          alternatives: -g
          type: boolean_true
          description: use an index file to bgzip a file
        - name: --index
          alternatives: -i
          type: boolean_true
          description: compress and create BGZF index
        - name: --compress_level
          alternatives: -l
          type: integer
          description: compression level to use when compressing; 0 to 9, or -1 for default [-1]
          min: -1
          max: 9
        - name: --reindex
          alternatives: -r
          type: boolean_true
          description: (re)index compressed file
        - name: --size
          alternatives: -s
          type: integer
          description: decompress INT bytes (uncompressed size)
          min: 0
        - name: --test
          alternatives: -t
          type: boolean_true
          description: test integrity of compressed file
        - name: --binary
          type: boolean_true
          description: Don't align blocks with text lines
  resources:
    - type: bash_script
      text: |
        # Flags that are "false" are unset so that the `${var:+...}`
        # expansions below emit nothing for them.
        [[ "$par_decompress" == "false" ]] && unset par_decompress
        [[ "$par_rebgzip" == "false" ]] && unset par_rebgzip
        [[ "$par_index" == "false" ]] && unset par_index
        [[ "$par_reindex" == "false" ]] && unset par_reindex
        [[ "$par_test" == "false" ]] && unset par_test
        [[ "$par_binary" == "false" ]] && unset par_binary
        # Every declared argument is forwarded to bgzip; -c writes to
        # stdout, which is redirected to the requested output file.
        bgzip -c \
          ${meta_cpus:+--threads "${meta_cpus}"} \
          ${par_offset:+-b "${par_offset}"} \
          ${par_decompress:+-d} \
          ${par_rebgzip:+-g} \
          ${par_index:+-i} \
          ${par_index_name:+-I "${par_index_name}"} \
          ${par_compress_level:+-l "${par_compress_level}"} \
          ${par_reindex:+-r} \
          ${par_size:+-s "${par_size}"} \
          ${par_test:+-t} \
          ${par_binary:+--binary} \
          "$par_input" > "$par_output"
  test_resources:
    - type: bash_script
      text: |
        set -e
        # Round trip: compress the test VCF, decompress it again and
        # check that the result is identical to the original.
        "$meta_executable" --input "$meta_resources_dir/test_data/test.vcf" --output "test.vcf.gz"
        echo ">> Checking output of compressing"
        [ ! -f "test.vcf.gz" ] && echo "Output file test.vcf.gz does not exist" && exit 1
        "$meta_executable" --input "test.vcf.gz" --output "test.vcf" --decompress
        echo ">> Checking output of decompressing"
        [ ! -f "test.vcf" ] && echo "Output file test.vcf does not exist" && exit 1
        echo ">> Checking original and decompressed files are the same"
        # A command tested by `if` does not trigger `set -e`, so no
        # `set +e` / `set -e` toggling is needed around cmp.
        if ! cmp --silent -- "$meta_resources_dir/test_data/test.vcf" "test.vcf"; then
          echo "files are different"
          exit 1
        fi
        echo "> Test successful"
    - type: file
      path: test_data

platforms:
  - type: docker
    image: quay.io/biocontainers/htslib:1.19--h81da01d_0
    setup:
      - type: docker
        # Record the bgzip version under the tool's own name; stderr is
        # merged on the bgzip side so the help text is captured either way.
        run: |
          bgzip -h 2>&1 | grep 'Version:' | sed 's/Version:\s\(.*\)/bgzip: "\1"/' > /var/software_versions.txt
  - type: nextflow
22 changes: 22 additions & 0 deletions src/bgzip/help.txt
Original file line number Diff line number Diff line change
@@ -0,0 +1,22 @@
```bash
bgzip -h
```

Version: 1.19
Usage: bgzip [OPTIONS] [FILE] ...
Options:
-b, --offset INT decompress at virtual file pointer (0-based uncompressed offset)
-c, --stdout write on standard output, keep original files unchanged
-d, --decompress decompress
-f, --force overwrite files without asking
-g, --rebgzip use an index file to bgzip a file
-h, --help give this help
-i, --index compress and create BGZF index
-I, --index-name FILE name of BGZF index file [file.gz.gzi]
-k, --keep don't delete input files during operation
-l, --compress-level INT Compression level to use when compressing; 0 to 9, or -1 for default [-1]
-r, --reindex (re)index compressed file
-s, --size INT decompress INT bytes (uncompressed size)
-t, --test test integrity of compressed file
--binary Don't align blocks with text lines
-@, --threads INT number of compression threads to use [1]
13 changes: 13 additions & 0 deletions src/bgzip/test_data/README.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,13 @@
# bgzip test data

Test data was obtained from https://github.com/snakemake/snakemake-wrappers/tree/master/bio/bgzip/test.

__author__ = "William Rowell"
__copyright__ = "Copyright 2020, William Rowell"
__email__ = "[email protected]"
__license__ = "MIT"

```bash
git clone --depth 1 --single-branch --branch master https://github.com/snakemake/snakemake-wrappers /tmp/snakemake-wrappers
cp -r /tmp/snakemake-wrappers/bio/bgzip/test/* src/bgzip/test_data
```
23 changes: 23 additions & 0 deletions src/bgzip/test_data/test.vcf
Original file line number Diff line number Diff line change
@@ -0,0 +1,23 @@
##fileformat=VCFv4.0
##fileDate=20090805
##source=https://www.internationalgenome.org/wiki/Analysis/vcf4.0/
##reference=1000GenomesPilot-NCBI36
##phasing=partial
##INFO=<ID=NS,Number=1,Type=Integer,Description="Number of Samples With Data">
##INFO=<ID=DP,Number=1,Type=Integer,Description="Total Depth">
##INFO=<ID=AF,Number=.,Type=Float,Description="Allele Frequency">
##INFO=<ID=AA,Number=1,Type=String,Description="Ancestral Allele">
##INFO=<ID=DB,Number=0,Type=Flag,Description="dbSNP membership, build 129">
##INFO=<ID=H2,Number=0,Type=Flag,Description="HapMap2 membership">
##FILTER=<ID=q10,Description="Quality below 10">
##FILTER=<ID=s50,Description="Less than 50% of samples have data">
##FORMAT=<ID=GT,Number=1,Type=String,Description="Genotype">
##FORMAT=<ID=GQ,Number=1,Type=Integer,Description="Genotype Quality">
##FORMAT=<ID=DP,Number=1,Type=Integer,Description="Read Depth">
##FORMAT=<ID=HQ,Number=2,Type=Integer,Description="Haplotype Quality">
#CHROM POS ID REF ALT QUAL FILTER INFO FORMAT NA00001 NA00002 NA00003
20 14370 rs6054257 G A 29 PASS NS=3;DP=14;AF=0.5;DB;H2 GT:GQ:DP:HQ 0|0:48:1:51,51 1|0:48:8:51,51 1/1:43:5:.,.
20 17330 . T A 3 q10 NS=3;DP=11;AF=0.017 GT:GQ:DP:HQ 0|0:49:3:58,50 0|1:3:5:65,3 0/0:41:3
20 1110696 rs6040355 A G,T 67 PASS NS=2;DP=10;AF=0.333,0.667;AA=T;DB GT:GQ:DP:HQ 1|2:21:6:23,27 2|1:2:0:18,2 2/2:35:4
20 1230237 . T . 47 PASS NS=3;DP=13;AA=T GT:GQ:DP:HQ 0|0:54:7:56,60 0|0:48:4:51,51 0/0:61:2
20 1234567 microsat1 GTCT G,GTACT 50 PASS NS=3;DP=9;AA=G GT:GQ:DP 0/1:35:4 0/2:17:2 1/1:40:3

0 comments on commit 83ecebf

Please sign in to comment.