From 86333c1a465db45facd936695f1f33b186ccf0fc Mon Sep 17 00:00:00 2001 From: Suman Muralidharan <104161349+sumanm99@users.noreply.github.com> Date: Tue, 15 Oct 2024 23:46:17 +0530 Subject: [PATCH] SnpEff (#153) * Help file * config file * config file * runners script * config file * test script * test * test * runners script * snake case * snake case * output parameters * modify argument formatting, container setup * fix buf with mv command * avoid boolean_false and fix bug with output files --------- Co-authored-by: Emma Rousseau --- src/snpeff/config.vsh.yaml | 297 ++++++++++++++++++++++++ src/snpeff/help.txt | 79 +++++++ src/snpeff/script.sh | 148 ++++++++++++ src/snpeff/test.sh | 129 ++++++++++ src/snpeff/test_data/cancer.vcf | 2 + src/snpeff/test_data/my_annotations.bed | 1 + src/snpeff/test_data/script.sh | 15 ++ src/snpeff/test_data/test.vcf | 1 + 8 files changed, 672 insertions(+) create mode 100644 src/snpeff/config.vsh.yaml create mode 100644 src/snpeff/help.txt create mode 100644 src/snpeff/script.sh create mode 100644 src/snpeff/test.sh create mode 100644 src/snpeff/test_data/cancer.vcf create mode 100644 src/snpeff/test_data/my_annotations.bed create mode 100644 src/snpeff/test_data/script.sh create mode 100644 src/snpeff/test_data/test.vcf diff --git a/src/snpeff/config.vsh.yaml b/src/snpeff/config.vsh.yaml new file mode 100644 index 00000000..5fb8622d --- /dev/null +++ b/src/snpeff/config.vsh.yaml @@ -0,0 +1,297 @@ +name: snpeff +description: | + Genetic variant annotation, and functional effect prediction toolbox. + It annotates and predicts the effects of genetic variants on genes and + proteins (such as amino acid changes). +keywords: [ "annotation", "effect prediction", "snp", "variant", "vcf"] + +links: + repository: https://github.com/pcingola/SnpEff + homepage: https://pcingola.github.io/SnpEff/ + documentation: https://pcingola.github.io/SnpEff/ +references: + doi: 10.3389/fgene.2012.00035 +license: MIT +argument_groups: + - name: Inputs + arguments: + - name: --input + type: file + description: Input variants file. + example: test.vcf + required: true + - name: --genome_version + type: string + description: Reference genome version. + example: GRCh37.75 + required: true + - name: Outputs + arguments: + - name: --output + type: file + description: The output file. + example: out.vcf + direction: output + required: true + - name: --summary + type: file + description: Summary file directory. + example: summary_dir + direction: output + - name: --genes + type: file + description: Txt file directory. + example: genes_dir + direction: output + - name: Options + arguments: + - name: --chr + type: string + description: | + Prepend 'string' to chromosome name (e.g. 'chr1' instead of '1'). Only on TXT output. + - name: --classic + type: boolean_true + description: Use old style annotations instead of Sequence Ontology and Hgvs. + - name: --csv_stats + type: file + description: Create CSV summary file. + - name: --download + type: boolean_true + description: Download reference genome if not available. + - name: --input_format + alternatives: [-i] + type: string + description: | + Input format [ vcf, bed ]. Default: VCF. + example: "VCF" + - name: --file_list + type: boolean_true + description: Input actually contains a list of files to process. + - name: --output_format + alternatives: [-o] + type: string + description: | + Output format [ vcf, gatk, bed, bedAnn ]. Default: VCF. + example: "VCF" + - name: --stats + alternatives: [-s, --htmlStats] + type: boolean_true + description: Create HTML summary file. + - name: --no_stats + type: boolean_true + description: Do not create stats (summary) file. + - name: Results filter options + arguments: + - name: --fi + alternatives: [--filterInterval] + type: file + description: | + Only analyze changes that intersect with the intervals + specified in this file. This option can be used several times. + - name: --no_downstream + type: boolean_true + description: Do not show DOWNSTREAM changes + - name: --no_intergenic + type: boolean_true + description: Do not show INTERGENIC changes. + - name: --no_intron + type: boolean_true + description: Do not show INTRON changes. + - name: --no_upstream + type: boolean_true + description: Do not show UPSTREAM changes. + - name: --no_utr + type: boolean_true + description: Do not show 5_PRIME_UTR or 3_PRIME_UTR changes. + - name: --no + type: string + description: | + Do not show 'EffectType'. This option can be used several times. + - name: Annotations options + arguments: + - name: --cancer + type: boolean_true + description: Perform 'cancer' comparisons (Somatic vs Germline). + - name: --cancer_samples + type: file + description: Two column TXT file defining 'original \t derived' samples. + - name: --fastaprot + type: file + description: | + Create an output file containing the resulting protein sequences. + - name: --format_eff + type: boolean_true + description: | + Use 'EFF' field compatible with older versions (instead of 'ANN'). + - name: --gene_id + type: boolean_true + description: Use gene ID instead of gene name (VCF output). + - name: --hgvs + type: boolean_true + description: Use HGVS annotations for amino acid sub-field. + - name: --hgvs_old + type: boolean_true + description: Use old HGVS notation. + - name: --hgvs1_letter_aa + type: boolean_true + description: Use one letter Amino acid codes in HGVS notation. + - name: --hgvs_tr_id + type: boolean_true + description: Use transcript ID in HGVS notation. + - name: --lof + type: boolean_true + description: | + Add loss of function (LOF) and Nonsense mediated decay (NMD) tags. + - name: -no_hgvs + type: boolean_true + description: Do not add HGVS annotations. + - name: --no_lof + type: boolean_true + description: Do not add LOF and NMD annotations. + - name: --no_shift_hgvs + type: boolean_true + description: | + Do not shift variants according to HGVS notation (most 3prime end). + - name: --oicr + type: boolean_true + description: Add OICR tag in VCF file. + - name: --sequence_ontology + type: boolean_true + description: Use Sequence Ontology terms. + - name: Generic options + arguments: + - name: --config + alternatives: [-c] + type: file + description: Specify config file + - name: --config_option + type: string + description: Override a config file option (name=value). + - name: --debug + alternatives: [-d] + type: boolean_true + description: Debug mode (very verbose). + - name: --data_dir + type: file + description: Override data_dir parameter from config file. + - name: --no_download + type: boolean_true + description: Do not download a SnpEff database, if not available locally. + - name: --no_log + type: boolean_true + description: Do not report usage statistics to server. + - name: --quiet + alternatives: [-q] + type: boolean_true + description: Quiet mode (do not show any messages or errors) + - name: --verbose + alternatives: [-v] + type: boolean_true + description: Verbose mode. + - name: Database options + arguments: + - name: --canon + type: boolean_true + description: Only use canonical transcripts. + - name: --canon_list + type: file + description: | + Only use canonical transcripts, replace some transcripts using the 'gene_id + transcript_id' entries in . + - name: --tag + type: string + description: | + Only use transcript having a tag 'tagName'. This option can be used multiple times. + - name: --no_tag + type: boolean_true + description: | + Filter out transcript having a tag 'tagName'. This option can be used multiple times. + - name: --interaction + type: boolean_true + description: Annotate using interactions (requires interaction database). + - name: --interval + type: file + description: | + Use a custom intervals in TXT/BED/BigBed/VCF/GFF file (you may use this option many times). + - name: --max_tsl + type: integer + description: Only use transcripts having Transcript Support Level lower than . + - name: --motif + type: boolean_true + description: Annotate using motifs (requires Motif database). + - name: --nextprot + type: boolean_true + description: Annotate using NextProt (requires NextProt database). + - name: --no_genome + type: boolean_true + description: Do not load any genomic database (e.g. annotate using custom files). + - name: --no_expand_iub + type: boolean_true + description: Disable IUB code expansion in input variants. + - name: --no_interaction + type: boolean_true + description: Disable inteaction annotations. + - name: --no_motif + type: boolean_true + description: Disable motif annotations. + - name: --no_nextprot + type: boolean_true + description: Disable NextProt annotations. + - name: --only_reg + type: boolean_true + description: Only use regulation tracks. + - name: --only_protein + type: boolean_true + description: Only use protein coding transcripts. + - name: --only_tr + type: file + description: | + Only use the transcripts in this file. Format: One transcript ID per line. + example: file.txt + - name: --reg + type: string + description: Regulation track to use (this option can be used add several times). + - name: --ss + alternatives: [--spliceSiteSize] + type: integer + description: | + Set size for splice sites (donor and acceptor) in bases. Default: 2. + - name: --splice_region_exon_size + type: integer + description: | + Set size for splice site region within exons. Default: 3 bases. + - name: --splice_region_intron_min + type: integer + description: | + Set minimum number of bases for splice site region within intron. Default: 3 bases. + - name: --splice_region_intron_max + type: integer + description: | + Set maximum number of bases for splice site region within intron. Default: 8 bases. + - name: --strict + type: boolean_true + description: Only use 'validated' transcripts (i.e. sequence has been checked). + - name: --ud + alternatives: [--upDownStreamLen] + type: integer + description: Set upstream downstream interval length (in bases). +resources: + - type: bash_script + path: script.sh +test_resources: + - type: bash_script + path: test.sh + - type: file + path: test_data +engines: + - type: docker + image: quay.io/staphb/snpeff:5.2a + setup: + - type: docker + run: | + version=$(snpEff -version) && \ + version_trimmed=$(echo "$version" | awk '{print $1, $2}') && \ + echo "$version_trimmed" > /var/software_versions.txt +runners: + - type: executable + - type: nextflow \ No newline at end of file diff --git a/src/snpeff/help.txt b/src/snpeff/help.txt new file mode 100644 index 00000000..d1950220 --- /dev/null +++ b/src/snpeff/help.txt @@ -0,0 +1,79 @@ +Usage: snpEff [eff] [options] genome_version [input_file] + + variants_file : Default is STDIN + +Options: + -chr : Prepend 'string' to chromosome name (e.g. 'chr1' instead of '1'). Only on TXT output. + -classic : Use old style annotations instead of Sequence Ontology and Hgvs. + -csvStats : Create CSV summary file. + -download : Download reference genome if not available. Default: true + -i : Input format [ vcf, bed ]. Default: VCF. + -fileList : Input actually contains a list of files to process. + -o : Ouput format [ vcf, gatk, bed, bedAnn ]. Default: VCF. + -s , -stats, -htmlStats : Create HTML summary file. Default is 'snpEff_summary.html' + -noStats : Do not create stats (summary) file + +Results filter options: + -fi , -filterInterval : Only analyze changes that intersect with the intervals specified in this file (you may use this option many times) + -no-downstream : Do not show DOWNSTREAM changes + -no-intergenic : Do not show INTERGENIC changes + -no-intron : Do not show INTRON changes + -no-upstream : Do not show UPSTREAM changes + -no-utr : Do not show 5_PRIME_UTR or 3_PRIME_UTR changes + -no : Do not show 'EffectType'. This option can be used several times. + +Annotations options: + -cancer : Perform 'cancer' comparisons (Somatic vs Germline). Default: false + -cancerSamples : Two column TXT file defining 'oringinal \t derived' samples. + -fastaProt : Create an output file containing the resulting protein sequences. + -formatEff : Use 'EFF' field compatible with older versions (instead of 'ANN'). + -geneId : Use gene ID instead of gene name (VCF output). Default: false + -hgvs : Use HGVS annotations for amino acid sub-field. Default: true + -hgvsOld : Use old HGVS notation. Default: false + -hgvs1LetterAa : Use one letter Amino acid codes in HGVS notation. Default: false + -hgvsTrId : Use transcript ID in HGVS notation. Default: false + -lof : Add loss of function (LOF) and Nonsense mediated decay (NMD) tags. + -noHgvs : Do not add HGVS annotations. + -noLof : Do not add LOF and NMD annotations. + -noShiftHgvs : Do not shift variants according to HGVS notation (most 3prime end). + -oicr : Add OICR tag in VCF file. Default: false + -sequenceOntology : Use Sequence Ontology terms. Default: true + +Generic options: + -c , -config : Specify config file + -configOption name=value : Override a config file option + -d , -debug : Debug mode (very verbose). + -dataDir : Override data_dir parameter from config file. + -download : Download a SnpEff database, if not available locally. Default: true + -nodownload : Do not download a SnpEff database, if not available locally. + -h , -help : Show this help and exit + -noLog : Do not report usage statistics to server + -q , -quiet : Quiet mode (do not show any messages or errors) + -v , -verbose : Verbose mode + -version : Show version number and exit + +Database options: + -canon : Only use canonical transcripts. + -canonList : Only use canonical transcripts, replace some transcripts using the 'gene_id transcript_id' entries in . + -tag : Only use transcript having a tag 'tagName'. This option can be used multiple times. + -notag : Filter out transcript having a tag 'tagName'. This option can be used multiple times. + -interaction : Annotate using interactions (requires interaction database). Default: true + -interval : Use a custom intervals in TXT/BED/BigBed/VCF/GFF file (you may use this option many times) + -maxTSL : Only use transcripts having Transcript Support Level lower than . + -motif : Annotate using motifs (requires Motif database). Default: true + -nextProt : Annotate using NextProt (requires NextProt database). + -noGenome : Do not load any genomic database (e.g. annotate using custom files). + -noExpandIUB : Disable IUB code expansion in input variants + -noInteraction : Disable inteaction annotations + -noMotif : Disable motif annotations. + -noNextProt : Disable NextProt annotations. + -onlyReg : Only use regulation tracks. + -onlyProtein : Only use protein coding transcripts. Default: false + -onlyTr : Only use the transcripts in this file. Format: One transcript ID per line. + -reg : Regulation track to use (this option can be used add several times). + -ss , -spliceSiteSize : Set size for splice sites (donor and acceptor) in bases. Default: 2 + -spliceRegionExonSize : Set size for splice site region within exons. Default: 3 bases + -spliceRegionIntronMin : Set minimum number of bases for splice site region within intron. Default: 3 bases + -spliceRegionIntronMax : Set maximum number of bases for splice site region within intron. Default: 8 bases + -strict : Only use 'validated' transcripts (i.e. sequence has been checked). Default: false + -ud , -upDownStreamLen : Set upstream downstream interval length (in bases) \ No newline at end of file diff --git a/src/snpeff/script.sh b/src/snpeff/script.sh new file mode 100644 index 00000000..bf3914bb --- /dev/null +++ b/src/snpeff/script.sh @@ -0,0 +1,148 @@ +#!/bin/bash + +set -eo pipefail + +## VIASH START +## VIASH END + +# Unset flags if 'false' +unset_if_false=( + par_classic + par_download + par_file_list + par_stats + par_cancer + par_format_eff + par_gene_id + par_hgvs + par_hgvs_old + par_hgvs1_letter_aa + par_hgvs_tr_id + par_lof + par_oicr + par_sequence_ontology + par_debug + par_quiet + par_verbose + par_canon + par_interaction + par_motif + par_nextprot + par_only_reg + par_only_protein + par_strict + par_no_stats + par_no_downstream + par_no_intergenic + par_no_intron + par_no_upstream + par_no_utr + par_no_hgvs + par_no_lof + par_no_shift_hgvs + par_no_download + par_no_log + par_no_tag + par_no_genome + par_no_expand_iub + par_no_interaction + par_no_motif + par_no_nextprot +) +for par in ${unset_if_false[@]}; do + test_val="${!par}" # contains the value of the 'par' + [[ "$test_val" == "false" ]] && unset $par +done + + +# Run SnpEff +snpEff \ + ${par_chr:+-chr "$par_chr"} \ + ${par_classic:+-classic} \ + ${par_csv_stats:+-csvStats "$par_csv_stats"} \ + ${par_download:+-download} \ + ${par_input_format:+-i "$par_input_format"} \ + ${par_file_list:+-fileList} \ + ${par_output_format:+-o "$par_output_format"} \ + ${par_stats:+-stats} \ + ${par_no_stats:+-noStats} \ + ${par_fi:+-fi "$par_fi"} \ + ${par_no_downstream:+-no-downstream} \ + ${par_no_intergenic:+-no-intergenic} \ + ${par_no_intron:+-no-intron} \ + ${par_no_upstream:+-no-upstream} \ + ${par_no_utr:+-no-utr} \ + ${par_no:+-no "$par_no"} \ + ${par_cancer:+-cancer} \ + ${par_cancer_samples:+-cancerSamples "$par_cancer_samples]"} \ + ${par_fastaprot:+-fastaProt "$par_fastaprot]"} \ + ${par_format_eff:+-formatEff} \ + ${par_gene_id:+-geneId} \ + ${par_hgvs:+-hgvs} \ + ${par_hgvs_old:+-hgvsOld} \ + ${par_hgvs1_letter_aa:+-hgvs1LetterAa} \ + ${par_hgvs_tr_id:+-hgvsTrId} \ + ${par_lof:+-lof} \ + ${par_no_hgvs:+-noHgvs} \ + ${par_no_lof:+-noLof} \ + ${par_no_shift_hgvs:+-noShiftHgvs} \ + ${par_oicr:+-oicr} \ + ${par_sequence_ontology:+-sequenceOntology} \ + ${par_config:+-config "$par_config"} \ + ${par_config_option:+-configOption "$par_config_option"} \ + ${par_debug:+-debug} \ + ${par_data_dir:+-dataDir "$par_data_dir"} \ + ${par_no_download:+-nodownload} \ + ${par_no_log:+-noLog} \ + ${par_quiet:+-quiet} \ + ${par_verbose:+-verbose} \ + ${par_canon:+-canon} \ + ${par_canon_list:+-canonList "$par_canon_list"} \ + ${par_tag:+-tag "$par_tag"} \ + ${par_no_tag:+-notag} \ + ${par_interaction:+-interaction} \ + ${par_interval:+-interval "$par_interval"} \ + ${par_max_tsl:+-maxTSL "$par_max_tsl"} \ + ${par_motif:+-motif} \ + ${par_nextprot:+-nextProt} \ + ${par_no_genome:+-noGenome} \ + ${par_no_expand_iub:+-noExpandIUB} \ + ${par_no_interaction:+-noInteraction} \ + ${par_no_motif:+-noMotif} \ + ${par_no_nextprot:+-noNextProt} \ + ${par_only_reg:+-onlyReg} \ + ${par_only_protein:+-onlyProtein} \ + ${par_only_tr:+-onlyTr "$par_onlyTr"} \ + ${par_reg:+-reg "$par_reg"} \ + ${par_ss:+-ss "$par_ss"} \ + ${par_splice_region_exon_size:+-spliceRegionExonSize "$par_splice_region_exon_size"} \ + ${par_splice_region_intron_min:+-spliceRegionIntronMin "$par_splice_region_intron_min"} \ + ${par_splice_region_intron_max:+-spliceRegionIntronMax "$par_splice_region_intron_max"} \ + ${par_strict:+-strict} \ + ${par_ud:+-ud "$par_ud"} \ + "$par_genome_version" \ + "$par_input" \ + > "$par_output" + +# Path of the output file (par_output) +absolute_path=$(realpath "$par_output") +directory_path=$(dirname "$absolute_path") + +# Move the automatically generated outputs to their locations +if [ -z "$par_no_stats" ]; then + if [ ! -z "$par_summary" ]; then + mv -n snpEff_summary.html "$par_summary" + else + mv -n snpEff_summary.html "$directory_path" + fi +fi + +if [ -z "$par_no_stats" ]; then + if [ ! -z "$par_genes" ]; then + mv -n snpEff_genes.txt "$par_genes" + else + mv -n snpEff_genes.txt "$directory_path" + fi +fi + +exit 0 diff --git a/src/snpeff/test.sh b/src/snpeff/test.sh new file mode 100644 index 00000000..d8c72c20 --- /dev/null +++ b/src/snpeff/test.sh @@ -0,0 +1,129 @@ +#!/bin/bash + +set -eo pipefail + +## VIASH START +## VIASH END + +########################################################################### + +# Test 1: Run SnpEff with only required parameters + +mkdir test1 +pushd test1 > /dev/null # cd test1 (stack) + +echo "> Run Test 1: required parameters" +"$meta_executable" \ + --genome_version GRCh37.75 \ + --input "$meta_resources_dir/test_data/cancer.vcf" \ + --output out.vcf + +# Check if output files are generated +output_files=("out.vcf" "snpEff_genes.txt" "snpEff_summary.html") + +# Check if any of the files do not exist +for file in "${output_files[@]}"; do + if [ ! -e "$file" ]; then + echo "File $file does not exist." + fi +done + +# Check if files are empty +for file in "${output_files[@]}"; do + if [ ! -s "$file" ]; then + echo "File $file is empty." + fi +done + +popd > /dev/null # Remove directory from stack (LIFO) + +echo "Test 1 succeeded." + +########################################################################### + +# Test 2: Run SnpEff with a different input + options + +mkdir test2 +pushd test2 > /dev/null + +echo "> Run Test 2: different input + options" +"$meta_executable" \ + --genome_version GRCh37.75 \ + --input "$meta_resources_dir/test_data/test.vcf" \ + --interval "$meta_resources_dir/test_data/my_annotations.bed" \ + --no_stats \ + --output output.vcf + +# Check if output.vcf exists +if [ ! -e "output.vcf" ]; then + echo "File output.vcf does not exist." +fi + +# These files should not exist +files=("snpEff_genes.txt" "snpEff_summary.html") +for file in "${files[@]}"; do + if [ -e "$file" ]; then + echo "Error: File $file exists." + fi +done + +# Check if output.vcf is empty +if [ ! -s "output.vcf" ]; then + echo "File output.vcf is empty." +fi + +popd > /dev/null + +echo "Test 2 succeeded." + +########################################################################### + +# Test 3: Move the output files to other locations + +mkdir test3 +pushd test3 > /dev/null + +mkdir temp + +echo "> Run Test 3: move output files" +"$meta_executable" \ + --genome_version GRCh37.75 \ + --input "$meta_resources_dir/test_data/test.vcf" \ + --output output.vcf \ + --summary temp \ + --genes temp + +# Check if output.vcf exists +if [ ! -e "output.vcf" ]; then + echo "File output.vcf does not exist." +fi + +# Check if the other output files have been moved to temp folder +output_files=("snpEff_genes.txt" "snpEff_summary.html") + +# Check if any of the files do not exist +for file in "${output_files[@]}"; do + if [ ! -e "temp/$file" ]; then + echo "File $file does not exist in 'temp' folder." + fi +done + +# Check if output.vcf is empty +if [ ! -s "output.vcf" ]; then + echo "File output.vcf is empty." +fi + +# Check if the other output files in temp folder are empty +for file in "${output_files[@]}"; do + if [ ! -s "temp/$file" ]; then + echo "File $file is empty." + fi +done + +popd > /dev/null + +echo "Test 3 succeeded." + +########################################################################### + +echo "All tests successfully completed!" \ No newline at end of file diff --git a/src/snpeff/test_data/cancer.vcf b/src/snpeff/test_data/cancer.vcf new file mode 100644 index 00000000..f37ad8c3 --- /dev/null +++ b/src/snpeff/test_data/cancer.vcf @@ -0,0 +1,2 @@ +#CHROM POS ID REF ALT QUAL FILTER INFO FORMAT Patient_01_Germline Patient_01_Somatic +1 69091 . A C,G . PASS AC=1 GT 1/0 2/1 diff --git a/src/snpeff/test_data/my_annotations.bed b/src/snpeff/test_data/my_annotations.bed new file mode 100644 index 00000000..a5247f97 --- /dev/null +++ b/src/snpeff/test_data/my_annotations.bed @@ -0,0 +1 @@ +1 10000 20000 MY_ANNOTATION diff --git a/src/snpeff/test_data/script.sh b/src/snpeff/test_data/script.sh new file mode 100644 index 00000000..a47ec136 --- /dev/null +++ b/src/snpeff/test_data/script.sh @@ -0,0 +1,15 @@ +# Test files from SnpEff examples +if [ ! -f snpEff_latest_core.zip ]; then + wget https://snpeff.blob.core.windows.net/versions/snpEff_latest_core.zip +fi + +if [ ! -d snpEff ]; then + unzip snpEff_latest_core.zip +fi + +mv snpEff/examples/test.vcf src/snpeff/test_data/ +mv snpEff/examples/cancer.vcf src/snpeff/test_data/ +mv snpEff/examples/my_annotations.bed src/snpeff/test_data/ + +rm -rf snpEff_latest_core.zip +rm -rf snpEff \ No newline at end of file diff --git a/src/snpeff/test_data/test.vcf b/src/snpeff/test_data/test.vcf new file mode 100644 index 00000000..d552ef18 --- /dev/null +++ b/src/snpeff/test_data/test.vcf @@ -0,0 +1 @@ +1 10469 . C G 365.78 PASS AC=30;AF=0.0732