diff --git a/CHANGELOG.md b/CHANGELOG.md index 7ca743c6..7eb90569 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -19,6 +19,10 @@ * `pear`: Paired-end read merger (PR #10). +* `lofreq/call`: Call variants from a BAM file (PR #17). + +* `lofreq/indelqual`: Insert indel qualities into BAM file (PR #17). + ## MAJOR CHANGES ## MINOR CHANGES diff --git a/src/lofreq/call/config.vsh.yaml b/src/lofreq/call/config.vsh.yaml new file mode 100644 index 00000000..97b98e6f --- /dev/null +++ b/src/lofreq/call/config.vsh.yaml @@ -0,0 +1,251 @@ +functionality: + name: lofreq_call + namespace: lofreq + description: | + Call variants from a BAM file. + + LoFreq* (i.e. LoFreq version 2) is a fast and sensitive variant-caller for inferring SNVs and indels from next-generation sequencing data. It makes full use of base-call qualities and other sources of errors inherent in sequencing (e.g. mapping or base/indel alignment uncertainty), which are usually ignored by other methods or only used for filtering. + + LoFreq* can run on almost any type of aligned sequencing data (e.g. Illumina, IonTorrent or Pacbio) since no machine- or sequencing-technology dependent thresholds are used. It automatically adapts to changes in coverage and sequencing quality and can therefore be applied to a variety of data-sets e.g. viral/quasispecies, bacterial, metagenomics or somatic data. + + LoFreq* is very sensitive; most notably, it is able to predict variants below the average base-call quality (i.e. sequencing error rate). Each variant call is assigned a p-value which allows for rigorous false positive control. Even though it uses no approximations or heuristics, it is very efficient due to several runtime optimizations and also provides a (pseudo-)parallel implementation. LoFreq* is generic and fast enough to be applied to high-coverage data and large genomes. On a single processor it takes a minute to analyze Dengue genome sequencing data with nearly 4000X coverage, roughly one hour to call SNVs on a 600X coverage E.coli genome and also roughly an hour to run on a 100X coverage human exome dataset. + info: + keywords: [ "variant calling", "low frequancy variant calling", "lofreq", "lofreq/call"] + links: + homepage: https://csb5.github.io/lofreq/ + documentation: https://csb5.github.io/lofreq/commands/ + reference: + doi: 10.1093/nar/gks918 + license: "MIT" + requirements: + commands: [ lofreq ] + argument_groups: + - name: Inputs + arguments: + - name: --input + type: file + description: | + Input BAM file. + required: true + example: "normal.bam" + - name: --input_bai + type: file + description: | + Index file for the input BAM file. + required: true + example: "normal.bai" + - name: --ref + alternatives: -f + type: file + description: | + Indexed reference fasta file (gzip supported). Default: none. + required: true + example: "reference.fasta" + - name: Outputs + arguments: + - name: --out + alternatives: -o + type: file + description: | + Vcf output file. Default: stdout. + required: true + direction: output + example: "output.vcf" + - name: Arguments + arguments: + - name: --region + alternatives: -r + type: string + description: | + Limit calls to this region (chrom:start-end). Default: none. + required: false + example: "chr1:1000-2000" + - name: --bed + alternatives: -l + type: file + description: | + List of positions (chr pos) or regions (BED). Default: none. + required: false + example: "regions.bed" + - name: --min_bq + alternatives: -q + type: integer + description: | + Skip any base with baseQ smaller than INT. Default: 6. + required: false + example: 6 + - name: --min_alt_bq + alternatives: -Q + type: integer + description: | + Skip alternate bases with baseQ smaller than INT. Default: 6. + required: false + example: 6 + - name: --def_alt_bq + alternatives: -R + type: integer + description: | + Overwrite baseQs of alternate bases (that passed bq filter) with this value (-1: use median ref-bq; 0: keep). Default: 0. + required: false + example: 0 + - name: --min_jq + alternatives: -j + type: integer + description: | + Skip any base with joinedQ smaller than INT. Default: 0. + example: 0 + - name: --min_alt_jq + alternatives: -J + type: integer + description: | + Skip alternate bases with joinedQ smaller than INT. Default: 0. + required: false + example: 0 + - name: --def_alt_jq + alternatives: -K + type: integer + description: | + Overwrite joinedQs of alternate bases (that passed jq filter) with this value (-1: use median ref-bq; 0: keep). Default: 0. + required: false + example: 0 + - name: --no_baq + alternatives: -B + type: boolean_true + description: | + Disable use of base-alignment quality (BAQ). + - name: --no_idaq + alternatives: -A + type: boolean_true + description: | + Don't use IDAQ values (NOT recommended under ANY circumstances other than debugging). + - name: --del_baq + alternatives: -D + type: boolean_true + description: | + Delete pre-existing BAQ values, i.e. compute even if already present in BAM. + - name: --no_ext_baq + alternatives: -e + type: boolean_true + description: | + Use 'normal' BAQ (samtools default) instead of extended BAQ (both computed on the fly if not already present in lb tag). + - name: --min_mq + alternatives: -m + type: integer + description: | + Skip reads with mapping quality smaller than INT. Default: 0. + required: false + example: 0 + - name: --max_mq + alternatives: -M + type: integer + description: | + Cap mapping quality at INT. Default: 255. + required: false + example: 255 + - name: --no_mq + alternatives: -N + type: boolean_true + description: | + Don't merge mapping quality in LoFreq's model. + - name: --call_indels + type: boolean_true + description: | + Enable indel calls (note: preprocess your file to include indel alignment qualities!). + - name: --only_indels + type: boolean_true + description: | + Only call indels; no SNVs. + - name: --src_qual + alternatives: -s + type: boolean_true + description: | + Enable computation of source quality. + - name: --ign_vcf + alternatives: -S + type: file + description: | + Ignore variants in this vcf file for source quality computation. Multiple files can be given separated by commas. + required: false + example: "variants.vcf" + - name: --def_nm_q + alternatives: -T + type: integer + description: | + If >= 0, then replace non-match base qualities with this default value. Default: -1. + required: false + example: -1 + - name: --sig + alternatives: -a + type: double + description: | + P-Value cutoff / significance level. Default: 0.010000. + required: false + example: 0.01 + - name: --bonf + alternatives: -b + type: string + description: | + Bonferroni factor. 'dynamic' (increase per actually performed test) or INT. Default: Dynamic. + required: false + example: "dynamic" + - name: --min_cov + alternatives: -C + type: integer + description: | + Test only positions having at least this coverage. Default: 1. + (note: without --no-default-filter default filters (incl. coverage) kick in after predictions are done). + required: false + example: 1 + - name: --max_depth + alternatives: -d + type: integer + description: | + Cap coverage at this depth. Default: 1000000. + required: false + example: 1000000 + - name: --illumina_13 + type: boolean_true + description: | + Assume the quality is Illumina-1.3-1.7/ASCII+64 encoded. + - name: --use_orphan + type: boolean_true + description: | + Count anomalous read pairs (i.e. where mate is not aligned properly). + - name: --plp_summary_only + type: boolean_true + description: | + No variant calling. Just output pileup summary per column. + - name: --no_default_filter + type: boolean_true + description: | + Don't run default 'lofreq filter' automatically after calling variants. + - name: --force_overwrite + type: boolean_true + description: | + Overwrite any existing output. + - name: --verbose + type: boolean_true + description: | + Be verbose. + - name: --debug + type: boolean_true + description: | + Enable debugging. + resources: + - type: bash_script + path: script.sh + test_resources: + - type: bash_script + path: test.sh + - type: file + path: test_data +platforms: + - type: docker + image: quay.io/biocontainers/lofreq:2.1.5--py38h794fc9e_10 + setup: + - type: docker + run: | + version=$(lofreq version | grep 'version' | sed 's/version: //') && \ + echo "lofreq: $version" > /var/software_versions.txt + - type: nextflow + diff --git a/src/lofreq/call/help.txt b/src/lofreq/call/help.txt new file mode 100644 index 00000000..16178f07 --- /dev/null +++ b/src/lofreq/call/help.txt @@ -0,0 +1,49 @@ +lofreq call: call variants from BAM file + +Usage: lofreq call [options] in.bam + +Options: +- Reference: + -f | --ref FILE Indexed reference fasta file (gzip supported) [null] +- Output: + -o | --out FILE Vcf output file [- = stdout] +- Regions: + -r | --region STR Limit calls to this region (chrom:start-end) [null] + -l | --bed FILE List of positions (chr pos) or regions (BED) [null] +- Base-call quality: + -q | --min-bq INT Skip any base with baseQ smaller than INT [6] + -Q | --min-alt-bq INT Skip alternate bases with baseQ smaller than INT [6] + -R | --def-alt-bq INT Overwrite baseQs of alternate bases (that passed bq filter) with this value (-1: use median ref-bq; 0: keep) [0] + -j | --min-jq INT Skip any base with joinedQ smaller than INT [0] + -J | --min-alt-jq INT Skip alternate bases with joinedQ smaller than INT [0] + -K | --def-alt-jq INT Overwrite joinedQs of alternate bases (that passed jq filter) with this value (-1: use median ref-bq; 0: keep) [0] +- Base-alignment (BAQ) and indel-aligment (IDAQ) qualities: + -B | --no-baq Disable use of base-alignment quality (BAQ) + -A | --no-idaq Don't use IDAQ values (NOT recommended under ANY circumstances other than debugging) + -D | --del-baq Delete pre-existing BAQ values, i.e. compute even if already present in BAM + -e | --no-ext-baq Use 'normal' BAQ (samtools default) instead of extended BAQ (both computed on the fly if not already present in lb tag) +- Mapping quality: + -m | --min-mq INT Skip reads with mapping quality smaller than INT [0] + -M | --max-mq INT Cap mapping quality at INT [255] + -N | --no-mq Don't merge mapping quality in LoFreq's model +- Indels: + --call-indels Enable indel calls (note: preprocess your file to include indel alignment qualities!) + --only-indels Only call indels; no SNVs +- Source quality: + -s | --src-qual Enable computation of source quality + -S | --ign-vcf FILE Ignore variants in this vcf file for source quality computation. Multiple files can be given separated by commas + -T | --def-nm-q INT If >= 0, then replace non-match base qualities with this default value [-1] +- P-values: + -a | --sig P-Value cutoff / significance level [0.010000] + -b | --bonf Bonferroni factor. 'dynamic' (increase per actually performed test) or INT ['dynamic'] +- Misc.: + -C | --min-cov INT Test only positions having at least this coverage [1] + (note: without --no-default-filter default filters (incl. coverage) kick in after predictions are done) + -d | --max-depth INT Cap coverage at this depth [1000000] + --illumina-1.3 Assume the quality is Illumina-1.3-1.7/ASCII+64 encoded + --use-orphan Count anomalous read pairs (i.e. where mate is not aligned properly) + --plp-summary-only No variant calling. Just output pileup summary per column + --no-default-filter Don't run default 'lofreq filter' automatically after calling variants + --force-overwrite Overwrite any existing output + --verbose Be verbose + --debug Enable debugging \ No newline at end of file diff --git a/src/lofreq/call/script.sh b/src/lofreq/call/script.sh new file mode 100644 index 00000000..863fe986 --- /dev/null +++ b/src/lofreq/call/script.sh @@ -0,0 +1,57 @@ +#!/bin/bash + +## VIASH START +## VIASH END + +# Unset all parameters that are set to "false" +[[ "$par_no_baq" == "false" ]] && unset par_no_baq +[[ "$par_no_idaq" == "false" ]] && unset par_no_idaq +[[ "$par_del_baq" == "false" ]] && unset par_del_baq +[[ "$par_no_ext_baq" == "false" ]] && unset par_no_ext_baq +[[ "$par_no_mq" == "false" ]] && unset par_no_mq +[[ "$par_call_indels" == "false" ]] && unset par_call_indels +[[ "$par_only_indels" == "false" ]] && unset par_only_indels +[[ "$par_src_qual" == "false" ]] && unset par_src_qual +[[ "$par_illumina_13" == "false" ]] && unset par_illumina_13 +[[ "$par_use_orphan" == "false" ]] && unset par_use_orphan +[[ "$par_plp_summary_only" == "false" ]] && unset par_plp_summary_only +[[ "$par_no_default_filter" == "false" ]] && unset par_no_default_filter +[[ "$par_force_overwrite" == "false" ]] && unset par_force_overwrite +[[ "$par_verbose" == "false" ]] && unset par_verbose +[[ "$par_debug" == "false" ]] && unset par_debug + +# Run lofreq call +lofreq call \ + -f "$par_ref" \ + -o "$par_out" \ + ${par_region:+-r "${par_region}"} \ + ${par_bed:+-l "${par_bed}"} \ + ${par_min_bq:+-q "${par_min_bq}"} \ + ${par_min_alt_bq:+-Q "${par_min_alt_bq}"} \ + ${par_def_alt_bq:+-R "${par_def_alt_bq}"} \ + ${par_min_jq:+-j "${par_min_jq}"} \ + ${par_alt_jq:+-K "${par_alt_jq}"} \ + ${par_no_baq:+-B} \ + ${par_no_idaq:+-A} \ + ${par_del_baq:+-D} \ + ${par_no_ext_baq:+-e} \ + ${par_min_mq:+-m "${par_min_mq}"} \ + ${par_max_mq:+-M "${par_max_mq}"} \ + ${par_no_mq:+-N} \ + ${par_call_indels:+--call-indels} \ + ${par_only_indels:+--only-indels} \ + ${par_src_qual:+-s} \ + ${par_ign_vcf:+-S "${par_ign_vcf}"} \ + ${par_def_nm_q:+-T "${par_def_nm_q}"} \ + ${par_sig:+-a "${par_sig}"} \ + ${par_bonf:+-b "${par_bonf}"} \ + ${par_min_cov:+-C "${par_min_cov}"} \ + ${par_max_depth:+-d "${par_max_depth}"} \ + ${par_illumina_13:+--illumina-1.3} \ + ${par_use_orphan:+--use-orphan} \ + ${par_plp_summary_only:+--plp-summary-only} \ + ${par_no_default_filter:+--no-default-filter} \ + ${par_force_overwrite:+--force-overwrite} \ + ${par_verbose:+--verbose} \ + ${par_debug:+--debug} \ + "$par_input" \ No newline at end of file diff --git a/src/lofreq/call/test.sh b/src/lofreq/call/test.sh new file mode 100644 index 00000000..d8556398 --- /dev/null +++ b/src/lofreq/call/test.sh @@ -0,0 +1,20 @@ +#!/bin/bash + +set -e + +dir_in="${meta_resources_dir%/}/test_data" + +echo "> Run lofreq call" +"$meta_executable" \ + --input "$dir_in/a.bam" \ + --input_bai "$dir_in/a.bai" \ + --ref "$dir_in/genome.fasta" \ + --out "output.vcf" \ + +echo ">> Checking output" +[ ! -f "output.vcf" ] && echo "Output file output.vcf does not exist" && exit 1 + +echo ">> Check if output is empty" +[ ! -s "output.vcf" ] && echo "Output file output.vcf is empty" && exit 1 + +echo "> Test successful" \ No newline at end of file diff --git a/src/lofreq/call/test_data/a.bai b/src/lofreq/call/test_data/a.bai new file mode 100644 index 00000000..fd401327 Binary files /dev/null and b/src/lofreq/call/test_data/a.bai differ diff --git a/src/lofreq/call/test_data/a.bam b/src/lofreq/call/test_data/a.bam new file mode 100644 index 00000000..109b5fac Binary files /dev/null and b/src/lofreq/call/test_data/a.bam differ diff --git a/src/lofreq/call/test_data/genome.fasta b/src/lofreq/call/test_data/genome.fasta new file mode 100644 index 00000000..e2015391 --- /dev/null +++ b/src/lofreq/call/test_data/genome.fasta @@ -0,0 +1,8 @@ +>SheilaA +GCTAGCTCAGAAAAAAAAAA +>SheilaB +GCTAGCTCAGAAAAAAAAAA +>SheilaC +GCTAGCTCAGAAAAAAAAAA +>SheilaD +GCTAGCTCAGAAAAAAAAAA diff --git a/src/lofreq/call/test_data/genome.fasta.fai b/src/lofreq/call/test_data/genome.fasta.fai new file mode 100644 index 00000000..e42bfe2c --- /dev/null +++ b/src/lofreq/call/test_data/genome.fasta.fai @@ -0,0 +1,4 @@ +SheilaA 20 9 20 21 +SheilaB 20 39 20 21 +SheilaC 20 69 20 21 +SheilaD 20 99 20 21 diff --git a/src/lofreq/call/test_data/script.sh b/src/lofreq/call/test_data/script.sh new file mode 100644 index 00000000..9a90bf48 --- /dev/null +++ b/src/lofreq/call/test_data/script.sh @@ -0,0 +1,10 @@ +# pear test data + +# Test data was obtained from https://github.com/snakemake/snakemake-wrappers/tree/master/bio/lofreq/call/test/data + +if [ ! -d /tmp/snakemake-wrappers ]; then + git clone --depth 1 --single-branch --branch master https://github.com/snakemake/snakemake-wrappers /tmp/snakemake-wrappers +fi + +cp -r /tmp/snakemake-wrappers/bio/lofreq/call/test/data/* src/lofreq/call/test_data + diff --git a/src/lofreq/indelqual/config.vsh.yaml b/src/lofreq/indelqual/config.vsh.yaml new file mode 100644 index 00000000..821d5d72 --- /dev/null +++ b/src/lofreq/indelqual/config.vsh.yaml @@ -0,0 +1,82 @@ +functionality: + name: lofreq_indelqual + namespace: lofreq + description: | + Insert indel qualities into BAM file (required for indel predictions). + + The preferred way of inserting indel qualities should be via GATK's BQSR (>=2) If that's not possible, use this subcommand. + The command has two modes: 'uniform' and 'dindel': + - 'uniform' will assign a given value uniformly, whereas + - 'dindel' will insert indel qualities based on Dindel (PMID 20980555). + Both will overwrite any existing values. + Do not realign your BAM file afterwards! + info: + keywords: [ "bam", "indel", "qualities", "indelqual", "lofreq", "lofreq/indelqual"] + links: + homepage: https://csb5.github.io/lofreq/ + documentation: https://csb5.github.io/lofreq/commands/ + reference: + doi: 10.1093/nar/gks918 + license: "MIT" + requirements: + commands: [ lofreq ] + argument_groups: + - name: Inputs + arguments: + - name: --input + type: file + description: | + Input BAM file. + required: true + example: "normal.bam" + - name: --ref + alternatives: -f + type: file + description: | + Reference sequence used for mapping (Only required for --dindel). + required: false + example: "reference.fasta" + - name: Outputs + arguments: + - name: --out + alternatives: -o + type: file + description: | + Output BAM file. + required: true + direction: output + example: "output.bam" + - name: Arguments + arguments: + - name: --uniform + alternatives: -u + type: string + description: | + Add this indel quality uniformly to all bases. Use two comma separated values to specify insertion and deletion quality separately. (clashes with --dindel). + required: false + example: "50,50" + - name: --dindel + type: boolean_true + description: | + Add Dindel's indel qualities (Illumina specific) (clashes with -u; needs --ref). + - name: --verbose + type: boolean_true + description: | + Be verbose. + resources: + - type: bash_script + path: script.sh + test_resources: + - type: bash_script + path: test.sh + - type: file + path: test_data +platforms: + - type: docker + image: quay.io/biocontainers/lofreq:2.1.5--py38h794fc9e_10 + setup: + - type: docker + run: | + version=$(lofreq version | grep 'version' | sed 's/version: //') && \ + echo "lofreq: $version" > /var/software_versions.txt + - type: nextflow diff --git a/src/lofreq/indelqual/help.txt b/src/lofreq/indelqual/help.txt new file mode 100644 index 00000000..d520f1ad --- /dev/null +++ b/src/lofreq/indelqual/help.txt @@ -0,0 +1,21 @@ +lofreq indelqual: Insert indel qualities into BAM file (required for indel predictions) + +Usage: lofreq indelqual [options] in.bam +Options: + -u | --uniform INT[,INT] Add this indel quality uniformly to all bases. + Use two comma separated values to specify + insertion and deletion quality separately. + (clashes with --dindel) + --dindel Add Dindel's indel qualities (Illumina specific) + (clashes with -u; needs --ref) + -f | --ref Reference sequence used for mapping + (Only required for --dindel) + -o | --out FILE Output BAM file [- = stdout = default] + --verbose Be verbose + +The preferred way of inserting indel qualities should be via GATK's BQSR (>=2) If that's not possible, use this subcommand. +The command has two modes: 'uniform' and 'dindel': +- 'uniform' will assign a given value uniformly, whereas +- 'dindel' will insert indel qualities based on Dindel (PMID 20980555). +Both will overwrite any existing values. +Do not realign your BAM file afterwards! \ No newline at end of file diff --git a/src/lofreq/indelqual/script.sh b/src/lofreq/indelqual/script.sh new file mode 100644 index 00000000..341886ba --- /dev/null +++ b/src/lofreq/indelqual/script.sh @@ -0,0 +1,17 @@ +#!/bin/bash + +## VIASH START +## VIASH END + +# Unset all parameters that are set to "false" +[[ "$par_dindel" == "false" ]] && unset par_dindel +[[ "$par_verbose" == "false" ]] && unset par_verbose + +# run lofreq indelqual +lofreq indelqual \ + -o "$par_out" \ + ${par_uniform:+-u "${par_uniform}"} \ + ${par_dindel:+--dindel} \ + ${par_ref:+-f "${par_ref}"} \ + ${par_verbose:+--verbose} \ + "$par_input" diff --git a/src/lofreq/indelqual/test.sh b/src/lofreq/indelqual/test.sh new file mode 100644 index 00000000..9e7f6fe3 --- /dev/null +++ b/src/lofreq/indelqual/test.sh @@ -0,0 +1,46 @@ +#!/bin/bash + +set -e + +dir_in="${meta_resources_dir%/}/test_data" + +############################################# +mkdir uniform +cd uniform + +echo "> Run lofreq indelqual uniform" +"$meta_executable" \ + --input "$dir_in/test.bam" \ + -u 15 \ + --out "uniform.bam" \ + +echo ">> Checking output" +[ ! -f "uniform.bam" ] && echo "Output file uniform.bam does not exist" && exit 1 + +echo ">> Check if output is empty" +[ ! -s "uniform.bam" ] && echo "Output file uniform.bam is empty" && exit 1 + +cd .. + +############################################# +mkdir dindel +cd dindel + +echo "> run lofreq indelqual dindel" +"$meta_executable" \ + --input "$dir_in/test.bam" \ + --ref "$dir_in/test.fa" \ + --dindel \ + --out "dindel.bam" + +echo ">> Checking output" +[ ! -f "dindel.bam" ] && echo "Output file dindel.bam does not exist" && exit 1 + +echo ">> Check if output is empty" +[ ! -s "dindel.bam" ] && echo "Output file dindel.bam is empty" && exit 1 + +cd .. + +############################################# + +echo "> Test successful" diff --git a/src/lofreq/indelqual/test_data/script.sh b/src/lofreq/indelqual/test_data/script.sh new file mode 100755 index 00000000..ba348067 --- /dev/null +++ b/src/lofreq/indelqual/test_data/script.sh @@ -0,0 +1,44 @@ +#!/bin/bash + +set -e + +TMPDIR=$(mktemp -d) +trap "rm -rf $TMPDIR" EXIT + +### Step 1: Generate Test Reference FASTA File (`test.fa`) + +cat > $TMPDIR/test.fa <chr1 +AACTCTCCGTGCTGTCCGGGGTCACTGTGATGCCAGTGCCGTCGACGGACCACAGGAGCGCCGCCAATTACGATTTATA +GGCGGCCCGGCCGATTATATCTTTGGCGGTCCCCTAGGCTCTCTAGGGGCCCGCACTGAAGAGGGCAACTCTGCAAGGA +CACGAATCTGACTCCTTAATAAAGGTGTGAAATCTGTCCGGTCGTCTCCTAATATGGGGCTTCATCATCTCAGGCGAAA +TCAGCGCCCGACGGGCCATAGTAAGCGGTGTTGTGGCATAGGTGCAGGTGGCCACCGATTATAACAGGATGACATACGC +GGAATTCGGGGTATGATGCTCTCCCGACACTTTGAGACAATAAATAGTTTAGTGTCCTGATGGTCTAAACCGAAGTCAT +TCAAAATAGCTAAGTGTAGTCTTCCCGTTCTAGGGATAGTCTAGGACATGCCCTATATTGGTTTTCTCTTACCGCGGAC +TACTCCCGCGCCCTCGGAGGTGTCTCAATTCATCCATGTTGATCCTTCAAATCGGGGCAGCGACGGGGGCACGGAGGGG +GTACGATAACCGCTAAATTGACCACCACCATCGATGATTCTACCATCTCTATCCATCCAACCCTTTTTTTGTTTATTTC +CTCTATGGGTTACAGCTA +EOF + +### Step 2: Index the Reference FASTA File + +samtools faidx $TMPDIR/test.fa + +### Step 3: Generate Test Reads with `wgsim` + +wgsim -N 100 -1 70 -2 70 $TMPDIR/test.fa $TMPDIR/reads1.fq $TMPDIR/reads2.fq + +### Step 4: Align Reads to Generate BAM File + +bwa index $TMPDIR/test.fa + +bwa mem $TMPDIR/test.fa $TMPDIR/reads1.fq $TMPDIR/reads2.fq > $TMPDIR/aligned_reads.sam + +### Step 5: Convert SAM to BAM, Sort, and Index + +samtools view -Sb $TMPDIR/aligned_reads.sam > $TMPDIR/test.bam + +### Step 6: Copy output + +cp $TMPDIR/test.bam src/lofreq/indelqual/test_data/test.bam +cp $TMPDIR/test.fa src/lofreq/indelqual/test_data/test.fa \ No newline at end of file diff --git a/src/lofreq/indelqual/test_data/test.bam b/src/lofreq/indelqual/test_data/test.bam new file mode 100644 index 00000000..2d326400 Binary files /dev/null and b/src/lofreq/indelqual/test_data/test.bam differ diff --git a/src/lofreq/indelqual/test_data/test.fa b/src/lofreq/indelqual/test_data/test.fa new file mode 100644 index 00000000..6f39d3e9 --- /dev/null +++ b/src/lofreq/indelqual/test_data/test.fa @@ -0,0 +1,10 @@ +>chr1 +AACTCTCCGTGCTGTCCGGGGTCACTGTGATGCCAGTGCCGTCGACGGACCACAGGAGCGCCGCCAATTACGATTTATA +GGCGGCCCGGCCGATTATATCTTTGGCGGTCCCCTAGGCTCTCTAGGGGCCCGCACTGAAGAGGGCAACTCTGCAAGGA +CACGAATCTGACTCCTTAATAAAGGTGTGAAATCTGTCCGGTCGTCTCCTAATATGGGGCTTCATCATCTCAGGCGAAA +TCAGCGCCCGACGGGCCATAGTAAGCGGTGTTGTGGCATAGGTGCAGGTGGCCACCGATTATAACAGGATGACATACGC +GGAATTCGGGGTATGATGCTCTCCCGACACTTTGAGACAATAAATAGTTTAGTGTCCTGATGGTCTAAACCGAAGTCAT +TCAAAATAGCTAAGTGTAGTCTTCCCGTTCTAGGGATAGTCTAGGACATGCCCTATATTGGTTTTCTCTTACCGCGGAC +TACTCCCGCGCCCTCGGAGGTGTCTCAATTCATCCATGTTGATCCTTCAAATCGGGGCAGCGACGGGGGCACGGAGGGG +GTACGATAACCGCTAAATTGACCACCACCATCGATGATTCTACCATCTCTATCCATCCAACCCTTTTTTTGTTTATTTC +CTCTATGGGTTACAGCTA