From 99dec5923bfb3da165601a3f13502d498395b14d Mon Sep 17 00:00:00 2001 From: Toni Verbeiren Date: Fri, 6 Sep 2024 23:46:11 +0200 Subject: [PATCH 01/13] Bedtools genomecov (#150 and #128) * Initial commit * Update config.vsh.yaml * Update script.sh * update on test.sh * bug fixing * adding ibam option tests * depthzero and strand option tests * 5prime and max tests * more tests * Changelog * Update config.vsh.yaml * Update config.vsh.yaml * Update script.sh * Update test.sh * TMPDIR * Unset Variables * par_trackopts multiple: true * Minor update to CHANGELOG --------- Co-authored-by: tgaspe --- CHANGELOG.md | 2 + .../bedtools_genomecov/config.vsh.yaml | 208 +++++++++++ src/bedtools/bedtools_genomecov/help.txt | 101 ++++++ src/bedtools/bedtools_genomecov/script.sh | 55 +++ src/bedtools/bedtools_genomecov/test.sh | 333 ++++++++++++++++++ .../bedtools_genomecov/test_data/example.bam | Bin 0 -> 334 bytes 6 files changed, 699 insertions(+) create mode 100644 src/bedtools/bedtools_genomecov/config.vsh.yaml create mode 100644 src/bedtools/bedtools_genomecov/help.txt create mode 100644 src/bedtools/bedtools_genomecov/script.sh create mode 100644 src/bedtools/bedtools_genomecov/test.sh create mode 100644 src/bedtools/bedtools_genomecov/test_data/example.bam diff --git a/CHANGELOG.md b/CHANGELOG.md index 98e78c17..8f772450 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -29,6 +29,7 @@ * `bedtools`: - `bedtools/bedtools_intersect`: Allows one to screen for overlaps between two sets of genomic features (PR #94). - `bedtools/bedtools_sort`: Sorts a feature file (bed/gff/vcf) by chromosome and other criteria (PR #98). + - `bedtools/bedtools_genomecov`: Compute the coverage of a feature file (bed/gff/vcf/bam) among a genome (PR #128). - `bedtools/bedtools_groupby`: Summarizes a dataset column based upon common column groupings. Akin to the SQL "group by" command (PR #123). - `bedtools/bedtools_merge`: Merges overlapping BED/GFF/VCF entries into a single interval (PR #118). - `bedtools/bedtools_bamtofastq`: Convert BAM alignments to FASTQ files (PR #101). @@ -45,6 +46,7 @@ * `fastqc`: High throughput sequence quality control analysis tool (PR #92). + ## MINOR CHANGES * `busco` components: update BUSCO to `5.7.1` (PR #72). diff --git a/src/bedtools/bedtools_genomecov/config.vsh.yaml b/src/bedtools/bedtools_genomecov/config.vsh.yaml new file mode 100644 index 00000000..775587de --- /dev/null +++ b/src/bedtools/bedtools_genomecov/config.vsh.yaml @@ -0,0 +1,208 @@ +name: bedtools_genomecov +namespace: bedtools +description: | + Compute the coverage of a feature file among a genome. +keywords: [genome coverage, BED, GFF, VCF, BAM] +links: + homepage: https://bedtools.readthedocs.io/en/latest/# + documentation: https://bedtools.readthedocs.io/en/latest/content/tools/genomecov.html + repository: https://github.com/arq5x/bedtools2 + issue_tracker: https://github.com/arq5x/bedtools2/issues +references: + doi: 10.1093/bioinformatics/btq033 +license: MIT +requirements: + commands: [bedtools] +authors: + - __merge__: /src/_authors/theodoro_gasperin.yaml + roles: [ author, maintainer ] + +argument_groups: + - name: Inputs + arguments: + - name: --input + alternatives: -i + type: file + direction: input + description: | + The input file (BED/GFF/VCF) to be used. + example: input.bed + + - name: --input_bam + alternatives: -ibam + type: file + description: | + The input file is in BAM format. + Note: BAM _must_ be sorted by positions. + '--genome' option is ignored if you use '--input_bam' option! + + - name: --genome + alternatives: -g + type: file + direction: input + description: | + The genome file to be used. + example: genome.txt + + - name: Outputs + arguments: + - name: --output + type: file + direction: output + description: | + The output BED file. + required: true + example: output.bed + + - name: Options + arguments: + + - name: --depth + alternatives: -d + type: boolean_true + description: | + Report the depth at each genome position (with one-based coordinates). + Default behavior is to report a histogram. + + - name: --depth_zero + alternatives: -dz + type: boolean_true + description: | + Report the depth at each genome position (with zero-based coordinates). + Reports only non-zero positions. + Default behavior is to report a histogram. + + - name: --bed_graph + alternatives: -bg + type: boolean_true + description: | + Report depth in BedGraph format. For details, see: + genome.ucsc.edu/goldenPath/help/bedgraph.html + + - name: --bed_graph_zero_coverage + alternatives: -bga + type: boolean_true + description: | + Report depth in BedGraph format, as above (-bg). + However with this option, regions with zero + coverage are also reported. This allows one to + quickly extract all regions of a genome with 0 + coverage by applying: "grep -w 0$" to the output. + + - name: --split + type: boolean_true + description: | + Treat "split" BAM or BED12 entries as distinct BED intervals. + when computing coverage. + For BAM files, this uses the CIGAR "N" and "D" operations + to infer the blocks for computing coverage. + For BED12 files, this uses the BlockCount, BlockStarts, and BlockEnds + fields (i.e., columns 10,11,12). + + - name: --ignore_deletion + alternatives: -ignoreD + type: boolean_true + description: | + Ignore local deletions (CIGAR "D" operations) in BAM entries + when computing coverage. + + - name: --strand + type: string + choices: ["+", "-"] + description: | + Calculate coverage of intervals from a specific strand. + With BED files, requires at least 6 columns (strand is column 6). + + - name: --pair_end_coverage + alternatives: -pc + type: boolean_true + description: | + Calculate coverage of pair-end fragments. + Works for BAM files only + + - name: --fragment_size + alternatives: -fs + type: boolean_true + description: | + Force to use provided fragment size instead of read length + Works for BAM files only + + - name: --du + type: boolean_true + description: | + Change strand af the mate read (so both reads from the same strand) useful for strand specific + Works for BAM files only + + - name: --five_prime + alternatives: -5 + type: boolean_true + description: | + Calculate coverage of 5" positions (instead of entire interval). + + - name: --three_prime + alternatives: -3 + type: boolean_true + description: | + Calculate coverage of 3" positions (instead of entire interval). + + - name: --max + type: integer + min: 0 + description: | + Combine all positions with a depth >= max into + a single bin in the histogram. Irrelevant + for -d and -bedGraph + - (INTEGER) + + - name: --scale + type: double + min: 0 + description: | + Scale the coverage by a constant factor. + Each coverage value is multiplied by this factor before being reported. + Useful for normalizing coverage by, e.g., reads per million (RPM). + - Default is 1.0; i.e., unscaled. + - (FLOAT) + + - name: --trackline + type: boolean_true + description: | + Adds a UCSC/Genome-Browser track line definition in the first line of the output. + - See here for more details about track line definition: + http://genome.ucsc.edu/goldenPath/help/bedgraph.html + - NOTE: When adding a trackline definition, the output BedGraph can be easily + uploaded to the Genome Browser as a custom track, + BUT CAN NOT be converted into a BigWig file (w/o removing the first line). + + - name: --trackopts + type: string + description: | + Writes additional track line definition parameters in the first line. + - Example: + -trackopts 'name="My Track" visibility=2 color=255,30,30' + Note the use of single-quotes if you have spaces in your parameters. + - (TEXT) + multiple: true + +resources: + - type: bash_script + path: script.sh + +test_resources: + - type: bash_script + path: test.sh + - path: test_data + +engines: + - type: docker + image: debian:stable-slim + setup: + - type: apt + packages: [bedtools, procps] + - type: docker + run: | + echo "bedtools: \"$(bedtools --version | sed -n 's/^bedtools //p')\"" > /var/software_versions.txt + +runners: + - type: executable + - type: nextflow \ No newline at end of file diff --git a/src/bedtools/bedtools_genomecov/help.txt b/src/bedtools/bedtools_genomecov/help.txt new file mode 100644 index 00000000..f13a71d3 --- /dev/null +++ b/src/bedtools/bedtools_genomecov/help.txt @@ -0,0 +1,101 @@ +```bash +bedtools genomecov +``` + +Tool: bedtools genomecov (aka genomeCoverageBed) +Version: v2.30.0 +Summary: Compute the coverage of a feature file among a genome. + +Usage: bedtools genomecov [OPTIONS] -i -g + +Options: + -ibam The input file is in BAM format. + Note: BAM _must_ be sorted by position + + -d Report the depth at each genome position (with one-based coordinates). + Default behavior is to report a histogram. + + -dz Report the depth at each genome position (with zero-based coordinates). + Reports only non-zero positions. + Default behavior is to report a histogram. + + -bg Report depth in BedGraph format. For details, see: + genome.ucsc.edu/goldenPath/help/bedgraph.html + + -bga Report depth in BedGraph format, as above (-bg). + However with this option, regions with zero + coverage are also reported. This allows one to + quickly extract all regions of a genome with 0 + coverage by applying: "grep -w 0$" to the output. + + -split Treat "split" BAM or BED12 entries as distinct BED intervals. + when computing coverage. + For BAM files, this uses the CIGAR "N" and "D" operations + to infer the blocks for computing coverage. + For BED12 files, this uses the BlockCount, BlockStarts, and BlockEnds + fields (i.e., columns 10,11,12). + + -ignoreD Ignore local deletions (CIGAR "D" operations) in BAM entries + when computing coverage. + + -strand Calculate coverage of intervals from a specific strand. + With BED files, requires at least 6 columns (strand is column 6). + - (STRING): can be + or - + + -pc Calculate coverage of pair-end fragments. + Works for BAM files only + -fs Force to use provided fragment size instead of read length + Works for BAM files only + -du Change strand af the mate read (so both reads from the same strand) useful for strand specific + Works for BAM files only + -5 Calculate coverage of 5" positions (instead of entire interval). + + -3 Calculate coverage of 3" positions (instead of entire interval). + + -max Combine all positions with a depth >= max into + a single bin in the histogram. Irrelevant + for -d and -bedGraph + - (INTEGER) + + -scale Scale the coverage by a constant factor. + Each coverage value is multiplied by this factor before being reported. + Useful for normalizing coverage by, e.g., reads per million (RPM). + - Default is 1.0; i.e., unscaled. + - (FLOAT) + + -trackline Adds a UCSC/Genome-Browser track line definition in the first line of the output. + - See here for more details about track line definition: + http://genome.ucsc.edu/goldenPath/help/bedgraph.html + - NOTE: When adding a trackline definition, the output BedGraph can be easily + uploaded to the Genome Browser as a custom track, + BUT CAN NOT be converted into a BigWig file (w/o removing the first line). + + -trackopts Writes additional track line definition parameters in the first line. + - Example: + -trackopts 'name="My Track" visibility=2 color=255,30,30' + Note the use of single-quotes if you have spaces in your parameters. + - (TEXT) + +Notes: + (1) The genome file should tab delimited and structured as follows: + + + For example, Human (hg19): + chr1 249250621 + chr2 243199373 + ... + chr18_gl000207_random 4262 + + (2) The input BED (-i) file must be grouped by chromosome. + A simple "sort -k 1,1 > .sorted" will suffice. + + (3) The input BAM (-ibam) file must be sorted by position. + A "samtools sort " should suffice. + +Tips: + One can use the UCSC Genome Browser's MySQL database to extract + chromosome sizes. For example, H. sapiens: + + mysql --user=genome --host=genome-mysql.cse.ucsc.edu -A -e \ + "select chrom, size from hg19.chromInfo" > hg19.genome + diff --git a/src/bedtools/bedtools_genomecov/script.sh b/src/bedtools/bedtools_genomecov/script.sh new file mode 100644 index 00000000..20fbd968 --- /dev/null +++ b/src/bedtools/bedtools_genomecov/script.sh @@ -0,0 +1,55 @@ +#!/bin/bash + +## VIASH START +## VIASH END + +# Exit on error +set -eo pipefail + +# Unset variables +unset_if_false=( + par_input_bam + par_depth + par_depth_zero + par_bed_graph + par_bed_graph_zero_coverage + par_split + par_ignore_deletion + par_pair_end_coverage + par_fragment_size + par_du + par_five_prime + par_three_prime + par_trackline +) + +for par in ${unset_if_false[@]}; do + test_val="${!par}" + [[ "$test_val" == "false" ]] && unset $par +done + +# Create input array +IFS=";" read -ra trackopts <<< $par_trackopts + +bedtools genomecov \ + ${par_depth:+-d} \ + ${par_depth_zero:+-dz} \ + ${par_bed_graph:+-bg} \ + ${par_bed_graph_zero_coverage:+-bga} \ + ${par_split:+-split} \ + ${par_ignore_deletion:+-ignoreD} \ + ${par_du:+-du} \ + ${par_five_prime:+-5} \ + ${par_three_prime:+-3} \ + ${par_trackline:+-trackline} \ + ${par_strand:+-strand "$par_strand"} \ + ${par_max:+-max "$par_max"} \ + ${par_scale:+-scale "$par_scale"} \ + ${par_trackopts:+-trackopts "${trackopts[*]}"} \ + ${par_input_bam:+-ibam "$par_input_bam"} \ + ${par_input:+-i "$par_input"} \ + ${par_genome:+-g "$par_genome"} \ + ${par_pair_end_coverage:+-pc} \ + ${par_fragment_size:+-fs} \ + > "$par_output" + \ No newline at end of file diff --git a/src/bedtools/bedtools_genomecov/test.sh b/src/bedtools/bedtools_genomecov/test.sh new file mode 100644 index 00000000..7e4487da --- /dev/null +++ b/src/bedtools/bedtools_genomecov/test.sh @@ -0,0 +1,333 @@ +#!/bin/bash + +# exit on error +set -eo pipefail + +## VIASH START +meta_executable="target/executable/bedtools/bedtools_intersect/bedtools_intersect" +meta_resources_dir="src/bedtools/bedtools_intersect" +## VIASH END + +# directory of the bam file +test_data="$meta_resources_dir/test_data" + +############################################# +# helper functions +assert_file_exists() { + [ -f "$1" ] || { echo "File '$1' does not exist" && exit 1; } +} +assert_file_not_empty() { + [ -s "$1" ] || { echo "File '$1' is empty but shouldn't be" && exit 1; } +} +assert_file_contains() { + grep -q "$2" "$1" || { echo "File '$1' does not contain '$2'" && exit 1; } +} +assert_identical_content() { + diff -a "$2" "$1" \ + || (echo "Files are not identical!" && exit 1) +} +############################################# + +# Create directories for tests +echo "Creating Test Data..." +TMPDIR=$(mktemp -d "$meta_temp_dir/XXXXXX") +function clean_up { + [[ -d "$TMPDIR" ]] && rm -r "$TMPDIR" +} +trap clean_up EXIT + +# Create and populate input files +printf "chr1\t248956422\nchr2\t198295559\nchr3\t242193529\n" > "$TMPDIR/genome.txt" +printf "chr2\t128\t228\tmy_read/1\t37\t+\nchr2\t428\t528\tmy_read/2\t37\t-\n" > "$TMPDIR/example.bed" +printf "chr2\t128\t228\tmy_read/1\t60\t+\t128\t228\t255,0,0\t1\t100\t0\nchr2\t428\t528\tmy_read/2\t60\t-\t428\t528\t255,0,0\t1\t100\t0\n" > "$TMPDIR/example.bed12" +printf "chr2\t100\t103\n" > "$TMPDIR/example_dz.bed" + +# expected outputs +cat > "$TMPDIR/expected_default.bed" < "$TMPDIR/expected_ibam.bed" < "$TMPDIR/expected_ibam_pc.bed" < "$TMPDIR/expected_ibam_fs.bed" < "$TMPDIR/expected_dz.bed" < "$TMPDIR/expected_strand.bed" < "$TMPDIR/expected_5.bed" < "$TMPDIR/expected_bg_scale.bed" < "$TMPDIR/expected_trackopts.bed" < "$TMPDIR/expected_split.bed" < "$TMPDIR/expected_ignoreD_du.bed" < /dev/null + +echo "> Run bedtools_genomecov on BED file" +"$meta_executable" \ + --input "../example.bed" \ + --genome "../genome.txt" \ + --output "output.bed" + +# checks +assert_file_exists "output.bed" +assert_file_not_empty "output.bed" +assert_identical_content "output.bed" "../expected_default.bed" +echo "- test1 succeeded -" + +popd > /dev/null + +# Test 2: ibam option +mkdir "$TMPDIR/test2" && pushd "$TMPDIR/test2" > /dev/null + +echo "> Run bedtools_genomecov on BAM file with -ibam" +"$meta_executable" \ + --input_bam "$test_data/example.bam" \ + --output "output.bed" \ + +# checks +assert_file_exists "output.bed" +assert_file_not_empty "output.bed" +assert_identical_content "output.bed" "../expected_ibam.bed" +echo "- test2 succeeded -" + +popd > /dev/null + +# Test 3: depth option +mkdir "$TMPDIR/test3" && pushd "$TMPDIR/test3" > /dev/null + +echo "> Run bedtools_genomecov on BED file with -dz" +"$meta_executable" \ + --input "../example_dz.bed" \ + --genome "../genome.txt" \ + --output "output.bed" \ + --depth_zero + +# checks +assert_file_exists "output.bed" +assert_file_not_empty "output.bed" +assert_identical_content "output.bed" "../expected_dz.bed" +echo "- test3 succeeded -" + +popd > /dev/null + +# Test 4: strand option +mkdir "$TMPDIR/test4" && pushd "$TMPDIR/test4" > /dev/null + +echo "> Run bedtools_genomecov on BED file with -strand" +"$meta_executable" \ + --input "../example.bed" \ + --genome "../genome.txt" \ + --output "output.bed" \ + --strand "-" \ + +# checks +assert_file_exists "output.bed" +assert_file_not_empty "output.bed" +assert_identical_content "output.bed" "../expected_strand.bed" +echo "- test4 succeeded -" + +popd > /dev/null + +# Test 5: 5' end option +mkdir "$TMPDIR/test5" && pushd "$TMPDIR/test5" > /dev/null + +echo "> Run bedtools_genomecov on BED file with -5" +"$meta_executable" \ + --input "../example.bed" \ + --genome "../genome.txt" \ + --output "output.bed" \ + --five_prime \ + +# checks +assert_file_exists "output.bed" +assert_file_not_empty "output.bed" +assert_identical_content "output.bed" "../expected_5.bed" +echo "- test5 succeeded -" + +popd > /dev/null + +# Test 6: max option +mkdir "$TMPDIR/test6" && pushd "$TMPDIR/test6" > /dev/null + +echo "> Run bedtools_genomecov on BED file with -max" +"$meta_executable" \ + --input "../example.bed" \ + --genome "../genome.txt" \ + --output "output.bed" \ + --max 100 \ + +# checks +assert_file_exists "output.bed" +assert_file_not_empty "output.bed" +assert_identical_content "output.bed" "../expected_default.bed" +echo "- test6 succeeded -" + +popd > /dev/null + +# Test 7: bedgraph and scale option +mkdir "$TMPDIR/test7" && pushd "$TMPDIR/test7" > /dev/null + +echo "> Run bedtools_genomecov on BED file with -bg and -scale" +"$meta_executable" \ + --input "../example.bed" \ + --genome "../genome.txt" \ + --output "output.bed" \ + --bed_graph \ + --scale 100 \ + +# checks +assert_file_exists "output.bed" +assert_file_not_empty "output.bed" +assert_identical_content "output.bed" "../expected_bg_scale.bed" +echo "- test7 succeeded -" + +popd > /dev/null + +# Test 8: trackopts option +mkdir "$TMPDIR/test8" && pushd "$TMPDIR/test8" > /dev/null + +echo "> Run bedtools_genomecov on BED file with -bg and -trackopts" +"$meta_executable" \ + --input "../example.bed" \ + --genome "../genome.txt" \ + --output "output.bed" \ + --bed_graph \ + --trackopts "name=example" \ + --trackopts "llama=Alpaco" \ + +# checks +assert_file_exists "output.bed" +assert_file_not_empty "output.bed" +assert_identical_content "output.bed" "../expected_trackopts.bed" +echo "- test8 succeeded -" + +popd > /dev/null + +# Test 9: ibam pc options +mkdir "$TMPDIR/test9" && pushd "$TMPDIR/test9" > /dev/null + +echo "> Run bedtools_genomecov on BAM file with -ibam, -pc" +"$meta_executable" \ + --input_bam "$test_data/example.bam" \ + --output "output.bed" \ + --fragment_size \ + --pair_end_coverage \ + +# checks +assert_file_exists "output.bed" +assert_file_not_empty "output.bed" +assert_identical_content "output.bed" "../expected_ibam_pc.bed" +echo "- test9 succeeded -" + +popd > /dev/null + +# Test 10: ibam fs options +mkdir "$TMPDIR/test10" && pushd "$TMPDIR/test10" > /dev/null + +echo "> Run bedtools_genomecov on BAM file with -ibam, -fs" +"$meta_executable" \ + --input_bam "$test_data/example.bam" \ + --output "output.bed" \ + --fragment_size \ + +# checks +assert_file_exists "output.bed" +assert_file_not_empty "output.bed" +assert_identical_content "output.bed" "../expected_ibam_fs.bed" +echo "- test10 succeeded -" + +popd > /dev/null + +# Test 11: split +mkdir "$TMPDIR/test11" && pushd "$TMPDIR/test11" > /dev/null + +echo "> Run bedtools_genomecov on BED12 file with -split" +"$meta_executable" \ + --input "../example.bed12" \ + --genome "../genome.txt" \ + --output "output.bed" \ + --split \ + +# checks +assert_file_exists "output.bed" +assert_file_not_empty "output.bed" +assert_identical_content "output.bed" "../expected_split.bed" +echo "- test11 succeeded -" + +popd > /dev/null + +# Test 12: ignore deletion and du +mkdir "$TMPDIR/test12" && pushd "$TMPDIR/test12" > /dev/null + +echo "> Run bedtools_genomecov on BAM file with -ignoreD and -du" +"$meta_executable" \ + --input_bam "$test_data/example.bam" \ + --output "output.bed" \ + --ignore_deletion \ + --du \ + +# checks +assert_file_exists "output.bed" +assert_file_not_empty "output.bed" +assert_identical_content "output.bed" "../expected_ignoreD_du.bed" +echo "- test12 succeeded -" + +popd > /dev/null + +echo "---- All tests succeeded! ----" +exit 0 diff --git a/src/bedtools/bedtools_genomecov/test_data/example.bam b/src/bedtools/bedtools_genomecov/test_data/example.bam new file mode 100644 index 0000000000000000000000000000000000000000..ffc075ab83a83a98ed1edbf88b26cc27ad8946c6 GIT binary patch literal 334 zcmb2|=3rp}f&Xj_PR>jWAq>SuUsA6mBqS7Y@IB%Aw%O~PhS4S?6Z1_bX2zRMuCZ>` z;o;@Ato^fw$CpQUheTtRYNNz-r#8JXHa3Ry>s4lk0?m>~GxQF_-U<7&m>dP#pU;|5 z)~CHK)-&PMX8(zQnRkjz7tzu&Q_9lpm^;_nXXDZb**~)OH9hZA+GbYw!F2!1eU^u& z=6?J8db>^n+vnS58VqGOpQde!^LhT^FPno$sK1R;RVb&i_o5|>-LG;Kg??MHCx&;~ ziZww?R#r16X1LX_ZFYQZ=WBLl9Y-y@V*W$>-;Wo3eOwoN_@m-GsXhDI Date: Mon, 9 Sep 2024 08:19:44 +0200 Subject: [PATCH 02/13] Fq subsample (#147) --- CHANGELOG.md | 2 + src/fq_subsample/config.vsh.yaml | 68 ++++++++++++++++++++++++ src/fq_subsample/help.txt | 20 +++++++ src/fq_subsample/script.sh | 26 +++++++++ src/fq_subsample/test.sh | 36 +++++++++++++ src/fq_subsample/test_data/a.3.fastq.gz | Bin 0 -> 292 bytes src/fq_subsample/test_data/a.4.fastq.gz | Bin 0 -> 301 bytes 7 files changed, 152 insertions(+) create mode 100644 src/fq_subsample/config.vsh.yaml create mode 100644 src/fq_subsample/help.txt create mode 100755 src/fq_subsample/script.sh create mode 100644 src/fq_subsample/test.sh create mode 100644 src/fq_subsample/test_data/a.3.fastq.gz create mode 100644 src/fq_subsample/test_data/a.4.fastq.gz diff --git a/CHANGELOG.md b/CHANGELOG.md index 8f772450..6534eed1 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -142,6 +142,8 @@ - `bedtools_getfasta`: extract sequences from a FASTA file for each of the intervals defined in a BED/GFF/VCF file (PR #59). +* `fq_subsample`: Sample a subset of records from single or paired FASTQ files (PR #147). + ## MINOR CHANGES * Uniformize component metadata (PR #23). diff --git a/src/fq_subsample/config.vsh.yaml b/src/fq_subsample/config.vsh.yaml new file mode 100644 index 00000000..2455a341 --- /dev/null +++ b/src/fq_subsample/config.vsh.yaml @@ -0,0 +1,68 @@ +name: fq_subsample +description: fq subsample outputs a subset of records from single or paired FASTQ files. +keywords: [fastq, subsample, subset] +links: + homepage: https://github.com/stjude-rust-labs/fq/blob/master/README.md + documentation: https://github.com/stjude-rust-labs/fq/blob/master/README.md + repository: https://github.com/stjude-rust-labs/fq +license: MIT + +argument_groups: +- name: "Input" + arguments: + - name: "--input_1" + type: file + required: true + description: First input fastq file to subsample. Accepts both raw and gzipped FASTQ inputs. + - name: "--input_2" + type: file + description: Second input fastq files to subsample. Accepts both raw and gzipped FASTQ inputs. + +- name: "Output" + arguments: + - name: "--output_1" + type: file + direction: output + description: Sampled read 1 fastq files. Output will be gzipped if ends in `.gz`. + - name: "--output_2" + type: file + direction: output + description: Sampled read 2 fastq files. Output will be gzipped if ends in `.gz`. + +- name: "Options" + arguments: + - name: "--probability" + type: double + description: The probability a record is kept, as a percentage (0.0, 1.0). Cannot be used with `record-count` + - name: "--record_count" + type: integer + description: The exact number of records to keep. Cannot be used with `probability` + - name: "--seed" + type: integer + description: Seed to use for the random number generator + +resources: + - type: bash_script + path: script.sh + +test_resources: + - type: bash_script + path: test.sh + - path: test_data + +engines: + - type: docker + image: rust:1.81-slim + setup: + - type: docker + run: | + apt-get update && apt-get install -y git procps && \ + git clone --depth 1 --branch v0.12.0 https://github.com/stjude-rust-labs/fq.git && \ + cd fq && \ + cargo install --locked --path . && \ + mv target/release/fq /usr/local/bin/ && \ + cd / && rm -rf /fq + +runners: + - type: executable + - type: nextflow diff --git a/src/fq_subsample/help.txt b/src/fq_subsample/help.txt new file mode 100644 index 00000000..6f4a9acf --- /dev/null +++ b/src/fq_subsample/help.txt @@ -0,0 +1,20 @@ +``` +fq subsample -h +``` + +Outputs a subset of records + +Usage: fq subsample [OPTIONS] --r1-dst <--probability |--record-count > [R2_SRC] + +Arguments: + Read 1 source. Accepts both raw and gzipped FASTQ inputs + [R2_SRC] Read 2 source. Accepts both raw and gzipped FASTQ inputs + +Options: + -p, --probability The probability a record is kept, as a percentage (0.0, 1.0). Cannot be used with `record-count` + -n, --record-count The exact number of records to keep. Cannot be used with `probability` + -s, --seed Seed to use for the random number generator + --r1-dst Read 1 destination. Output will be gzipped if ends in `.gz` + --r2-dst Read 2 destination. Output will be gzipped if ends in `.gz` + -h, --help Print help + -V, --version \ No newline at end of file diff --git a/src/fq_subsample/script.sh b/src/fq_subsample/script.sh new file mode 100755 index 00000000..bcc81b40 --- /dev/null +++ b/src/fq_subsample/script.sh @@ -0,0 +1,26 @@ +#!/bin/bash + +## VIASH START +## VIASH END + +set -eo pipefail + + +required_args=("-p" "--probability" "-n" "--record_count") + +# exclusive OR for required arguments $par_probability and $par_record_count +if [[ -n $par_probability && -n $par_record_count ]] || [[ -z $par_probability && -z $par_record_count ]]; then + echo "FQ/SUBSAMPLE requires either --probability or --record_count to be specified" + exit 1 +fi + + +fq subsample \ + ${par_output_1:+--r1-dst "${par_output_1}"} \ + ${par_output_2:+--r2-dst "${par_output_2}"} \ + ${par_probability:+--probability "${par_probability}"} \ + ${par_record_count:+--record-count "${par_record_count}"} \ + ${par_seed:+--seed "${par_seed}"} \ + ${par_input_1} \ + ${par_input_2} + diff --git a/src/fq_subsample/test.sh b/src/fq_subsample/test.sh new file mode 100644 index 00000000..1de48e95 --- /dev/null +++ b/src/fq_subsample/test.sh @@ -0,0 +1,36 @@ +#!/bin/bash + +echo ">>> Testing $meta_executable" + +echo ">>> Testing for paired-end reads" +"$meta_executable" \ + --input_1 $meta_resources_dir/test_data/a.3.fastq.gz \ + --input_2 $meta_resources_dir/test_data/a.4.fastq.gz \ + --record_count 3 \ + --seed 1 \ + --output_1 a.1.subsampled.fastq \ + --output_2 a.2.subsampled.fastq + +echo ">> Checking if the correct files are present" +[ ! -f "a.1.subsampled.fastq" ] && echo "Subsampled FASTQ file for read 1 is missing!" && exit 1 +[ $(wc -l < a.1.subsampled.fastq) -ne 12 ] && echo "Subsampled FASTQ file for read 1 does not contain the expected number of records" && exit 1 +[ ! -f "a.2.subsampled.fastq" ] && echo "Subsampled FASTQ file for read 2 is missing" && exit 1 +[ $(wc -l < a.2.subsampled.fastq) -ne 12 ] && echo "Subsampled FASTQ file for read 2 does not contain the expected number of records" && exit 1 + +rm a.1.subsampled.fastq a.2.subsampled.fastq + +echo ">>> Testing for single-end reads" +"$meta_executable" \ + --input_1 $meta_resources_dir/test_data/a.3.fastq.gz \ + --record_count 3 \ + --seed 1 \ + --output_1 a.1.subsampled.fastq + + +echo ">> Checking if the correct files are present" +[ ! -f "a.1.subsampled.fastq" ] && echo "Subsampled FASTQ file is missing" && exit 1 +[ $(wc -l < a.1.subsampled.fastq) -ne 12 ] && echo "Subsampled FASTQ file does not contain the expected number of records" && exit 1 + +echo ">>> Tests finished successfully" +exit 0 + diff --git a/src/fq_subsample/test_data/a.3.fastq.gz b/src/fq_subsample/test_data/a.3.fastq.gz new file mode 100644 index 0000000000000000000000000000000000000000..3e38d06dc5213e2b60cf8feab54214ef6ae72095 GIT binary patch literal 292 zcmV+<0o(o`iwFopgw{AqTPku(#7Qf5avvPnFZM+Apx**Oa!?txr3QV-KZjV->mD8KoHv7#-z3Z4r{+?NKtIv%ALBp1?Z)Zpi_@6{>nZp{l zpnEW6Vuo5f3k)_CS+Enz@c0PCa|geeO1hrf6{+00VDfRbahk2}!7qJ+#!w%N%nT$& q=?+h;4lV-+mG4KNiZN0-nItN^Op$ogq>5BhG3pb)YY7nG0ssKu1BeL# literal 0 HcmV?d00001 diff --git a/src/fq_subsample/test_data/a.4.fastq.gz b/src/fq_subsample/test_data/a.4.fastq.gz new file mode 100644 index 0000000000000000000000000000000000000000..3164c6148650e36532545b7946efa9a16055db5d GIT binary patch literal 301 zcmV+|0n+{-iwFpdgVkmL17R*SE@okKba4QclD}%iFbs$HJcai{?fi98G@K%^_p4su zpdHG=4W&beK793aE?fgCy*k8_6@xxLe!66TNB^7^ZV)idU^Ud zeZIYXbyM3^!osXsSp}(z+L2i-5vyb?=8QX%SyifsYQ{=`tlNd^@PlcHb+G8JahHhM z8WtRM#0J7;1BHM|@mewSy+pUQA?nAj9oxxW<1dbKR7_YiGA zZiwo>i^DWVwk#hC~KOM9F)iI44T9dZJXoXJ35-@igEx-~s>uZ@7@Z literal 0 HcmV?d00001 From 320d044fe45e565fbc9772640ebf6f39c5584b4a Mon Sep 17 00:00:00 2001 From: emmarousseau Date: Mon, 9 Sep 2024 08:49:14 +0200 Subject: [PATCH 03/13] Sortmerna (#146) --- CHANGELOG.md | 3 + src/sortmerna/config.vsh.yaml | 290 ++++++++++++++++++++ src/sortmerna/help.txt | 319 ++++++++++++++++++++++ src/sortmerna/script.sh | 108 ++++++++ src/sortmerna/test.sh | 101 +++++++ src/sortmerna/test_data/rRNA/database1.fa | 24 ++ src/sortmerna/test_data/rRNA/database2.fa | 16 ++ src/sortmerna/test_data/reads_1.fq.gz | Bin 0 -> 189 bytes src/sortmerna/test_data/reads_2.fq.gz | Bin 0 -> 147 bytes src/sortmerna/test_data/script.sh | 8 + 10 files changed, 869 insertions(+) create mode 100644 src/sortmerna/config.vsh.yaml create mode 100644 src/sortmerna/help.txt create mode 100755 src/sortmerna/script.sh create mode 100644 src/sortmerna/test.sh create mode 100644 src/sortmerna/test_data/rRNA/database1.fa create mode 100644 src/sortmerna/test_data/rRNA/database2.fa create mode 100644 src/sortmerna/test_data/reads_1.fq.gz create mode 100644 src/sortmerna/test_data/reads_2.fq.gz create mode 100755 src/sortmerna/test_data/script.sh diff --git a/CHANGELOG.md b/CHANGELOG.md index 6534eed1..5041f082 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -142,6 +142,9 @@ - `bedtools_getfasta`: extract sequences from a FASTA file for each of the intervals defined in a BED/GFF/VCF file (PR #59). +* `sortmerna`: Local sequence alignment tool for mapping, clustering, and filtering rRNA from metatranscriptomic + data. (PR #146) + * `fq_subsample`: Sample a subset of records from single or paired FASTQ files (PR #147). ## MINOR CHANGES diff --git a/src/sortmerna/config.vsh.yaml b/src/sortmerna/config.vsh.yaml new file mode 100644 index 00000000..6477660f --- /dev/null +++ b/src/sortmerna/config.vsh.yaml @@ -0,0 +1,290 @@ +name: sortmerna +description: | + Local sequence alignment tool for filtering, mapping and clustering. The main + application of SortMeRNA is filtering rRNA from metatranscriptomic data. +keywords: [sort, mRNA, rRNA, alignment, filtering, mapping, clustering] +links: + homepage: https://sortmerna.readthedocs.io/en/latest/ + documentation: https://sortmerna.readthedocs.io/en/latest/manual4.0.html + repository: https://github.com/sortmerna/sortmerna +references: + doi: 10.1093/bioinformatics/bts611 +license: GPL-3.0 + +argument_groups: +- name: "Input" + arguments: + - name: "--paired" + type: boolean_true + description: | + Reads are paired-end. If a single reads file is provided, use this option + to indicate the file contains interleaved paired reads when neither + 'paired_in' | 'paired_out' | 'out2' | 'sout' are specified. + - name: "--input" + type: file + multiple: true + description: Input fastq + - name: "--ref" + type: file + multiple: true + description: Reference fasta file(s) for rRNA database. + - name: "--ribo_database_manifest" + type: file + description: Text file containing paths to fasta files (one per line) that will be used to create the database for SortMeRNA. + +- name: "Output" + arguments: + - name: "--log" + type: file + direction: output + must_exist: false + example: $id.sortmerna.log + description: Sortmerna log file. + - name: "--output" + alternatives: ["--aligned"] + type: string + description: | + Directory and file prefix for aligned output. The appropriate extension: + (fasta|fastq|blast|sam|etc) is automatically added. + If 'dir' is not specified, the output is created in the WORKDIR/out/. + If 'pfx' is not specified, the prefix 'aligned' is used. + - name: "--other" + type: string + description: Create Non-aligned reads output file with this path/prefix. Must be used with fastx. + +- name: "Options" + arguments: + - name: "--kvdb" + type: string + description: Path to directory of the key-value database file, used for storing the alignment results. + - name: "--idx_dir" + type: string + description: Path to the directory for storing the reference index files. + - name: "--readb" + type: string + description: Path to the directory for storing pre-processed reads. + - name: "--fastx" + type: boolean_true + description: Output aligned reads into FASTA/FASTQ file + - name: "--sam" + type: boolean_true + description: Output SAM alignment for aligned reads. + - name: "--sq" + type: boolean_true + description: Add SQ tags to the SAM file + - name: "--blast" + type: string + description: | + Blast options: + * '0' - pairwise + * '1' - tabular(Blast - m 8 format) + * '1 cigar' - tabular + column for CIGAR + * '1 cigar qcov' - tabular + columns for CIGAR and query coverage + * '1 cigar qcov qstrand' - tabular + columns for CIGAR, query coverage and strand + choices: ['0', '1', '1 cigar', '1 cigar qcov', '1 cigar qcov qstrand'] + - name: "--num_alignments" + type: integer + description: | + Report first INT alignments per read reaching E-value. If Int = 0, all alignments will be output. Default: '0' + example: 0 + - name: "--min_lis" + type: integer + description: | + search all alignments having the first INT longest LIS. LIS stands for Longest Increasing Subsequence, it is + computed using seeds’ positions to expand hits into longer matches prior to Smith-Waterman alignment. Default: '2'. + example: 2 + - name: "--print_all_reads" + type: boolean_true + description: output null alignment strings for non-aligned reads to SAM and/or BLAST tabular files. + - name: "--paired_in" + type: boolean_true + description: | + In the case where a pair of reads is aligned with a score above the threshold, the output of the reads is controlled + by the following options: + * --paired_in and --paired_out are both false: Only one read per pair is output to the aligned fasta file. + * --paired_in is true and --paired_out is false: Both reads of the pair are output to the aligned fasta file. + * --paired_in is false and --paired_out is true: Both reads are output the the other fasta file (if it is specified). + - name: "--paired_out" + type: boolean_true + description: See description of --paired_in. + - name: "--out2" + type: boolean_true + description: | + Output paired reads into separate files. Must be used with '--fastx'. If a single reads file is provided, this options + implies interleaved paired reads. When used with 'sout', four (4) output files for aligned reads will be generated: + 'aligned-paired-fwd, aligned-paired-rev, aligned-singleton-fwd, aligned-singleton-rev'. If 'other' option is also used, + eight (8) output files will be generated. + - name: "--sout" + type: boolean_true + description: | + Separate paired and singleton aligned reads. Must be used with '--fastx'. If a single reads file is provided, + this options implies interleaved paired reads. Cannot be used with '--paired_in' or '--paired_out'. + - name: "--zip_out" + type: string + description: | + Compress the output files. The possible values are: + * '1/true/t/yes/y' + * '0/false/f/no/n' + *'-1' (the same format as input - default) + The values are Not case sensitive. + choices: ['1', 'true', 't', 'yes', 'y', '0', 'false', 'f', 'no', 'n', '-1'] + example: "-1" + - name: "--match" + type: integer + description: | + Smith-Waterman score for a match (positive integer). Default: '2'. + example: 2 + - name: "--mismatch" + type: integer + description: | + Smith-Waterman penalty for a mismatch (negative integer). Default: '-3'. + example: -3 + - name: "--gap_open" + type: integer + description: | + Smith-Waterman penalty for introducing a gap (positive integer). Default: '5'. + example: 5 + - name: "--gap_ext" + type: integer + description: | + Smith-Waterman penalty for extending a gap (positive integer). Default: '2'. + example: 2 + - name: "--N" + type: integer + description: | + Smith-Waterman penalty for ambiguous letters (N’s) scored as --mismatch. Default: '-1'.\ + example: -1 + - name: "--a" + type: integer + description: | + Number of threads to use. Default: '1'. + example: 1 + - name: "--e" + type: double + description: | + E-value threshold. Default: '1'. + example: 1 + - name: "--F" + type: boolean_true + description: Search only the forward strand. + - name: "--R" + type: boolean_true + description: Search only the reverse-complementary strand. + - name: "--num_alignment" + type: integer + description: | + Report first INT alignments per read reaching E-value (--num_alignments 0 signifies all alignments will be output). + Default: '-1' + example: -1 + - name: "--best" + type: integer + description: | + Report INT best alignments per read reaching E-value by searching --min_lis INT candidate alignments (--best 0 + signifies all candidate alignments will be searched) Default: '1'. + example: 1 + - name: "--verbose" + alternatives: ["-v"] + type: boolean_true + description: Verbose output. + +- name: "OTU picking options" + arguments: + - name: "--id" + type: double + description: | + %id similarity threshold (the alignment must still pass the E-value threshold). Default: '0.97'. + example: 0.97 + - name: "--coverage" + type: double + description: | + %query coverage threshold (the alignment must still pass the E-value threshold). Default: '0.97'. + example: 0.97 + - name: "--de_novo" + type: boolean_true + description: | + FASTA/FASTQ file for reads matching database < %id off (set using --id) and < %cov (set using --coverage) + (alignment must still pass the E-value threshold). + - name: "--otu_map" + type: boolean_true + description: | + Output OTU map (input to QIIME’s make_otu_table.py). + +- name: "Advanced options" + arguments: + - name: "--num_seed" + type: integer + description: | + Number of seeds matched before searching for candidate LIS. Default: '2'. + example: 2 + - name: "--passes" + type: integer + multiple: true + description: | + Three intervals at which to place the seed on the read L,L/2,3 (L is the seed length set in ./indexdb_rna). + - name: "--edge" + type: string + description: | + The number (or percentage if followed by %) of nucleotides to add to each edge of the alignment region on the + reference sequence before performing Smith-Waterman alignment. Default: '4'. + example: 4 + - name: "--full_search" + type: boolean_true + description: | + Search for all 0-error and 1-error seed off matches in the index rather than stopping after finding a 0-error match + (<1% gain in sensitivity with up four-fold decrease in speed). + +- name: "Indexing Options" + arguments: + - name: "--index" + type: integer + description: | + Create index files for the reference database. By default when this option is not used, the program checks the + reference index and builds it if not already existing. + This can be changed by using '-index' as follows: + * '-index 0' - skip indexing. If the index does not exist, the program will terminate + and warn to build the index prior performing the alignment + * '-index 1' - only perform the indexing and terminate + * '-index 2' - the default behaviour, the same as when not using this option at all + example: 2 + choices: [0, 1, 2] + - name: "-L" + type: double + description: | + Indexing seed length. Default: '18' + example: 18 + - name: "--interval" + type: integer + description: | + Index every Nth L-mer in the reference database. Default: '1' + example: 1 + - name: "--max_pos" + type: integer + description: | + Maximum number of positions to store for each unique L-mer. Set to 0 to store all positions. Default: '1000' + example: 1000 + + + +resources: + - type: bash_script + path: script.sh + +test_resources: + - type: bash_script + path: test.sh + - path: test_data + +engines: +- type: docker + image: ubuntu:22.04 + setup: + - type: docker + run: | + apt-get update && \ + apt-get install -y --no-install-recommends gzip cmake g++ wget && \ + apt-get clean && \ + wget --no-check-certificate https://github.com/sortmerna/sortmerna/releases/download/v4.3.6/sortmerna-4.3.6-Linux.sh && \ + bash sortmerna-4.3.6-Linux.sh --skip-license +runners: +- type: executable +- type: nextflow \ No newline at end of file diff --git a/src/sortmerna/help.txt b/src/sortmerna/help.txt new file mode 100644 index 00000000..f0842707 --- /dev/null +++ b/src/sortmerna/help.txt @@ -0,0 +1,319 @@ +``` +sortmerna -h +``` + + + Program: SortMeRNA version 4.3.6 + Copyright: 2016-2020 Clarity Genomics BVBA: + Turnhoutseweg 30, 2340 Beerse, Belgium + 2014-2016 Knight Lab: + Department of Pediatrics, UCSD, La Jolla + 2012-2014 Bonsai Bioinformatics Research Group: + LIFL, University Lille 1, CNRS UMR 8022, INRIA Nord-Europe + Disclaimer: SortMeRNA comes with ABSOLUTELY NO WARRANTY; without even the + implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. + See the GNU Lesser General Public License for more details. + Contributors: Jenya Kopylova jenya.kopylov@gmail.com + Laurent Noé laurent.noe@lifl.fr + Pierre Pericard pierre.pericard@lifl.fr + Daniel McDonald wasade@gmail.com + Mikaël Salson mikael.salson@lifl.fr + Hélène Touzet helene.touzet@lifl.fr + Rob Knight robknight@ucsd.edu + + Usage: sortmerna -ref FILE [-ref FILE] -reads FWD_READS [-reads REV_READS] [OPTIONS]: + ------------------------------------------------------------------------------------------------------------- + | option type-format description default | + ------------------------------------------------------------------------------------------------------------- + + [REQUIRED] + --ref PATH Required Reference file (FASTA) absolute or relative path. + + Use mutliple times, once per a reference file + + + --reads PATH Required Raw reads file (FASTA/FASTQ/FASTA.GZ/FASTQ.GZ). + + Use twice for files with paired reads. + The file extensions are Not important. The program automatically + recognizes the file format as flat/compressed, fasta/fastq + + + + [COMMON] + --workdir PATH Optional Workspace directory USRDIR/sortmerna/run/ + + Default structure: WORKDIR/ + idx/ (References index) + kvdb/ (Key-value storage for alignments) + out/ (processing output) + readb/ (pre-processed reads/index) + + + --kvdb PATH Optional Directory for Key-value database WORKDIR/kvdb + + KVDB is used for storing the alignment results. + + + --idx-dir PATH Optional Directory for storing Reference index. WORKDIR/idx + + + --readb PATH Optional Storage for pre-processed reads WORKDIR/readb/ + + Directory storing the split reads, or the random access index of compressed reads + + + --fastx BOOL Optional Output aligned reads into FASTA/FASTQ file + --sam BOOL Optional Output SAM alignment for aligned reads. + + + --SQ BOOL Optional Add SQ tags to the SAM file + + + --blast STR Optional output alignments in various Blast-like formats + + Sample values: '0' - pairwise + '1' - tabular (Blast - m 8 format) + '1 cigar' - tabular + column for CIGAR + '1 cigar qcov' - tabular + columns for CIGAR and query coverage + '1 cigar qcov qstrand' - tabular + columns for CIGAR, query coverage, + and strand + + + --aligned STR/BOOL Optional Aligned reads file prefix [dir/][pfx] WORKDIR/out/aligned + + Directory and file prefix for aligned output i.e. each + output file goes into the specified directory with the given prefix. + The appropriate extension: (fasta|fastq|blast|sam|etc) is automatically added. + Both 'dir' and 'pfx' are optional. + The 'dir' can be a relative or an absolute path. + If 'dir' is not specified, the output is created in the WORKDIR/out/ + If 'pfx' is not specified, the prefix 'aligned' is used + Examples: + '-aligned $MYDIR/dir_1/dir_2/1' -> $MYDIR/dir_1/dir_2/1.fasta + '-aligned dir_1/apfx' -> $PWD/dir_1/apfx.fasta + '-aligned dir_1/' -> $PWD/aligned.fasta + '-aligned apfx' -> $PWD/apfx.fasta + '-aligned (no argument)' -> WORKDIR/out/aligned.fasta + + + --other STR/BOOL Optional Non-aligned reads file prefix [dir/][pfx] WORKDIR/out/other + + Directory and file prefix for non-aligned output i.e. each + output file goes into the specified directory with the given prefix. + The appropriate extension: (fasta|fastq|blast|sam|etc) is automatically added. + Must be used with 'fastx'. + Both 'dir' and 'pfx' are optional. + The 'dir' can be a relative or an absolute path. + If 'dir' is not specified, the output is created in the WORKDIR/out/ + If 'pfx' is not specified, the prefix 'other' is used + Examples: + '-other $MYDIR/dir_1/dir_2/1' -> $MYDIR/dir_1/dir_2/1.fasta + '-other dir_1/apfx' -> $PWD/dir_1/apfx.fasta + '-other dir_1/' -> $PWD/dir_1/other.fasta + '-other apfx' -> $PWD/apfx.fasta + '-other (no argument)' -> aligned_out/other.fasta + i.e. the same output directory + as used for aligned output + + + --num_alignments INT Optional Positive integer (INT >=0). + + If used with '-no-best' reports first INT alignments per read reaching + E-value threshold, which allows to lower the CPU time and memory use. + Otherwise outputs INT best alignments. + If INT = 0, all alignments are output + + + --no-best BOOL Optional Disable best alignments search False + + The 'best' alignment is the highest scoring alignment out of All alignments of a read, + and the read can potentially be aligned (reaching E-value threshold) to multiple reference + sequences. + By default the program searches for best alignments i.e. performs an exhaustive search + over all references. Using '-no-best' will make the program to search just + the first N alignments, where N is set using '-num_alignments' i.e. 1 by default. + + + --min_lis INT Optional Search only alignments that have the LIS 2 + of at least N seeds long + + LIS stands for Longest Increasing Subsequence. It is computed using seeds, which + are k-mers common to the read and the reference sequence. Sorted sequences of such seeds + are used to filter the candidate references prior performing the Smith-Waterman alignment. + + + --print_all_reads BOOL Optional Output null alignment strings for non-aligned reads False + to SAM and/or BLAST tabular files + + --paired BOOL Optional Flags paired reads False + + If a single reads file is provided, use this option to indicate + the file contains interleaved paired reads when neither + 'paired_in' | 'paired_out' | 'out2' | 'sout' are specified. + + + --paired_in BOOL Optional Flags the paired-end reads as Aligned, False + when either of them is Aligned. + + With this option both reads are output into Aligned FASTA/Q file + Must be used with 'fastx'. + Mutually exclusive with 'paired_out'. + + + --paired_out BOOL Optional Flags the paired-end reads as Non-aligned, False + when either of them is non-aligned. + + With this option both reads are output into Non-Aligned FASTA/Q file + Must be used with 'fastx'. + Mutually exclusive with 'paired_in'. + + + --out2 BOOL Optional Output paired reads into separate files. False + + Must be used with 'fastx'. + If a single reads file is provided, this options implies interleaved paired reads + When used with 'sout', four (4) output files for aligned reads will be generated: + 'aligned-paired-fwd, aligned-paired-rev, aligned-singleton-fwd, aligned-singleton-rev'. + If 'other' option is also used, eight (8) output files will be generated. + + + --sout BOOL Optional Separate paired and singleton aligned reads. False + + To be used with 'fastx'. + If a single reads file is provided, this options implies interleaved paired reads + Cannot be used with 'paired_in' | 'paired_out' + + + --zip-out STR/BOOL Optional Controls the output compression '-1' + + By default the report files are produced in the same format as the input i.e. + if the reads files are compressed (gz), the output is also compressed. + The default behaviour can be overriden by using '-zip-out'. + The possible values: '1/true/t/yes/y' + '0/false/f/no/n' + '-1' (the same format as input - default) + The values are Not case sensitive i.e. 'Yes, YES, yEs, Y, y' are all OK + Examples: + '-reads freads.gz -zip-out n' : generate flat output when the input is compressed + '-reads freads.flat -zip-out' : compress the output when the input files are flat + + + --match INT Optional SW score (positive integer) for a match. 2 + + --mismatch INT Optional SW penalty (negative integer) for a mismatch. -3 + + --gap_open INT Optional SW penalty (positive integer) for introducing a gap. 5 + + --gap_ext INT Optional SW penalty (positive integer) for extending a gap. 2 + + -e DOUBLE Optional E-value threshold. 1 + + Defines the 'statistical significance' of a local alignment. + Exponentially correllates with the Minimal Alignment score. + Higher E-values (100, 1000, ...) cause More reads to Pass the alignment threshold + + + -F BOOL Optional Search only the forward strand. False + + -N BOOL Optional SW penalty for ambiguous letters (N's) scored + as --mismatch + + -R BOOL Optional Search only the reverse-complementary strand. False + + + [OTU_PICKING] + --id INT Optional %%id similarity threshold (the alignment 0.97 + must still pass the E-value threshold). + + --coverage INT Optional %%query coverage threshold (the alignment must 0.97 + still pass the E-value threshold) + + --de_novo_otu BOOL Optional Output FASTA file with 'de novo' reads False + + Read is 'de novo' if its alignment score passes E-value threshold, but both the identity + '-id', and the '-coverage' are below their corresponding thresholds + i.e. ID < %%id and COV < %%cov + + + --otu_map BOOL Optional Output OTU map (input to QIIME's make_otu_table.py). False + Cannot be used with 'no-best because + the grouping is done around the best alignment' + + + [ADVANCED] + --passes INT,INT,INT Optional Three intervals at which to place the seed on L,L/2,3 + the read (L is the seed length) + + --edges INT Optional Number (or percent if INT followed by %% sign) of 4 + nucleotides to add to each edge of the read + prior to SW local alignment + + --num_seeds BOOL Optional Number of seeds matched before searching 2 + for candidate LIS + + --full_search INT Optional Search for all 0-error and 1-error seed False + matches in the index rather than stopping + after finding a 0-error match (<1%% gain in + sensitivity with up four-fold decrease in speed) + + --pid BOOL Optional Add pid to output file names. False + + -a INT Optional DEPRECATED in favour of '-threads'. Number of numCores + processing threads to use. + Automatically redirects to '-threads' + + --threads INT Optional Number of Processing threads to use 2 + + + [INDEXING] + --index INT Optional Build reference database index 2 + + By default when this option is not used, the program checks the reference index and + builds it if not already existing. + This can be changed by using '-index' as follows: + '-index 0' - skip indexing. If the index does not exist, the program will terminate + and warn to build the index prior performing the alignment + '-index 1' - only perform the indexing and terminate + '-index 2' - the default behaviour, the same as when not using this option at all + + + -L DOUBLE Optional Indexing: seed length. 18 + + -m DOUBLE Optional Indexing: the amount of memory (in Mbytes) for 3072 + building the index. + + -v BOOL Optional Produce verbose output when building the index True + + --interval INT Optional Indexing: Positive integer: index every Nth L-mer in 1 + the reference database e.g. '-interval 2'. + + --max_pos INT Optional Indexing: maximum (integer) number of positions to 1000 + store for each unique L-mer. + If 0 - all positions are stored. + + + [HELP] + -h BOOL Optional Print help information + + --version BOOL Optional Print SortMeRNA version number + + + [DEVELOPER] + --dbg_put_db BOOL Optional + --cmd BOOL Optional Launch an interactive session (command prompt) False + + --task INT Optional Processing Task 4 + + Possible values: 0 - align. Only perform alignment + 1 - post-processing (log writing) + 2 - generate reports + 3 - align and post-process + 4 - all + + + --dbg-level INT Optional Debug level 0 + + Controls verbosity of the execution trace. Default value of 0 corresponds to + the least verbose output. + The highest value currently is 2. diff --git a/src/sortmerna/script.sh b/src/sortmerna/script.sh new file mode 100755 index 00000000..8dda3d60 --- /dev/null +++ b/src/sortmerna/script.sh @@ -0,0 +1,108 @@ +#!/bin/bash + +## VIASH START +## VIASH END + +set -eo pipefail + +unset_if_false=( par_fastx par_sq par_fastx par_print_all_reads par_paired_in par_paired_out + par_F par_R par_verbose par_de_novo par_otu_map par_full_search par_out2 + par_sout par_sam par_paired ) + + +for var in "${unset_if_false[@]}"; do + if [ "${!var}" == "false" ]; then + unset $var + fi +done + +reads=() +IFS=";" read -ra input <<< "$par_input" +if [ "${#input[@]}" -eq 2 ]; then + reads="--reads ${input[0]} --reads ${input[1]}" + # set paired to true in case it's not + par_paired=true +else + reads="--reads ${input[0]}" + par_paired=false +fi + +refs=() + +# check if references are input normally or through a manifest file +if [[ ! -z "$par_ribo_database_manifest" ]]; then + while IFS= read -r path || [[ -n $path ]]; do + refs=$refs" --ref $path" + done < $par_ribo_database_manifest + +elif [[ ! -z "$par_ref" ]]; then + IFS=";" read -ra ref <<< "$par_ref" + # check if length is 2 and par_paired is set to true + if [[ "${#ref[@]}" -eq 2 && "$par_paired" == "true" ]]; then + refs="--ref ${ref[0]} --ref ${ref[1]}" + # check if length is 1 and par_paired is set to false + elif [[ "${#ref[@]}" -eq 1 && "$par_paired" == "false" ]]; then + refs="--ref $par_ref" + else # if one reference provided but paired is set to true: + echo "Two reference fasta files are required for paired-end reads" + exit 1 + fi +else + echo "No reference fasta file(s) provided" + exit 1 +fi + + +sortmerna \ + $refs \ + $reads \ + --workdir . \ + ${par_output:+--aligned "${par_output}"} \ + ${par_fastx:+--fastx} \ + ${par_other:+--other "${par_other}"} \ + ${par_kvdb:+--kvdb "${par_kvdb}"} \ + ${par_idx_dir:+--idx-dir "${par_idx_dir}"} \ + ${par_readb:+--readb "${par_readb}"} \ + ${par_sam:+--sam} \ + ${par_sq:+--sq} \ + ${par_blast:+--blast "${par_blast}"} \ + ${par_num_alignments:+--num_alignments "${par_num_alignments}"} \ + ${par_min_lis:+--min_lis "${par_min_lis}"} \ + ${par_print_all_reads:+--print_all_reads} \ + ${par_paired_in:+--paired_in} \ + ${par_paired_out:+--paired_out} \ + ${par_out2:+--out2} \ + ${par_sout:+--sout} \ + ${par_zip_out:+--zip-out "${par_zip_out}"} \ + ${par_match:+--match "${par_match}"} \ + ${par_mismatch:+--mismatch "${par_mismatch}"} \ + ${par_gap_open:+--gap_open "${par_gap_open}"} \ + ${par_gap_ext:+--gap_ext "${par_gap_ext}"} \ + ${par_N:+-N "${par_N}"} \ + ${par_a:+-a "${par_a}"} \ + ${par_e:+-e "${par_e}"} \ + ${par_F:+-F} \ + ${par_R:+-R} \ + ${par_num_alignment:+--num_alignment "${par_num_alignment}"} \ + ${par_best:+--best "${par_best}"} \ + ${par_verbose:+--verbose} \ + ${par_id:+--id "${par_id}"} \ + ${par_coverage:+--coverage "${par_coverage}"} \ + ${par_de_novo:+--de_novo} \ + ${par_otu_map:+--otu_map} \ + ${par_num_seed:+--num_seed "${par_num_seed}"} \ + ${par_passes:+--passes "${par_passes}"} \ + ${par_edge:+--edge "${par_edge}"} \ + ${par_full_search:+--full_search} \ + ${par_index:+--index "${par_index}"} \ + ${par_L:+-L $par_L} \ + ${par_interval:+--interval "${par_interval}"} \ + ${par_max_pos:+--max_pos "${par_max_pos}"} + + +if [ ! -z $par_log ]; then + mv "${par_output}.log" $par_log +fi + +exit 0 + diff --git a/src/sortmerna/test.sh b/src/sortmerna/test.sh new file mode 100644 index 00000000..390b9307 --- /dev/null +++ b/src/sortmerna/test.sh @@ -0,0 +1,101 @@ +#!/bin/bash + +echo ">>> Testing $meta_functionality_name" + +find $meta_resources_dir/test_data/rRNA -type f > test_data/rrna-db.txt + +echo ">>> Testing for paired-end reads and database manifest" +# out2 separates the read pairs into two files (one fwd and one rev) +# paired_in outputs both reads of a pair +# other is the output file for non-rRNA reads +"$meta_executable" \ + --output "rRNA_reads" \ + --other "non_rRNA_reads" \ + --input "$meta_resources_dir/test_data/reads_1.fq.gz;$meta_resources_dir/test_data/reads_2.fq.gz" \ + --ribo_database_manifest test_data/rrna-db.txt \ + --log test_log.log \ + --paired_in \ + --fastx \ + --out2 + + +echo ">> Checking if the correct files are present" +[[ -f "rRNA_reads_fwd.fq.gz" ]] || [[ -f "rRNA_reads_rev.fq.gz" ]] || { echo "rRNA output fastq file is missing!"; exit 1; } +[[ -s "rRNA_reads_fwd.fq.gz" ]] && [[ -s "rRNA_reads_rev.fq.gz" ]] || { echo "rRNA output fastq file is empty!"; exit 1; } +[[ -f "non_rRNA_reads_fwd.fq.gz" ]] || [[ -f "non_rRNA_reads_rev.fq.gz" ]] || { echo "Non-rRNA output fastq file is missing!"; exit 1;} +gzip -dk non_rRNA_reads_fwd.fq.gz +gzip -dk non_rRNA_reads_rev.fq.gz +[[ ! -s "non_rRNA_reads_fwd.fq" ]] && [[ ! -s "non_rRNA_reads_rev.fq" ]] || { echo "Non-rRNA output fastq file is not empty!"; exit 1;} + +rm -f rRNA_reads_fwd.fq.gz rRNA_reads_rev.fq.gz non_rRNA_reads_fwd.fq.gz non_rRNA_reads_rev.fq.gz test_log.log +rm -rf kvdb/ + +################################################################################ +echo ">>> Testing for paired-end reads and --ref and --paired_out argumens" +"$meta_executable" \ + --output "rRNA_reads" \ + --other "non_rRNA_reads" \ + --input "$meta_resources_dir/test_data/reads_1.fq.gz;$meta_resources_dir/test_data/reads_2.fq.gz" \ + --ref "$meta_resources_dir/test_data/rRNA/database1.fa;$meta_resources_dir/test_data/rRNA/database2.fa" \ + --log test_log.log \ + --paired_out \ + --fastx \ + --out2 + +echo ">> Checking if the correct files are present" +[[ -f "rRNA_reads_fwd.fq.gz" ]] || [[ -f "rRNA_reads_rev.fq.gz" ]] || { echo "rRNA output fastq file is missing!"; exit 1; } +gzip -dkf rRNA_reads_fwd.fq.gz +[[ ! -s "rRNA_reads_fwd.fq" ]] && [[ ! -s "rRNA_reads_rev.fq" ]] || { echo "rRNA output fastq file is not empty!"; exit 1; } +[[ -f "non_rRNA_reads_fwd.fq.gz" ]] || [[ -f "non_rRNA_reads_rev.fq.gz" ]] || { echo "Non-rRNA output fastq file is missing!"; exit 1;} +gzip -dkf non_rRNA_reads_fwd.fq.gz +gzip -dkf non_rRNA_reads_rev.fq.gz +[[ -s "non_rRNA_reads_fwd.fq" ]] && [[ -s "non_rRNA_reads_rev.fq" ]] || { echo "Non-rRNA output fastq file is empty!"; exit 1; } + +rm -f rRNA_reads_fwd.fq.gz rRNA_reads_rev.fq.gz non_rRNA_reads_fwd.fq.gz non_rRNA_reads_rev.fq.gz test_log.log +rm -rf kvdb/ + +################################################################################ + +echo ">>> Testing for single-end reads and --ref argument" +"$meta_executable" \ + --aligned "rRNA_reads" \ + --other "non_rRNA_reads" \ + --input $meta_resources_dir/test_data/reads_1.fq.gz \ + --ref $meta_resources_dir/test_data/rRNA/database1.fa \ + --log test_log.log \ + --fastx + +echo ">> Checking if the correct files are present" +[[ ! -f "rRNA_reads.fq.gz" ]] && echo "rRNA output fastq file is missing!" && exit 1 +gzip -dk rRNA_reads.fq.gz +[[ -s "rRNA_reads.fq" ]] && echo "rRNA output fastq file is not empty!" && exit 1 +[[ ! -f "non_rRNA_reads.fq.gz" ]] && echo "Non-rRNA output fastq file is missing!" && exit 1 +[[ ! -s "non_rRNA_reads.fq.gz" ]] && echo "Non-rRNA output fastq file is empty!" && exit 1 + +rm -f rRNA_reads.fq.gz non_rRNA_reads.fq.gz test_log.log +rm -rf kvdb/ + +################################################################################ + +echo ">>> Testing for single-end reads with singleton output files" +"$meta_executable" \ + --aligned "rRNA_reads" \ + --other "non_rRNA_reads" \ + --input "$meta_resources_dir/test_data/reads_1.fq.gz;$meta_resources_dir/test_data/reads_2.fq.gz" \ + --ribo_database_manifest test_data/rrna-db.txt \ + --log test_log.log \ + --fastx \ + --sout + +echo ">> Checking if the correct files are present" +[[ ! -f "rRNA_reads_paired.fq.gz" ]] && echo "Aligned paired fwd output fastq file is missing!" && exit 1 +[[ ! -f "rRNA_reads_singleton.fq.gz" ]] && echo "Aligned singleton fwd output fastq file is missing!" && exit 1 +[[ ! -f "non_rRNA_reads_fwd.fq" ]] && echo "Non-rRNA fwd output fastq file is missing!" && exit 1 +[[ ! -f "non_rRNA_reads_rev.fq" ]] && echo "Non-rRNA rev output fastq file is missing!" && exit 1 +[[ ! -f "non_rRNA_reads_singleton.fq.gz" ]] && echo "Non-rRNA singleton output fastq file is missing!" && exit 1 +[[ ! -f "non_rRNA_reads_paired.fq.gz" ]] && echo "Non-rRNA paired output fastq file is missing!" && exit 1 + + + +echo ">>> All tests passed" +exit 0 \ No newline at end of file diff --git a/src/sortmerna/test_data/rRNA/database1.fa b/src/sortmerna/test_data/rRNA/database1.fa new file mode 100644 index 00000000..bae23aba --- /dev/null +++ b/src/sortmerna/test_data/rRNA/database1.fa @@ -0,0 +1,24 @@ +>AY846379.1.1791 Eukaryota;Archaeplastida;Chloroplastida;Chlorophyta;Chlorophyceae;Sphaeropleales;Monoraphidium;Monoraphidium sp. Itas 9/21 14-6w +CCUGGUUGAUCCUGCCAGUAGUCAUAUGCUUGUCUCAAAGAUUAAGCCAUGCAUGUCUAAGUAUAAACUGCUUAUACUGU +GAAACUGCGAAUGGCUCAUUAAAUCAGUUAUAGUUUAUUUGAUGGUACCUCUACACGGAUAACCGUAGUAAUUCUAGAGC +UAAUACGUGCGUAAAUCCCGACUUCUGGAAGGGACGUAUUUAUUAGAUAAAAGGCCGACCGAGCUUUGCUCGACCCGCGG +UGAAUCAUGAUAACUUCACGAAUCGCAUAGCCUUGUGCUGGCGAUGUUUCAUUCAAAUUUCUGCCCUAUCAACUUUCGAU +GGUAGGAUAGAGGCCUACCAUGGUGGUAACGGGUGACGGAGGAUUAGGGUUCGAUUCCGGAGAGGGAGCCUGAGAAACGG +CUACCACAUCCAAGGAAGGCAGCAGGCGCGCAAAUUACCCAAUCCUGAUACGGGGAGGUAGUGACAAUAAAUAACAAUGC +CGGGCAUUUCAUGUCUGGCAAUUGGAAUGAGUACAAUCUAAAUCCCUUAACGAGGAUCAAUUGGAGGGCAAGUCUGGUGC +CAGCAGCCGCGGUAAUUCCAGCUCCAAUAGCGUAUAUUUAAGUUGUUGCAGUUAAAAAGCUCGUAGUUGGAUUUCGGGUG +GGUUCCAGCGGUCCGCCUAUGGUGAGUACUGCUGUGGCCCUCCUUUUUGUCGGGGACGGGCUCCUGGGCUUCAUUGUCCG +GGACUCGGAGUCGACGAUGAUACUUUGAGUAAAUUAGAGUGUUCAAAGCAAGCCUACGCUCUGAAUACUUUAGCAUGGAA +UAUCGCGAUAGGACUCUGGCCUAUCUCGUUGGUCUGUAGGACCGGAGUAAUGAUUAAGAGGGACAGUCGGGGGCAUUCGU +AUUUCAUUGUCAGAGGUGAAAUUCUUGGAUUUAUGAAAGACGAACUACUGCGAAAGCAUUUGCCAAGGAUGUUUUCAUUA +AUCAAGAACGAAAGUUGGGGGCUCGAAGACGAUUAGAUACCGUCGUAGUCUCAACCAUAAACGAUGCCGACUAGGGAUUG +GAGGAUGUUCUUUUGAUGACUUCUCCAGCACCUUAUGAGAAAUCAAAGUUUUUGGGUUCCGGGGGGAGUAUGGUCGCAAG +GCUGAAACUUAAAGGAAUUGACGGAAGGGCACCACCAGGCGUGGAGCCUGCGGCUUAAUUUGACUCAACACGGGAAAACU +UACCAGGUCCAGACAUAGUGAGGAUUGACAGAUUGAGAGCUCUUUCUUGAUUCUAUGGGUGGUGGUGCAUGGCCGUUCUU +AGUUGGUGGGUUGCCUUGUCAGGUUGAUUCCGGUAACGAACGAGACCUCAGCCUGCUAAAUAUGUCACAUUCGCUUUUUG +CGGAUGGCCGACUUCUUAGAGGGACUAUUGGCGUUUAGUCAAUGGAAGUAUGAGGCAAUAACAGGUCUGUGAUGCCCUUA +GAUGUUCUGGGCCGCACGCGCGCUACACUGACGCAUUCAGCAAGCCUAUCCUUGACCGAGAGGUCUGGGUAAUCUUUGAA +ACUGCGUCGUGAUGGGGAUAGAUUAUUGCAAUUAUUAGUCUUCAACGAGGAAUGCCUAGUAAGCGCAAGUCAUCAGCUUG +CGUUGAUUACGUCCCUGCCCUUUGUACACACCGCCCGUCGCUCCUACCGAUUGGGUGUGCUGGUGAAGUGUUCGGAUUGG +CAGAGCGGGUGGCAACACUUGCUUUUGCCGAGAAGUUCAUUAAACCCUCCCACCUAGAGGAAGGAGAAGUCGUAACAAGG +UUUCCGUAGGUGAACCUGCAGAAG \ No newline at end of file diff --git a/src/sortmerna/test_data/rRNA/database2.fa b/src/sortmerna/test_data/rRNA/database2.fa new file mode 100644 index 00000000..87b5bc99 --- /dev/null +++ b/src/sortmerna/test_data/rRNA/database2.fa @@ -0,0 +1,16 @@ +>AB001445.1.1538 Bacteria;Proteobacteria;Gammaproteobacteria;Pseudomonadales;Pseudomonadaceae;Pseudomonas;Pseudomonas amygdali pv. morsprunorum +AGAGUUUGAUCAUGGCUCAGAUUGAACGCUGGCGGCAGGCCUAACACAUGCAAGUCGAGCGGCAGCACGGGUACUUGUAC +CUGGUGGCGAGCGGCGGACGGGUGAGUAAUGCCUAGGAAUCUGCCUGGUAGUGGGGGAUAACGCUCGGAAACGGACGCUA +AUACCGCAUACGUCCUACGGGAGAAAGCAGGGGACCUUCGGGCCUUGCGCUAUCAGAUGAGCCUAGGUCGGAUUAGCUAG +UUGGUGAGGUAAUGGCUCACCAAGGCGACGAUCCGUAACUGGUCUGAGAGGAUGAUCAGUCACACUGGAACUGAGACACG +GUCCAGACUCCUACGGGAGGCAGCAGUGGGGAAUAUUGGACAAUGGGCGAAAGCCUGAUCCAGCCAUGCCGCGUGUGUGA +AGAAGGUCUUCGGAUUGUAAAGCACUUUAAGUUGGGAGGAAGGGCAGUUACCUAAUACGUAUCUGUUUUGACGUUACCGA +CAGAAUAAGCACCGGCUAACUCUGUGCCAGCAGCCGCGGUAAUACAGAGGGUGCAAGCGUUAAUCGGAAUUACUGGGCGU +AAAGCGCGCGUAGGUGGUUUGUUAAGUUGAAUGUGAAAUCCCCGGGCUCAACCUGGGAACUGCAUCCAAAACUGGCAAGC +UAGAGUAUGGUAGAGGGUGGUGGAAUUUCCUGUGUAGCGGUGAAAUGCGUAGAUAUAGGAAGGAACACCAGUGGCGAAGG +CGACCACCUGGACUGAUACUGACACUGAGGUGCGAAAGCGUGGGGAGCAAACAGGAUUAGAUACCCUGGUAGUCCACGCC +GUAAACGAUGUCAACUAGCCGUUGGGAGCCUUGAGCUCUUAGUGGCGCAGCUAACGCAUUAAGUUGACCGCCUGGGGAGU +ACGGCCGCAAGGUUAAAACUCAAAUGAAUUGACGGGGGCCCGCACAAGCGGUGGAGCAUGUGGUUUAAUUCGAAGCAACG +CGAAGAACCUUACCAGGCCUUGACAUCCAAUGAAUCCUUUAGAGAUAGAGGAGUGCCUUCGGGAGCAUUGAGACAGGUGC +UGCAUGGCUGUCGUCAGCUCGUGUCGUGAGAUGUUGGGUUAAGUCCCGUAACGAGCGCAACCCUUGUCCUUAGUUACCAG +CACGUCAUGGUGGGCACUCUAAGGAGACUGCCGGUGACAAACCGGAGGAAGGUGGGGAUGACGUCAAGUCAUCAUGGCCC diff --git a/src/sortmerna/test_data/reads_1.fq.gz b/src/sortmerna/test_data/reads_1.fq.gz new file mode 100644 index 0000000000000000000000000000000000000000..41c02a22dbbae13db84acf1e79bc4fc3fa8589e6 GIT binary patch literal 189 zcmV;u07CyCiwFo$iqvKR19D|yWOH9JE@p86wU0dx!Y~Yl_nZQWu>(o}P^}JqwIX+b zPO-%OPl6LsC<}stmpJjW<4E7M&Ycg1S#Apj3L$tq`=RbA_@+MuTFFyl z78alq=EMofi84dLb}3jyAYujED%nAymVqrZpNFl>Dky^%D&v_(cRIcb rrwC*NyuH|rwZ}M?)J Date: Tue, 10 Sep 2024 15:51:12 +0200 Subject: [PATCH 04/13] Bcftools stats (#142) * Initial Commit * Adding options to config * Update on script * update * Adding test 2 and 3 * Update on config and test * adding more tests * debugging and adding tests * Adding last tests * removing test_data dir * Update CHANGELOG.md * small changes * small change in help file * Requested changes --------- Co-authored-by: Jakub Majercik <57993790+jakubmajercik@users.noreply.github.com> --- CHANGELOG.md | 1 + src/bcftools/bcftools_stats/config.vsh.yaml | 240 +++++++++++++++ src/bcftools/bcftools_stats/help.txt | 35 +++ src/bcftools/bcftools_stats/script.sh | 56 ++++ src/bcftools/bcftools_stats/test.sh | 306 ++++++++++++++++++++ 5 files changed, 638 insertions(+) create mode 100644 src/bcftools/bcftools_stats/config.vsh.yaml create mode 100644 src/bcftools/bcftools_stats/help.txt create mode 100644 src/bcftools/bcftools_stats/script.sh create mode 100644 src/bcftools/bcftools_stats/test.sh diff --git a/CHANGELOG.md b/CHANGELOG.md index 5041f082..2dd152bb 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -42,6 +42,7 @@ * `rsem/rsem_prepare_reference`: Prepare transcript references for RSEM (PR #89). * `bcftools`: + - `bcftools/bcftools_stats`: Parses VCF or BCF and produces a txt stats file which can be plotted using plot-vcfstats (PR #142). - `bcftools/bcftools_sort`: Sorts BCF/VCF files by position and other criteria (PR #141). * `fastqc`: High throughput sequence quality control analysis tool (PR #92). diff --git a/src/bcftools/bcftools_stats/config.vsh.yaml b/src/bcftools/bcftools_stats/config.vsh.yaml new file mode 100644 index 00000000..8fb57f7a --- /dev/null +++ b/src/bcftools/bcftools_stats/config.vsh.yaml @@ -0,0 +1,240 @@ +name: bcftools_stats +namespace: bcftools +description: | + Parses VCF or BCF and produces a txt stats file which can be plotted using plot-vcfstats. + When two files are given, the program generates separate stats for intersection + and the complements. By default only sites are compared, -s/-S must given to include + also sample columns. +keywords: [Stats, VCF, BCF] +links: + homepage: https://samtools.github.io/bcftools/ + documentation: https://samtools.github.io/bcftools/bcftools.html#stats + repository: https://github.com/samtools/bcftools + issue_tracker: https://github.com/samtools/bcftools/issues +references: + doi: https://doi.org/10.1093/gigascience/giab008 +license: MIT/Expat, GNU +requirements: + commands: [bcftools] +authors: + - __merge__: /src/_authors/theodoro_gasperin.yaml + roles: [ author ] + +argument_groups: + - name: Inputs + arguments: + - name: --input + alternatives: -i + type: file + multiple: true + description: Input VCF/BCF file. Maximum of two files. + required: true + + - name: Outputs + arguments: + - name: --output + alternatives: -o + direction: output + type: file + description: Output txt statistics file. + required: true + + - name: Options + arguments: + + - name: --allele_frequency_bins + alternatives: --af_bins + type: string + description: | + Allele frequency bins, a list of bin values (0.1,0.5,1). + example: 0.1,0.5,1 + + - name: --allele_frequency_bins_file + alternatives: --af_bins_file + type: file + description: | + Same as allele_frequency_bins, but in a file. + Format of file is one value per line. + e.g. + 0.1 + 0.5 + 1 + + - name: --allele_frequency_tag + alternatives: --af_tag + type: string + description: | + Allele frequency tag to use, by default estimated from AN,AC or GT. + + - name: --first_allele_only + alternatives: --first_only + type: boolean_true + description: | + Include only 1st allele at multiallelic sites. + + - name: --collapse + alternatives: --c + type: string + choices: [ snps, indels, both, all, some, none ] + description: | + Treat as identical records with . + See https://samtools.github.io/bcftools/bcftools.html#common_options for details. + + - name: --depth + alternatives: --d + type: string + description: | + Depth distribution: min,max,bin size. + example: 0,500,1 + + - name: --exclude + alternatives: --e + type: string + description: | + Exclude sites for which the expression is true. + See https://samtools.github.io/bcftools/bcftools.html#expressions for details. + example: 'QUAL < 30 && DP < 10' + + - name: --exons + alternatives: --E + type: file + description: | + tab-delimited file with exons for indel frameshifts statistics. + The columns of the file are CHR, FROM, TO, with 1-based, inclusive, positions. + The file is BGZF-compressed and indexed with tabix (e.g. tabix -s1 -b2 -e3 file.gz). + + - name: --apply_filters + alternatives: --f + type: string + description: | + Require at least one of the listed FILTER strings (e.g. "PASS,."). + + - name: --fasta_reference + alternatives: --F + type: file + description: | + Faidx indexed reference sequence file to determine INDEL context. + + - name: --include + alternatives: --i + type: string + description: | + Select sites for which the expression is true. + See https://samtools.github.io/bcftools/bcftools.html#expressions for details. + example: 'QUAL >= 30 && DP >= 10' + + - name: --split_by_ID + alternatives: --I + type: boolean_true + description: | + Collect stats for sites with ID separately (known vs novel). + + - name: --regions + alternatives: --r + type: string + description: | + Restrict to comma-separated list of regions. + Following formats are supported: chr|chr:pos|chr:beg-end|chr:beg-[,…​]. + example: '20:1000000-2000000' + + - name: --regions_file + alternatives: --R + type: file + description: | + Restrict to regions listed in a file. + Regions can be specified either on a VCF, BED, or tab-delimited file (the default). + For more information check manual. + + - name: --regions_overlap + type: string + choices: ['pos', 'record', 'variant', '0', '1', '2'] + description: | + This option controls how overlapping records are determined: + set to 'pos' or '0' if the VCF record has to have POS inside a region (this corresponds to the default behavior of -t/-T); + set to 'record' or '1' if also overlapping records with POS outside a region should be included (this is the default behavior of -r/-R, + and includes indels with POS at the end of a region, which are technically outside the region); + or set to 'variant' or '2' to include only true overlapping variation (compare the full VCF representation "TA>T-" vs the true sequence variation "A>-"). + + - name: --samples + alternatives: --s + type: string + description: | + List of samples for sample stats, "-" to include all samples. + + - name: --samples_file + alternatives: --S + type: file + description: | + File of samples to include. + e.g. + sample1 1 + sample2 2 + sample3 2 + + - name: --targets + alternatives: --t + type: string + description: | + Similar as -r, --regions, but the next position is accessed by streaming the whole VCF/BCF + rather than using the tbi/csi index. Both -r and -t options can be applied simultaneously: -r uses the + index to jump to a region and -t discards positions which are not in the targets. Unlike -r, targets + can be prefixed with "^" to request logical complement. For example, "^X,Y,MT" indicates that + sequences X, Y and MT should be skipped. Yet another difference between the -t/-T and -r/-R is + that -r/-R checks for proper overlaps and considers both POS and the end position of an indel, + while -t/-T considers the POS coordinate only (by default; see also --regions-overlap and --targets-overlap). + Note that -t cannot be used in combination with -T. + Following formats are supported: chr|chr:pos|chr:beg-end|chr:beg-[,…​]. + example: '20:1000000-2000000' + + - name: --targets_file + alternatives: --T + type: file + description: | + Similar to --regions_file option but streams rather than index-jumps. + + - name: --targets_overlaps + type: string + choices: ['pos', 'record', 'variant', '0', '1', '2'] + description: | + Include if POS in the region (0), record overlaps (1), variant overlaps (2). + + - name: --user_tstv + alternatives: --u + type: string + description: | + Collect Ts/Tv stats for any tag using the given binning [0:1:100]. + Format is . + A subfield can be selected as e.g. 'PV4[0]', here the first value of the PV4 tag. + + + - name: --verbose + alternatives: --v + type: boolean_true + description: | + Produce verbose per-site and per-sample output. + +resources: + - type: bash_script + path: script.sh + +test_resources: + - type: bash_script + path: test.sh + +engines: + - type: docker + image: debian:stable-slim + setup: + - type: apt + packages: [bcftools, procps] + - type: docker + run: | + echo "bcftools: \"$(bcftools --version | grep 'bcftools' | sed -n 's/^bcftools //p')\"" > /var/software_versions.txt + test_setup: + - type: apt + packages: [tabix] + +runners: + - type: executable + - type: nextflow + diff --git a/src/bcftools/bcftools_stats/help.txt b/src/bcftools/bcftools_stats/help.txt new file mode 100644 index 00000000..e702e838 --- /dev/null +++ b/src/bcftools/bcftools_stats/help.txt @@ -0,0 +1,35 @@ +``` +bcftools stats -h +``` + +About: Parses VCF or BCF and produces stats which can be plotted using plot-vcfstats. + When two files are given, the program generates separate stats for intersection + and the complements. By default only sites are compared, -s/-S must given to include + also sample columns. +Usage: bcftools stats [options] [] + +Options: + --af-bins LIST Allele frequency bins, a list (0.1,0.5,1) or a file (0.1\n0.5\n1) + --af-tag STRING Allele frequency tag to use, by default estimated from AN,AC or GT + -1, --1st-allele-only Include only 1st allele at multiallelic sites + -c, --collapse STRING Treat as identical records with , see man page for details [none] + -d, --depth INT,INT,INT Depth distribution: min,max,bin size [0,500,1] + -e, --exclude EXPR Exclude sites for which the expression is true (see man page for details) + -E, --exons FILE.gz Tab-delimited file with exons for indel frameshifts (chr,beg,end; 1-based, inclusive, bgzip compressed) + -f, --apply-filters LIST Require at least one of the listed FILTER strings (e.g. "PASS,.") + -F, --fasta-ref FILE Faidx indexed reference sequence file to determine INDEL context + -i, --include EXPR Select sites for which the expression is true (see man page for details) + -I, --split-by-ID Collect stats for sites with ID separately (known vs novel) + -r, --regions REGION Restrict to comma-separated list of regions + -R, --regions-file FILE Restrict to regions listed in a file + --regions-overlap 0|1|2 Include if POS in the region (0), record overlaps (1), variant overlaps (2) [1] + -s, --samples LIST List of samples for sample stats, "-" to include all samples + -S, --samples-file FILE File of samples to include + -t, --targets REGION Similar to -r but streams rather than index-jumps + -T, --targets-file FILE Similar to -R but streams rather than index-jumps + --targets-overlap 0|1|2 Include if POS in the region (0), record overlaps (1), variant overlaps (2) [0] + -u, --user-tstv TAG[:min:max:n] Collect Ts/Tv stats for any tag using the given binning [0:1:100] + A subfield can be selected as e.g. 'PV4[0]', here the first value of the PV4 tag + --threads INT Use multithreading with worker threads [0] + -v, --verbose Produce verbose per-site and per-sample output + diff --git a/src/bcftools/bcftools_stats/script.sh b/src/bcftools/bcftools_stats/script.sh new file mode 100644 index 00000000..119502fd --- /dev/null +++ b/src/bcftools/bcftools_stats/script.sh @@ -0,0 +1,56 @@ +#!/bin/bash + +## VIASH START +## VIASH END + +# Exit on error +set -eo pipefail + +# Unset parameters +unset_if_false=( + par_first_allele_only + par_split_by_ID + par_verbose +) + +for par in ${unset_if_false[@]}; do + test_val="${!par}" + [[ "$test_val" == "false" ]] && unset $par +done + +# Create input array +IFS=";" read -ra input <<< $par_input + +# Check the size of the input array +if [[ ${#input[@]} -gt 2 ]]; then + echo "Error: --input only takes a max of two files!" + exit 1 +fi + +# Execute bcftools stats with the provided arguments +bcftools stats \ + ${par_first_allele_only:+--1st-allele-only} \ + ${par_split_by_ID:+--split-by-ID} \ + ${par_verbose:+--verbose} \ + ${par_allele_frequency_bins:+--af-bins "${par_allele_frequency_bins}"} \ + ${par_allele_frequency_bins_file:+--af-bins "${par_allele_frequency_bins_file}"} \ + ${par_allele_frequency_tag:+--af-tag "${par_allele_frequency_tag}"} \ + ${par_collapse:+-c "${par_collapse}"} \ + ${par_depth:+-d "${par_depth}"} \ + ${par_exclude:+-e "${par_exclude}"} \ + ${par_exons:+-E "${par_exons}"} \ + ${par_apply_filters:+-f "${par_apply_filters}"} \ + ${par_fasta_reference:+-F "${par_fasta_reference}"} \ + ${par_include:+-i "${par_include}"} \ + ${par_regions:+-r "${par_regions}"} \ + ${par_regions_file:+-R "${par_regions_file}"} \ + ${par_regions_overlap:+--regions-overlap "${par_regions_overlap}"} \ + ${par_samples:+-s "${par_samples}"} \ + ${par_samples_file:+-S "${par_samples_file}"} \ + ${par_targets:+-t "${par_targets}"} \ + ${par_targets_file:+-T "${par_targets_file}"} \ + ${par_targets_overlaps:+--targets-overlap "${par_targets_overlaps}"} \ + ${par_user_tstv:+-u "${par_user_tstv}"} \ + "${input[@]}" \ + > $par_output + diff --git a/src/bcftools/bcftools_stats/test.sh b/src/bcftools/bcftools_stats/test.sh new file mode 100644 index 00000000..18f0256b --- /dev/null +++ b/src/bcftools/bcftools_stats/test.sh @@ -0,0 +1,306 @@ +#!/bin/bash + +## VIASH START +## VIASH END + +# Exit on error +set -eo pipefail + +#test_data="$meta_resources_dir/test_data" + +############################################# +# helper functions +assert_file_exists() { + [ -f "$1" ] || { echo "File '$1' does not exist" && exit 1; } +} +assert_file_not_empty() { + [ -s "$1" ] || { echo "File '$1' is empty but shouldn't be" && exit 1; } +} +assert_file_contains() { + grep -q "$2" "$1" || { echo "File '$1' does not contain '$2'" && exit 1; } +} +assert_identical_content() { + diff -a "$2" "$1" \ + || (echo "Files are not identical!" && exit 1) +} +############################################# + +# Create directories for tests +echo "Creating Test Data..." +TMPDIR=$(mktemp -d "$meta_temp_dir/XXXXXX") +function clean_up { + [[ -d "$TMPDIR" ]] && rm -r "$TMPDIR" +} +trap clean_up EXIT + +# Create test data +cat < "$TMPDIR/example.vcf" +##fileformat=VCFv4.0 +##fileDate=20090805 +##source=myImputationProgramV3.1 +##reference=1000GenomesPilot-NCBI36 +##contig= +##contig= +##phasing=partial +##INFO= +##INFO= +##INFO= +##INFO= +##INFO= +##INFO= +##INFO= +##INFO= +##FILTER= +##FILTER= +##FORMAT= +##FORMAT= +##FORMAT= +##FORMAT= +##ALT= +##ALT= +#CHROM POS ID REF ALT QUAL FILTER INFO FORMAT NA00001 NA00002 NA00003 +19 111 . A C 9.6 . . GT:HQ 0|0:10,10 0|0:10,10 0/1:3,3 +19 112 . A G 10 . . GT:HQ 0|0:10,10 0|0:10,10 0/1:3,3 +20 14370 rs6054257 G A 29 PASS NS=3;DP=14;AF=0.5;DB;H2 GT:GQ:DP:HQ 0|0:48:1:51,51 1|0:48:8:51,51 1/1:43:5:.,. +20 17330 . T A 3 q10 NS=3;DP=11;AF=0.017 GT:GQ:DP:HQ 0|0:49:3:58,50 0|1:3:5:65,3 0/0:41:3:.,. +20 1110696 rs6040355 A G,T 67 PASS NS=2;DP=10;AF=0.333,0.667;AA=T;DB GT:GQ:DP:HQ 1|2:21:6:23,27 2|1:2:0:18,2 2/2:35:4:.,. +20 1230237 . T . 47 PASS NS=3;DP=13;AA=T GT:GQ:DP:HQ 0|0:54:.:56,60 0|0:48:4:51,51 0/0:61:2:.,. +20 1234567 microsat1 G GA,GAC 50 PASS NS=3;DP=9;AA=G;AN=6;AC=3,1 GT:GQ:DP 0/1:.:4 0/2:17:2 1/1:40:3 +20 1235237 . T . . . . GT 0/0 0|0 ./. +EOF + +bgzip -c $TMPDIR/example.vcf > $TMPDIR/example.vcf.gz +tabix -p vcf $TMPDIR/example.vcf.gz + +cat < "$TMPDIR/exons.bed" +chr19 12345 12567 +chr20 23456 23789 +EOF + +# Compressing and indexing the exons file +bgzip -c $TMPDIR/exons.bed > $TMPDIR/exons.bed.gz +tabix -s1 -b2 -e3 $TMPDIR/exons.bed.gz + +# Create fai test file +# cat < "$TMPDIR/reference.fasta.fai" +# 19 100 895464957 60 61 +# 20 10000 1083893029 60 61 +# EOF + +# Create allele frequency bins file +cat < "$TMPDIR/allele_frequency_bins.txt" +0.1 +0.2 +0.3 +0.4 +0.5 +0.6 +0.7 +0.8 +0.9 +EOF + +# Test 1: Default Use +mkdir "$TMPDIR/test1" && pushd "$TMPDIR/test1" > /dev/null + +echo "> Run bcftools_stats on VCF file" +"$meta_executable" \ + --input "../example.vcf" \ + --output "stats.txt" \ + +# checks +assert_file_exists "stats.txt" +assert_file_not_empty "stats.txt" +assert_file_contains "stats.txt" "bcftools stats ../example.vcf" +echo "- test1 succeeded -" + +popd > /dev/null + +# Test 2: First allele only +mkdir "$TMPDIR/test2" && pushd "$TMPDIR/test2" > /dev/null + +echo "> Run bcftools_stats on VCF file with first allele only" +"$meta_executable" \ + --input "../example.vcf" \ + --output "stats.txt" \ + --first_allele_only \ + --allele_frequency_bins "0.1,0.2,0.3,0.4,0.5,0.6,0.7,0.8,0.9" \ + --allele_frequency_tag "AF" \ + +# checks +assert_file_exists "stats.txt" +assert_file_not_empty "stats.txt" +assert_file_contains "stats.txt" "bcftools stats --1st-allele-only --af-bins 0.1,0.2,0.3,0.4,0.5,0.6,0.7,0.8,0.9 --af-tag AF ../example.vcf" +echo "- test2 succeeded -" + +popd > /dev/null + +# Test 3: Split by ID +mkdir "$TMPDIR/test3" && pushd "$TMPDIR/test3" > /dev/null + +echo "> Run bcftools_stats on VCF file with split by ID" +"$meta_executable" \ + --input "../example.vcf" \ + --output "stats.txt" \ + --split_by_ID \ + +# checks +assert_file_exists "stats.txt" +assert_file_not_empty "stats.txt" +assert_file_contains "stats.txt" "bcftools stats --split-by-ID ../example.vcf" +echo "- test3 succeeded -" + +popd > /dev/null + +# Test 4: Collapse, Depth, Exclude +mkdir "$TMPDIR/test4" && pushd "$TMPDIR/test4" > /dev/null + +echo "> Run bcftools_stats on VCF file with collapse, depth, and exclude" +"$meta_executable" \ + --input "../example.vcf" \ + --output "stats.txt" \ + --depth "0,500,1" \ + --exclude "GT='mis'" \ + --collapse "snps" \ + +# checks +assert_file_exists "stats.txt" +assert_file_not_empty "stats.txt" +assert_file_contains "stats.txt" "bcftools stats -c snps -d 0,500,1 -e GT='mis' ../example.vcf" +echo "- test4 succeeded -" + +popd > /dev/null + +# Test 5: Exons, Apply Filters +mkdir "$TMPDIR/test5" && pushd "$TMPDIR/test5" > /dev/null + +echo "> Run bcftools_stats on VCF file with exons, apply filters, and fasta reference" +"$meta_executable" \ + --input "../example.vcf" \ + --output "stats.txt" \ + --exons "../exons.bed.gz" \ + --apply_filters "PASS" \ +# --fasta_reference "../reference.fasta.fai" \ + +# NOTE: fasta_reference option not included in testing because of error from bcftools stats. + +# checks +assert_file_exists "stats.txt" +assert_file_not_empty "stats.txt" +assert_file_contains "stats.txt" "bcftools stats -E ../exons.bed.gz -f PASS ../example.vcf" +#assert_file_contains "stats.txt" "bcftools stats -E ../exons.bed.gz -f PASS -F ../reference.fasta.fai ../example.vcf" +echo "- test5 succeeded -" + +popd > /dev/null + +# Test 6: Include, Regions +mkdir "$TMPDIR/test6" && pushd "$TMPDIR/test6" > /dev/null + +echo "> Run bcftools_stats on VCF file with include and regions options" +"$meta_executable" \ + --input "../example.vcf.gz" \ + --output "stats.txt" \ + --include "GT='mis'" \ + --regions "20:1000000-2000000" \ + +# checks +assert_file_exists "stats.txt" +assert_file_not_empty "stats.txt" +assert_file_contains "stats.txt" "bcftools stats -i GT='mis' -r 20:1000000-2000000 ../example.vcf.gz" +echo "- test6 succeeded -" + +popd > /dev/null + +# Test 7: Regions Overlap, Samples +mkdir "$TMPDIR/test7" && pushd "$TMPDIR/test7" > /dev/null + +echo "> Run bcftools_stats on VCF file with regions overlap, and samples options" +"$meta_executable" \ + --input "../example.vcf" \ + --output "stats.txt" \ + --regions_overlap "record" \ + --samples "NA00001,NA00002" \ + +# checks +assert_file_exists "stats.txt" +assert_file_not_empty "stats.txt" +assert_file_contains "stats.txt" "bcftools stats --regions-overlap record -s NA00001,NA00002 ../example.vcf" +echo "- test7 succeeded -" + +popd > /dev/null + +# Test 8: Targets, Targets File, Targets Overlaps +mkdir "$TMPDIR/test8" && pushd "$TMPDIR/test8" > /dev/null + +echo "> Run bcftools_stats on VCF file with targets, targets file, and targets overlaps" +"$meta_executable" \ + --input "../example.vcf" \ + --output "stats.txt" \ + --targets "20:1000000-2000000" \ + --targets_overlaps "pos" \ + +# checks +assert_file_exists "stats.txt" +assert_file_not_empty "stats.txt" +assert_file_contains "stats.txt" "bcftools stats -t 20:1000000-2000000 --targets-overlap pos ../example.vcf" +echo "- test8 succeeded -" + +popd > /dev/null + +# Test 9: User TSTV and Verbose +mkdir "$TMPDIR/test9" && pushd "$TMPDIR/test9" > /dev/null + +echo "> Run bcftools_stats on VCF file with user TSTV and verbose" +"$meta_executable" \ + --input "../example.vcf" \ + --output "stats.txt" \ + --user_tstv "DP" \ + --verbose \ + +# checks +assert_file_exists "stats.txt" +assert_file_not_empty "stats.txt" +assert_file_contains "stats.txt" "bcftools stats --verbose -u DP ../example.vcf" +echo "- test9 succeeded -" + +popd > /dev/null + +# Test 10: Two vcf files +mkdir "$TMPDIR/test10" && pushd "$TMPDIR/test10" > /dev/null + +echo "> Run bcftools_stats on two VCF files" +"$meta_executable" \ + --input "../example.vcf.gz" \ + --input "../example.vcf.gz" \ + --output "stats.txt" \ + +# checks +assert_file_exists "stats.txt" +assert_file_not_empty "stats.txt" +assert_file_contains "stats.txt" "bcftools stats ../example.vcf.gz ../example.vcf.gz" +echo "- test10 succeeded -" + +popd > /dev/null + +# Test 11: with allele frequency bins file option +mkdir "$TMPDIR/test11" && pushd "$TMPDIR/test11" > /dev/null + +echo "> Run bcftools_stats on VCF file with allele frequency bins file option" +"$meta_executable" \ + --input "../example.vcf" \ + --output "stats.txt" \ + --allele_frequency_bins "../allele_frequency_bins.txt" \ + +# checks +assert_file_exists "stats.txt" +assert_file_not_empty "stats.txt" +assert_file_contains "stats.txt" "bcftools stats --af-bins ../allele_frequency_bins.txt ../example.vcf" +echo "- test11 succeeded -" + +popd > /dev/null + + +echo "---- All tests succeeded! ----" +exit 0 + + From c3ba4a78497f7518725bb7d3e213b2a9bcee511e Mon Sep 17 00:00:00 2001 From: Theodoro Gasperin Terra Camargo <98555209+tgaspe@users.noreply.github.com> Date: Tue, 10 Sep 2024 15:53:13 +0200 Subject: [PATCH 05/13] Bcftools annotate (#143) * Initial commit * Update config.vsh.yaml * changes in config file * Update script.sh * Help File * Update script.sh * Update test.sh * bug fixing and adding tests * Update test.sh * Update test.sh * adding 3rd test * More tests * Moreee tests * Update test.sh * small changes * Update CHANGELOG.md * Update config.vsh.yaml * bug fixing on config * Requested changes --------- Co-authored-by: Jakub Majercik <57993790+jakubmajercik@users.noreply.github.com> --- CHANGELOG.md | 2 +- .../bcftools_annotate/config.vsh.yaml | 250 ++++++++++++++ src/bcftools/bcftools_annotate/help.txt | 41 +++ src/bcftools/bcftools_annotate/script.sh | 54 ++++ src/bcftools/bcftools_annotate/test.sh | 305 ++++++++++++++++++ 5 files changed, 651 insertions(+), 1 deletion(-) create mode 100644 src/bcftools/bcftools_annotate/config.vsh.yaml create mode 100644 src/bcftools/bcftools_annotate/help.txt create mode 100644 src/bcftools/bcftools_annotate/script.sh create mode 100644 src/bcftools/bcftools_annotate/test.sh diff --git a/CHANGELOG.md b/CHANGELOG.md index 2dd152bb..bb640d50 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -42,12 +42,12 @@ * `rsem/rsem_prepare_reference`: Prepare transcript references for RSEM (PR #89). * `bcftools`: + - `bcftools_annotate`: Add or remove annotations from a VCF/BCF file (PR #143). - `bcftools/bcftools_stats`: Parses VCF or BCF and produces a txt stats file which can be plotted using plot-vcfstats (PR #142). - `bcftools/bcftools_sort`: Sorts BCF/VCF files by position and other criteria (PR #141). * `fastqc`: High throughput sequence quality control analysis tool (PR #92). - ## MINOR CHANGES * `busco` components: update BUSCO to `5.7.1` (PR #72). diff --git a/src/bcftools/bcftools_annotate/config.vsh.yaml b/src/bcftools/bcftools_annotate/config.vsh.yaml new file mode 100644 index 00000000..67e8f46e --- /dev/null +++ b/src/bcftools/bcftools_annotate/config.vsh.yaml @@ -0,0 +1,250 @@ +name: bcftools_annotate +namespace: bcftools +description: | + Add or remove annotations from a VCF/BCF file. +keywords: [Annotate, VCF, BCF] +links: + homepage: https://samtools.github.io/bcftools/ + documentation: https://samtools.github.io/bcftools/bcftools.html#annotate + repository: https://github.com/samtools/bcftools + issue_tracker: https://github.com/samtools/bcftools/issues +references: + doi: https://doi.org/10.1093/gigascience/giab008 +license: MIT/Expat, GNU +requirements: + commands: [bcftools] +authors: + - __merge__: /src/_authors/theodoro_gasperin.yaml + roles: [author] + +argument_groups: + - name: Inputs + arguments: + - name: --input + alternatives: -i + type: file + multiple: true + description: Input VCF/BCF file. + required: true + + - name: Outputs + arguments: + - name: --output + alternatives: -o + direction: output + type: file + description: Output annotated file. + required: true + + - name: Options + description: | + For examples on how to use use bcftools annotate see http://samtools.github.io/bcftools/howtos/annotate.html. + For more details on the options see https://samtools.github.io/bcftools/bcftools.html#annotate. + arguments: + + - name: --annotations + alternatives: --a + type: file + description: | + VCF file or tabix-indexed FILE with annotations: CHR\tPOS[\tVALUE]+ . + + - name: --columns + alternatives: --c + type: string + description: | + List of columns in the annotation file, e.g. CHROM,POS,REF,ALT,-,INFO/TAG. + See man page for details. + + - name: --columns_file + alternatives: --C + type: file + description: | + Read -c columns from FILE, one name per row, with optional --merge_logic TYPE: NAME[ TYPE]. + + - name: --exclude + alternatives: --e + type: string + description: | + Exclude sites for which the expression is true. + See https://samtools.github.io/bcftools/bcftools.html#expressions for details. + example: 'QUAL >= 30 && DP >= 10' + + - name: --force + type: boolean_true + description: | + continue even when parsing errors, such as undefined tags, are encountered. + Note this can be an unsafe operation and can result in corrupted BCF files. + If this option is used, make sure to sanity check the result thoroughly. + + - name: --header_line + alternatives: --H + type: string + description: | + Header line which should be appended to the VCF header, can be given multiple times. + + - name: --header_lines + alternatives: --h + type: file + description: | + File with header lines to append to the VCF header. + For example: + ##INFO= + ##INFO= + + - name: --set_id + alternatives: --I + type: string + description: | + Set ID column using a `bcftools query`-like expression, see man page for details. + + - name: --include + type: string + description: | + Select sites for which the expression is true. + See https://samtools.github.io/bcftools/bcftools.html#expressions for details. + example: 'QUAL >= 30 && DP >= 10' + + - name: --keep_sites + alternatives: --k + type: boolean_true + description: | + Leave --include/--exclude sites unchanged instead of discarding them. + + - name: --merge_logic + alternatives: --l + type: string + choices: + description: | + When multiple regions overlap a single record, this option defines how to treat multiple annotation values. + See man page for more details. + + - name: --mark_sites + alternatives: --m + type: string + description: | + Annotate sites which are present ("+") or absent ("-") in the -a file with a new INFO/TAG flag. + + - name: --min_overlap + type: string + description: | + Minimum overlap required as a fraction of the variant in the annotation -a file (ANN), + in the target VCF file (:VCF), or both for reciprocal overlap (ANN:VCF). + By default overlaps of arbitrary length are sufficient. + The option can be used only with the tab-delimited annotation -a file and with BEG and END columns present. + + - name: --no_version + type: boolean_true + description: | + Do not append version and command line information to the output VCF header. + + - name: --output_type + alternatives: --O + type: string + choices: ['u', 'z', 'b', 'v'] + description: | + Output type: + u: uncompressed BCF + z: compressed VCF + b: compressed BCF + v: uncompressed VCF + + - name: --pair_logic + type: string + choices: ['snps', 'indels', 'both', 'all', 'some', 'exact'] + description: | + Controls how to match records from the annotation file to the target VCF. + Effective only when -a is a VCF or BCF file. + The option replaces the former uninuitive --collapse. + See Common Options for more. + + - name: --regions + alternatives: --r + type: string + description: | + Restrict to comma-separated list of regions. + Following formats are supported: chr|chr:pos|chr:beg-end|chr:beg-[,…​]. + example: '20:1000000-2000000' + + - name: --regions_file + alternatives: --R + type: file + description: | + Restrict to regions listed in a file. + Regions can be specified either on a VCF, BED, or tab-delimited file (the default). + For more information check manual. + + - name: --regions_overlap + type: string + choices: ['pos', 'record', 'variant', '0', '1', '2'] + description: | + This option controls how overlapping records are determined: + set to 'pos' or '0' if the VCF record has to have POS inside a region (this corresponds to the default behavior of -t/-T); + set to 'record' or '1' if also overlapping records with POS outside a region should be included (this is the default behavior of -r/-R, + and includes indels with POS at the end of a region, which are technically outside the region); + or set to 'variant' or '2' to include only true overlapping variation (compare the full VCF representation "TA>T-" vs the true sequence variation "A>-"). + + - name: --rename_annotations + type: file + description: | + Rename annotations: TYPE/old\tnew, where TYPE is one of FILTER,INFO,FORMAT. + + - name: --rename_chromosomes + type: file + description: | + Rename chromosomes according to the map in file, with "old_name new_name\n" pairs + separated by whitespaces, each on a separate line. + + - name: --samples + type: string + description: | + Subset of samples to annotate. + See also https://samtools.github.io/bcftools/bcftools.html#common_options. + + - name: --samples_file + type: file + description: | + Subset of samples to annotate in file format. + See also https://samtools.github.io/bcftools/bcftools.html#common_options. + + - name: --single_overlaps + type: boolean_true + description: | + Use this option to keep memory requirements low with very large annotation files. + Note, however, that this comes at a cost, only single overlapping intervals are considered in this mode. + This was the default mode until the commit af6f0c9 (Feb 24 2019). + + - name: --remove + alternatives: --x + type: string + description: | + List of annotations to remove. + Use "FILTER" to remove all filters or "FILTER/SomeFilter" to remove a specific filter. + Similarly, "INFO" can be used to remove all INFO tags and "FORMAT" to remove all FORMAT tags except GT. + To remove all INFO tags except "FOO" and "BAR", use "^INFO/FOO,INFO/BAR" (and similarly for FORMAT and FILTER). + "INFO" can be abbreviated to "INF" and "FORMAT" to "FMT". + +resources: + - type: bash_script + path: script.sh + +test_resources: + - type: bash_script + path: test.sh + +engines: + - type: docker + image: debian:stable-slim + setup: + - type: apt + packages: [bcftools, procps] + - type: docker + run: | + echo "bcftools: \"$(bcftools --version | grep 'bcftools' | sed -n 's/^bcftools //p')\"" > /var/software_versions.txt + test_setup: + - type: apt + packages: [tabix] + +runners: + - type: executable + - type: nextflow + diff --git a/src/bcftools/bcftools_annotate/help.txt b/src/bcftools/bcftools_annotate/help.txt new file mode 100644 index 00000000..2d1c7807 --- /dev/null +++ b/src/bcftools/bcftools_annotate/help.txt @@ -0,0 +1,41 @@ +``` +bcftools annotate -h +``` + +annotate: option requires an argument -- 'h' + +About: Annotate and edit VCF/BCF files. +Usage: bcftools annotate [options] VCF + +Options: + -a, --annotations FILE VCF file or tabix-indexed FILE with annotations: CHR\tPOS[\tVALUE]+ + -c, --columns LIST List of columns in the annotation file, e.g. CHROM,POS,REF,ALT,-,INFO/TAG. See man page for details + -C, --columns-file FILE Read -c columns from FILE, one name per row, with optional --merge-logic TYPE: NAME[ TYPE] + -e, --exclude EXPR Exclude sites for which the expression is true (see man page for details) + --force Continue despite parsing error (at your own risk!) + -H, --header-line STR Header line which should be appended to the VCF header, can be given multiple times + -h, --header-lines FILE Lines which should be appended to the VCF header + -I, --set-id [+]FORMAT Set ID column using a `bcftools query`-like expression, see man page for details + -i, --include EXPR Select sites for which the expression is true (see man page for details) + -k, --keep-sites Leave -i/-e sites unchanged instead of discarding them + -l, --merge-logic TAG:TYPE Merge logic for multiple overlapping regions (see man page for details), EXPERIMENTAL + -m, --mark-sites [+-]TAG Add INFO/TAG flag to sites which are ("+") or are not ("-") listed in the -a file + --min-overlap ANN:VCF Required overlap as a fraction of variant in the -a file (ANN), the VCF (:VCF), or reciprocal (ANN:VCF) + --no-version Do not append version and command line to the header + -o, --output FILE Write output to a file [standard output] + -O, --output-type u|b|v|z[0-9] u/b: un/compressed BCF, v/z: un/compressed VCF, 0-9: compression level [v] + --pair-logic STR Matching records by , see man page for details [some] + -r, --regions REGION Restrict to comma-separated list of regions + -R, --regions-file FILE Restrict to regions listed in FILE + --regions-overlap 0|1|2 Include if POS in the region (0), record overlaps (1), variant overlaps (2) [1] + --rename-annots FILE Rename annotations: TYPE/old\tnew, where TYPE is one of FILTER,INFO,FORMAT + --rename-chrs FILE Rename sequences according to the mapping: old\tnew + -s, --samples [^]LIST Comma separated list of samples to annotate (or exclude with "^" prefix) + -S, --samples-file [^]FILE File of samples to annotate (or exclude with "^" prefix) + --single-overlaps Keep memory low by avoiding complexities arising from handling multiple overlapping intervals + -x, --remove LIST List of annotations (e.g. ID,INFO/DP,FORMAT/DP,FILTER) to remove (or keep with "^" prefix). See man page for details + --threads INT Number of extra output compression threads [0] + +Examples: + http://samtools.github.io/bcftools/howtos/annotate.html + diff --git a/src/bcftools/bcftools_annotate/script.sh b/src/bcftools/bcftools_annotate/script.sh new file mode 100644 index 00000000..18137bbf --- /dev/null +++ b/src/bcftools/bcftools_annotate/script.sh @@ -0,0 +1,54 @@ +#!/bin/bash + +## VIASH START +## VIASH END + +# Exit on error +set -eo pipefail + +# Unset parameters +unset_if_false=( + par_force + par_keep_sites + par_no_version + par_single_overlaps +) + +for par in ${unset_if_false[@]}; do + test_val="${!par}" + [[ "$test_val" == "false" ]] && unset $par +done + +# Execute bcftools annotate with the provided arguments +bcftools annotate \ + ${par_annotations:+-a "$par_annotations"} \ + ${par_columns:+-c "$par_columns"} \ + ${par_columns_file:+-C "$par_columns_file"} \ + ${par_exclude:+-e "$par_exclude"} \ + ${par_force:+--force} \ + ${par_header_line:+-H "$par_header_line"} \ + ${par_header_lines:+-h "$par_header_lines"} \ + ${par_set_id:+-I "$par_set_id"} \ + ${par_include:+-i "$par_include"} \ + ${par_keep_sites:+-k} \ + ${par_merge_logic:+-l "$par_merge_logic"} \ + ${par_mark_sites:+-m "$par_mark_sites"} \ + ${par_min_overlap:+--min-overlap "$par_min_overlap"} \ + ${par_no_version:+--no-version} \ + ${par_samples_file:+-S "$par_samples_file"} \ + ${par_output_type:+-O "$par_output_type"} \ + ${par_pair_logic:+--pair-logic "$par_pair_logic"} \ + ${par_regions:+-r "$par_regions"} \ + ${par_regions_file:+-R "$par_regions_file"} \ + ${par_regions_overlap:+--regions-overlap "$par_regions_overlap"} \ + ${par_rename_annotations:+--rename-annots "$par_rename_annotations"} \ + ${par_rename_chromosomes:+--rename-chrs "$par_rename_chromosomes"} \ + ${par_samples:+-s "$par_samples"} \ + ${par_single_overlaps:+--single-overlaps} \ + ${par_threads:+--threads "$par_threads"} \ + ${par_remove:+-x "$par_remove"} \ + -o $par_output \ + $par_input + + + \ No newline at end of file diff --git a/src/bcftools/bcftools_annotate/test.sh b/src/bcftools/bcftools_annotate/test.sh new file mode 100644 index 00000000..39835c82 --- /dev/null +++ b/src/bcftools/bcftools_annotate/test.sh @@ -0,0 +1,305 @@ +#!/bin/bash + +## VIASH START +## VIASH END + +# Exit on error +set -eo pipefail + +#test_data="$meta_resources_dir/test_data" + +############################################# +# helper functions +assert_file_exists() { + [ -f "$1" ] || { echo "File '$1' does not exist" && exit 1; } +} +assert_file_not_empty() { + [ -s "$1" ] || { echo "File '$1' is empty but shouldn't be" && exit 1; } +} +assert_file_contains() { + grep -q "$2" "$1" || { echo "File '$1' does not contain '$2'" && exit 1; } +} +assert_identical_content() { + diff -a "$2" "$1" \ + || (echo "Files are not identical!" && exit 1) +} +############################################# + +# Create directories for tests +echo "Creating Test Data..." +TMPDIR=$(mktemp -d "$meta_temp_dir/XXXXXX") +function clean_up { + [[ -d "$TMPDIR" ]] && rm -r "$TMPDIR" +} +trap clean_up EXIT + +# Create test data +cat < "$TMPDIR/example.vcf" +##fileformat=VCFv4.1 +##contig= +#CHROM POS ID REF ALT QUAL FILTER INFO FORMAT SAMPLE1 +1 752567 llama A C . . . . . +1 752722 . G A . . . . . +EOF + +bgzip -c $TMPDIR/example.vcf > $TMPDIR/example.vcf.gz +tabix -p vcf $TMPDIR/example.vcf.gz + +cat < "$TMPDIR/annots.tsv" +1 752567 752567 FooValue1 12345 +1 752722 752722 FooValue2 67890 +EOF + +cat < "$TMPDIR/rename.tsv" +INFO/. Luigi +EOF + +bgzip $TMPDIR/annots.tsv +tabix -s1 -b2 -e3 $TMPDIR/annots.tsv.gz + +cat < "$TMPDIR/header.hdr" +##FORMAT= +##INFO= +EOF + +cat < "$TMPDIR/rename_chrm.tsv" +1 chr1 +2 chr2 +EOF + +# Test 1: Remove ID annotations +mkdir "$TMPDIR/test1" && pushd "$TMPDIR/test1" > /dev/null + +echo "> Run bcftools_annotate remove annotations" +"$meta_executable" \ + --input "../example.vcf" \ + --output "annotated.vcf" \ + --remove "ID" \ + +# checks +assert_file_exists "annotated.vcf" +assert_file_not_empty "annotated.vcf" +assert_file_contains "annotated.vcf" "1 752567 . A C" +echo "- test1 succeeded -" + +popd > /dev/null + +# Test 2: Annotate with -a, -c and -h options +mkdir "$TMPDIR/test2" && pushd "$TMPDIR/test2" > /dev/null + +echo "> Run bcftools_annotate with -a, -c and -h options" +"$meta_executable" \ + --input "../example.vcf" \ + --output "annotated.vcf" \ + --annotations "../annots.tsv.gz" \ + --header_lines "../header.hdr" \ + --columns "CHROM,FROM,TO,FMT/FOO,BAR" \ + --mark_sites "BAR" \ + +# checks +assert_file_exists "annotated.vcf" +assert_file_not_empty "annotated.vcf" +assert_file_contains "annotated.vcf" $(echo -e "1\t752567\tllama\tA\tC\t.\t.\tBAR=12345\tFOO\tFooValue1") +echo "- test2 succeeded -" + +popd > /dev/null + +# Test 3: +mkdir "$TMPDIR/test3" && pushd "$TMPDIR/test3" > /dev/null + +echo "> Run bcftools_annotate with --set_id option" +"$meta_executable" \ + --input "../example.vcf" \ + --output "annotated.vcf" \ + --set_id "+'%CHROM\_%POS\_%REF\_%FIRST_ALT'" \ + +# checks +assert_file_exists "annotated.vcf" +assert_file_not_empty "annotated.vcf" +assert_file_contains "annotated.vcf" "'1_752722_G_A'" +echo "- test3 succeeded -" + +popd > /dev/null + +# Test 4: +mkdir "$TMPDIR/test4" && pushd "$TMPDIR/test4" > /dev/null + +echo "> Run bcftools_annotate with --rename-annotations option" +"$meta_executable" \ + --input "../example.vcf" \ + --output "annotated.vcf" \ + --rename_annotations "../rename.tsv" + +# checks +assert_file_exists "annotated.vcf" +assert_file_not_empty "annotated.vcf" +assert_file_contains "annotated.vcf" "##bcftools_annotateCommand=annotate --rename-annots ../rename.tsv -o annotated.vcf" +echo "- test4 succeeded -" + +popd > /dev/null + +# Test 5: Rename chromosomes +mkdir "$TMPDIR/test5" && pushd "$TMPDIR/test5" > /dev/null + +echo "> Run bcftools_annotate with --rename-chromosomes option" +"$meta_executable" \ + --input "../example.vcf" \ + --output "annotated.vcf" \ + --rename_chromosomes "../rename_chrm.tsv" + +# checks +assert_file_exists "annotated.vcf" +assert_file_not_empty "annotated.vcf" +assert_file_contains "annotated.vcf" "chr1" +echo "- test5 succeeded -" + +popd > /dev/null + +# Test 6: Sample option +mkdir "$TMPDIR/test6" && pushd "$TMPDIR/test6" > /dev/null + +echo "> Run bcftools_annotate with -s option" +"$meta_executable" \ + --input "../example.vcf" \ + --output "annotated.vcf" \ + --samples "SAMPLE1" + +# checks +assert_file_exists "annotated.vcf" +assert_file_not_empty "annotated.vcf" +assert_file_contains "annotated.vcf" "##bcftools_annotateCommand=annotate -s SAMPLE1 -o annotated.vcf ../example.vcf" +echo "- test6 succeeded -" + +popd > /dev/null + +# Test 7: Single overlaps +mkdir "$TMPDIR/test7" && pushd "$TMPDIR/test7" > /dev/null + +echo "> Run bcftools_annotate with --single-overlaps option" +"$meta_executable" \ + --input "../example.vcf" \ + --output "annotated.vcf" \ + --single_overlaps \ + --keep_sites \ + +# checks +assert_file_exists "annotated.vcf" +assert_file_not_empty "annotated.vcf" +assert_file_contains "annotated.vcf" "annotate -k --single-overlaps -o annotated.vcf ../example.vcf" +echo "- test7 succeeded -" + +popd > /dev/null + +# Test 8: Min overlap +mkdir "$TMPDIR/test8" && pushd "$TMPDIR/test8" > /dev/null + +echo "> Run bcftools_annotate with --min-overlap option" +"$meta_executable" \ + --input "../example.vcf" \ + --output "annotated.vcf" \ + --annotations "../annots.tsv.gz" \ + --columns "CHROM,FROM,TO,FMT/FOO,BAR" \ + --header_lines "../header.hdr" \ + --min_overlap "1" + +# checks +assert_file_exists "annotated.vcf" +assert_file_not_empty "annotated.vcf" +assert_file_contains "annotated.vcf" "annotate -a ../annots.tsv.gz -c CHROM,FROM,TO,FMT/FOO,BAR -h ../header.hdr --min-overlap 1 -o annotated.vcf ../example.vcf" +echo "- test8 succeeded -" + +popd > /dev/null + +# Test 9: Regions +mkdir "$TMPDIR/test9" && pushd "$TMPDIR/test9" > /dev/null + +echo "> Run bcftools_annotate with -r option" +"$meta_executable" \ + --input "../example.vcf.gz" \ + --output "annotated.vcf" \ + --regions "1:752567-752722" + +# checks +assert_file_exists "annotated.vcf" +assert_file_not_empty "annotated.vcf" +assert_file_contains "annotated.vcf" "annotate -r 1:752567-752722 -o annotated.vcf ../example.vcf.gz" +echo "- test9 succeeded -" + +popd > /dev/null + +# Test 10: pair-logic +mkdir "$TMPDIR/test10" && pushd "$TMPDIR/test10" > /dev/null + +echo "> Run bcftools_annotate with --pair-logic option" +"$meta_executable" \ + --input "../example.vcf" \ + --output "annotated.vcf" \ + --pair_logic "all" + +# checks +assert_file_exists "annotated.vcf" +assert_file_not_empty "annotated.vcf" +assert_file_contains "annotated.vcf" "annotate --pair-logic all -o annotated.vcf ../example.vcf" +echo "- test10 succeeded -" + +popd > /dev/null + +# Test 11: regions-overlap +mkdir "$TMPDIR/test11" && pushd "$TMPDIR/test11" > /dev/null + +echo "> Run bcftools_annotate with --regions-overlap option" +"$meta_executable" \ + --input "../example.vcf" \ + --output "annotated.vcf" \ + --regions_overlap "1" + +# checks +assert_file_exists "annotated.vcf" +assert_file_not_empty "annotated.vcf" +assert_file_contains "annotated.vcf" "annotate --regions-overlap 1 -o annotated.vcf ../example.vcf" +echo "- test11 succeeded -" + +popd > /dev/null + +# Test 12: include +mkdir "$TMPDIR/test12" && pushd "$TMPDIR/test12" > /dev/null + +echo "> Run bcftools_annotate with -i option" +"$meta_executable" \ + --input "../example.vcf" \ + --output "annotated.vcf" \ + --include "FILTER='PASS'" \ + +# checks +assert_file_exists "annotated.vcf" +assert_file_not_empty "annotated.vcf" +assert_file_contains "annotated.vcf" "annotate -i FILTER='PASS' -o annotated.vcf ../example.vcf" +echo "- test12 succeeded -" + +popd > /dev/null + +# Test 13: exclude +mkdir "$TMPDIR/test13" && pushd "$TMPDIR/test13" > /dev/null + +echo "> Run bcftools_annotate with -e option" +"$meta_executable" \ + --annotations "../annots.tsv.gz" \ + --input "../example.vcf" \ + --output "annotated.vcf" \ + --exclude "FILTER='PASS'" \ + --header_lines "../header.hdr" \ + --columns "CHROM,FROM,TO,FMT/FOO,BAR" \ + --merge_logic "FOO:first" \ + +# checks +assert_file_exists "annotated.vcf" +assert_file_not_empty "annotated.vcf" +assert_file_contains "annotated.vcf" "annotate -a ../annots.tsv.gz -c CHROM,FROM,TO,FMT/FOO,BAR -e FILTER='PASS' -h ../header.hdr -l FOO:first -o annotated.vcf ../example.vcf" +echo "- test13 succeeded -" + +popd > /dev/null + + +echo "---- All tests succeeded! ----" +exit 0 + From dc7b33d51f274cb156b1f1b0fbdc6fed0b757720 Mon Sep 17 00:00:00 2001 From: Theodoro Gasperin Terra Camargo <98555209+tgaspe@users.noreply.github.com> Date: Tue, 10 Sep 2024 16:15:44 +0200 Subject: [PATCH 06/13] Bcftools Norm (#144) * Initial Commit * config and help.txt * script.sh * test template * More tests and debugging * test 5 and 6 * test 7, 8, 9 * Update test.sh * fixing bug on config * Changelog * Update config.vsh.yaml * Requested changes * Bug fixing --------- Co-authored-by: Jakub Majercik <57993790+jakubmajercik@users.noreply.github.com> --- CHANGELOG.md | 1 + src/bcftools/bcftools_norm/config.vsh.yaml | 194 +++++++++++++++++ src/bcftools/bcftools_norm/help.txt | 41 ++++ src/bcftools/bcftools_norm/script.sh | 49 +++++ src/bcftools/bcftools_norm/test.sh | 231 +++++++++++++++++++++ 5 files changed, 516 insertions(+) create mode 100644 src/bcftools/bcftools_norm/config.vsh.yaml create mode 100644 src/bcftools/bcftools_norm/help.txt create mode 100644 src/bcftools/bcftools_norm/script.sh create mode 100644 src/bcftools/bcftools_norm/test.sh diff --git a/CHANGELOG.md b/CHANGELOG.md index bb640d50..25850193 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -42,6 +42,7 @@ * `rsem/rsem_prepare_reference`: Prepare transcript references for RSEM (PR #89). * `bcftools`: + - `bcftools_norm`: Left-align and normalize indels, check if REF alleles match the reference, split multiallelic sites into multiple rows; recover multiallelics from multiple rows (PR #144). - `bcftools_annotate`: Add or remove annotations from a VCF/BCF file (PR #143). - `bcftools/bcftools_stats`: Parses VCF or BCF and produces a txt stats file which can be plotted using plot-vcfstats (PR #142). - `bcftools/bcftools_sort`: Sorts BCF/VCF files by position and other criteria (PR #141). diff --git a/src/bcftools/bcftools_norm/config.vsh.yaml b/src/bcftools/bcftools_norm/config.vsh.yaml new file mode 100644 index 00000000..5c525d3a --- /dev/null +++ b/src/bcftools/bcftools_norm/config.vsh.yaml @@ -0,0 +1,194 @@ +name: bcftools_norm +namespace: bcftools +description: | + Left-align and normalize indels, check if REF alleles match the reference, split multiallelic sites into multiple rows; + recover multiallelics from multiple rows. +keywords: [Normalize, VCF, BCF] +links: + homepage: https://samtools.github.io/bcftools/ + documentation: https://samtools.github.io/bcftools/bcftools.html#norm + repository: https://github.com/samtools/bcftools + issue_tracker: https://github.com/samtools/bcftools/issues +references: + doi: https://doi.org/10.1093/gigascience/giab008 +license: MIT/Expat, GNU +requirements: + commands: [bcftools] +authors: + - __merge__: /src/_authors/theodoro_gasperin.yaml + roles: [author] + +argument_groups: + - name: Inputs + arguments: + - name: --input + alternatives: -i + type: file + description: Input VCF/BCF file. + required: true + + - name: Outputs + arguments: + - name: --output + alternatives: -o + direction: output + type: file + description: Output normalized VCF/BCF file. + required: true + + - name: Options + arguments: + + - name: --atomize + alternatives: -a + type: boolean_true + description: | + Decompose complex variants (e.g., MNVs become consecutive SNVs). + + - name: --atom_overlaps + type: string + choices: [".", "*"] + description: | + Use the star allele (*) for overlapping alleles or set to missing (.). + + - name: --check_ref + alternatives: -c + type: string + choices: ['e', 'w', 'x', 's'] + description: | + Check REF alleles and exit (e), warn (w), exclude (x), or set (s) bad sites. + + - name: --remove_duplicates + alternatives: -d + type: string + choices: ['snps', 'indels', 'both', 'all', 'exact', 'none'] + description: Remove duplicate snps, indels, both, all, exact matches, or none (old -D option). + + - name: --fasta_ref + alternatives: -f + type: file + description: Reference fasta sequence file. + + - name: --force + type: boolean_true + description: | + Try to proceed even if malformed tags are encountered. + Experimental, use at your own risk. + + - name: --keep_sum + type: string + description: | + Keep vector sum constant when splitting multiallelics (see github issue #360). + + - name: --multiallelics + alternatives: -m + type: string + choices: ['+snps', '+indels', '+both', '+any', '-snps', '-indels', '-both', '-any'] + description: | + Split multiallelics (-) or join biallelics (+), type: snps, indels, both, any [default: both]. + + - name: --no_version + type: boolean_true + description: Do not append version and command line information to the header. + + - name: --do_not_normalize + alternatives: -N + type: boolean_true + description: Do not normalize indels (with -m or -c s). + + - name: --output_type + alternatives: --O + type: string + choices: ['u', 'z', 'b', 'v'] + description: | + Output type: + u: uncompressed BCF + z: compressed VCF + b: compressed BCF + v: uncompressed VCF + + - name: --old_rec_tag + type: string + description: Annotate modified records with INFO/STR indicating the original variant. + + - name: --regions + alternatives: --r + type: string + description: | + Restrict to comma-separated list of regions. + Following formats are supported: chr|chr:pos|chr:beg-end|chr:beg-[,…​]. + example: '20:1000000-2000000' + + - name: --regions_file + alternatives: --R + type: file + description: | + Restrict to regions listed in a file. + Regions can be specified either on a VCF, BED, or tab-delimited file (the default). + For more information check manual. + + - name: --regions_overlap + type: string + choices: ['pos', 'record', 'variant', '0', '1', '2'] + description: | + This option controls how overlapping records are determined: + set to 'pos' or '0' if the VCF record has to have POS inside a region (this corresponds to the default behavior of -t/-T); + set to 'record' or '1' if also overlapping records with POS outside a region should be included (this is the default behavior of -r/-R, + and includes indels with POS at the end of a region, which are technically outside the region); + or set to 'variant' or '2' to include only true overlapping variation (compare the full VCF representation "TA>T-" vs the true sequence variation "A>-"). + + - name: --site_win + alternatives: -w + type: integer + description: | + Buffer for sorting lines that changed position during realignment. + + - name: --strict_filter + alternatives: -s + type: boolean_true + description: When merging (-m+), merged site is PASS only if all sites being merged PASS. + + - name: --targets + alternatives: -t + type: string + description: Similar to --regions but streams rather than index-jumps. + example: '20:1000000-2000000' + + - name: --targets_file + alternatives: -T + type: file + description: Similar to --regions_file but streams rather than index-jumps. + + - name: --targets_overlap + type: string + choices: ['pos', 'record', 'variant', '0', '1', '2'] + description: | + Include if POS in the region (0), record overlaps (1), variant overlaps (2). + Similar to --regions_overlap. + +resources: + - type: bash_script + path: script.sh + +test_resources: + - type: bash_script + path: test.sh + +engines: + - type: docker + image: debian:stable-slim + setup: + - type: apt + packages: [bcftools, procps] + - type: docker + run: | + echo "bcftools: \"$(bcftools --version | grep 'bcftools' | sed -n 's/^bcftools //p')\"" > /var/software_versions.txt + test_setup: + - type: apt + packages: [tabix] + +runners: + - type: executable + - type: nextflow + + diff --git a/src/bcftools/bcftools_norm/help.txt b/src/bcftools/bcftools_norm/help.txt new file mode 100644 index 00000000..02e9761a --- /dev/null +++ b/src/bcftools/bcftools_norm/help.txt @@ -0,0 +1,41 @@ +``` +bcftools norm -h +``` + +About: Left-align and normalize indels; check if REF alleles match the reference; + split multiallelic sites into multiple rows; recover multiallelics from + multiple rows. +Usage: bcftools norm [options] + +Options: + -a, --atomize Decompose complex variants (e.g. MNVs become consecutive SNVs) + --atom-overlaps '*'|. Use the star allele (*) for overlapping alleles or set to missing (.) [*] + -c, --check-ref e|w|x|s Check REF alleles and exit (e), warn (w), exclude (x), or set (s) bad sites [e] + -D, --remove-duplicates Remove duplicate lines of the same type. + -d, --rm-dup TYPE Remove duplicate snps|indels|both|all|exact + -f, --fasta-ref FILE Reference sequence + --force Try to proceed even if malformed tags are encountered. Experimental, use at your own risk + --keep-sum TAG,.. Keep vector sum constant when splitting multiallelics (see github issue #360) + -m, --multiallelics -|+TYPE Split multiallelics (-) or join biallelics (+), type: snps|indels|both|any [both] + --no-version Do not append version and command line to the header + -N, --do-not-normalize Do not normalize indels (with -m or -c s) + --old-rec-tag STR Annotate modified records with INFO/STR indicating the original variant + -o, --output FILE Write output to a file [standard output] + -O, --output-type u|b|v|z[0-9] u/b: un/compressed BCF, v/z: un/compressed VCF, 0-9: compression level [v] + -r, --regions REGION Restrict to comma-separated list of regions + -R, --regions-file FILE Restrict to regions listed in a file + --regions-overlap 0|1|2 Include if POS in the region (0), record overlaps (1), variant overlaps (2) [1] + -s, --strict-filter When merging (-m+), merged site is PASS only if all sites being merged PASS + -t, --targets REGION Similar to -r but streams rather than index-jumps + -T, --targets-file FILE Similar to -R but streams rather than index-jumps + --targets-overlap 0|1|2 Include if POS in the region (0), record overlaps (1), variant overlaps (2) [0] + --threads INT Use multithreading with worker threads [0] + -w, --site-win INT Buffer for sorting lines which changed position during realignment [1000] + +Examples: + # normalize and left-align indels + bcftools norm -f ref.fa in.vcf + + # split multi-allelic sites + bcftools norm -m- in.vcf + diff --git a/src/bcftools/bcftools_norm/script.sh b/src/bcftools/bcftools_norm/script.sh new file mode 100644 index 00000000..0f43e593 --- /dev/null +++ b/src/bcftools/bcftools_norm/script.sh @@ -0,0 +1,49 @@ +#!/bin/bash + +## VIASH START +## VIASH END + +# Exit on error +set -eo pipefail + +# Unset parameters +unset_if_false=( + par_atomize + par_remove_duplicates + par_force + par_no_version + par_do_not_normalize + par_strict_filter +) + +for par in ${unset_if_false[@]}; do + test_val="${!par}" + [[ "$test_val" == "false" ]] && unset $par +done + +# Execute bcftools norm with the provided arguments +bcftools norm \ + ${par_atomize:+--atomize} \ + ${par_atom_overlaps:+--atom-overlaps "$par_atom_overlaps"} \ + ${par_check_ref:+-c "$par_check_ref"} \ + ${par_remove_duplicates:+-d "$par_remove_duplicates"} \ + ${par_fasta_ref:+-f "$par_fasta_ref"} \ + ${par_force:+--force} \ + ${par_keep_sum:+--keep-sum "$par_keep_sum"} \ + ${par_multiallelics:+-m "$par_multiallelics"} \ + ${par_no_version:+--no-version} \ + ${par_do_not_normalize:+-N} \ + ${par_old_rec_tag:+--old-rec-tag "$par_old_rec_tag"} \ + ${par_regions:+-r "$par_regions"} \ + ${par_regions_file:+-R "$par_regions_file"} \ + ${par_regions_overlap:+--regions-overlap "$par_regions_overlap"} \ + ${par_site_win:+-w "$par_site_win"} \ + ${par_strict_filter:+-s} \ + ${par_targets:+-t "$par_targets"} \ + ${par_targets_file:+-T "$par_targets_file"} \ + ${par_targets_overlap:+--targets-overlap "$par_targets_overlap"} \ + ${meta_cpus:+--threads "$meta_cpus"} \ + ${par_output_type:+-O "$par_output_type"} \ + -o $par_output \ + $par_input + diff --git a/src/bcftools/bcftools_norm/test.sh b/src/bcftools/bcftools_norm/test.sh new file mode 100644 index 00000000..254c7176 --- /dev/null +++ b/src/bcftools/bcftools_norm/test.sh @@ -0,0 +1,231 @@ +#!/bin/bash + +## VIASH START +## VIASH END + +# Exit on error +set -eo pipefail + +#test_data="$meta_resources_dir/test_data" + +############################################# +# helper functions +assert_file_exists() { + [ -f "$1" ] || { echo "File '$1' does not exist" && exit 1; } +} +assert_file_not_empty() { + [ -s "$1" ] || { echo "File '$1' is empty but shouldn't be" && exit 1; } +} +assert_file_contains() { + grep -q "$2" "$1" || { echo "File '$1' does not contain '$2'" && exit 1; } +} +assert_identical_content() { + diff -a "$2" "$1" \ + || (echo "Files are not identical!" && exit 1) +} +############################################# + +# Create directories for tests +echo "Creating Test Data..." +TMPDIR=$(mktemp -d "$meta_temp_dir/XXXXXX") +function clean_up { + [[ -d "$TMPDIR" ]] && rm -r "$TMPDIR" +} +trap clean_up EXIT + +# Create test data +cat < "$TMPDIR/example.vcf" +##fileformat=VCFv4.1 +##contig= +#CHROM POS ID REF ALT QUAL FILTER INFO FORMAT SAMPLE1 +1 752567 llama G C,A . . . . 1/2 +1 752722 . G A,AAA . . . . ./. +EOF + +bgzip -c $TMPDIR/example.vcf > $TMPDIR/example.vcf.gz +tabix -p vcf $TMPDIR/example.vcf.gz + +cat < "$TMPDIR/reference.fa" +>1 +ATCGATCGATCGATCGATCGATCGATCGATCGATCGATCGATCGATCGATCGATCGATCG +>2 +CGATCGATCGATCGATCGATCGATCGATCGATCGATCGATCGATCGATCGATCGATCGAT +EOF + +# Test 1: Remove ID annotations +mkdir "$TMPDIR/test1" && pushd "$TMPDIR/test1" > /dev/null + +echo "> Run bcftools_norm" +"$meta_executable" \ + --input "../example.vcf" \ + --output "normalized.vcf" \ + --atomize \ + --atom_overlaps "." \ + &> /dev/null + +# checks +assert_file_exists "normalized.vcf" +assert_file_not_empty "normalized.vcf" +assert_file_contains "normalized.vcf" "bcftools_normCommand=norm --atomize --atom-overlaps . -o normalized.vcf ../example.vcf" +echo "- test1 succeeded -" + +popd > /dev/null + +# Test 2: Check reference +mkdir "$TMPDIR/test2" && pushd "$TMPDIR/test2" > /dev/null + +echo "> Run bcftools_norm with remove duplicates" +"$meta_executable" \ + --input "../example.vcf" \ + --output "normalized.vcf" \ + --atomize \ + --remove_duplicates 'all' \ + &> /dev/null + +# checks +assert_file_exists "normalized.vcf" +assert_file_not_empty "normalized.vcf" +assert_file_contains "normalized.vcf" "norm --atomize -d all -o normalized.vcf ../example.vcf" +echo "- test2 succeeded -" + +popd > /dev/null + +# Test 3: Check reference and fasta reference +mkdir "$TMPDIR/test3" && pushd "$TMPDIR/test3" > /dev/null + +echo "> Run bcftools_norm with check reference and fasta reference" +"$meta_executable" \ + --input "../example.vcf" \ + --output "normalized.vcf" \ + --atomize \ + --fasta_ref "../reference.fa" \ + --check_ref "e" \ + &> /dev/null + +# checks +assert_file_exists "normalized.vcf" +assert_file_not_empty "normalized.vcf" +assert_file_contains "normalized.vcf" "norm --atomize -c e -f ../reference.fa -o normalized.vcf ../example.vcf" +echo "- test3 succeeded -" + +popd > /dev/null + +# Test 4: Multiallelics +mkdir "$TMPDIR/test4" && pushd "$TMPDIR/test4" > /dev/null + +echo "> Run bcftools_norm with multiallelics" +"$meta_executable" \ + --input "../example.vcf" \ + --output "normalized.vcf" \ + --multiallelics "-any" \ + --old_rec_tag "wazzaaa" \ + &> /dev/null + +# checks +assert_file_exists "normalized.vcf" +assert_file_not_empty "normalized.vcf" +assert_file_contains "normalized.vcf" "norm -m -any --old-rec-tag wazzaaa -o normalized.vcf ../example.vcf" +echo "- test4 succeeded -" + +popd > /dev/null + +# Test 5: Regions +mkdir "$TMPDIR/test5" && pushd "$TMPDIR/test5" > /dev/null + +echo "> Run bcftools_norm with regions" +"$meta_executable" \ + --input "../example.vcf.gz" \ + --output "normalized.vcf" \ + --atomize \ + --regions "1:752567-752722" \ + &> /dev/null + +# checks +assert_file_exists "normalized.vcf" +assert_file_not_empty "normalized.vcf" +assert_file_contains "normalized.vcf" "norm --atomize -r 1:752567-752722 -o normalized.vcf ../example.vcf.gz" +echo "- test5 succeeded -" + +popd > /dev/null + +# Test 6: Targets +mkdir "$TMPDIR/test6" && pushd "$TMPDIR/test6" > /dev/null + +echo "> Run bcftools_norm with targets" +"$meta_executable" \ + --input "../example.vcf" \ + --output "normalized.vcf" \ + --atomize \ + --targets "1:752567-752722" \ + &> /dev/null + +# checks +assert_file_exists "normalized.vcf" +assert_file_not_empty "normalized.vcf" +assert_file_contains "normalized.vcf" "norm --atomize -t 1:752567-752722 -o normalized.vcf ../example.vcf" +echo "- test6 succeeded -" + +popd > /dev/null + +# Test 7: Regions overlap +mkdir "$TMPDIR/test7" && pushd "$TMPDIR/test7" > /dev/null + +echo "> Run bcftools_norm with regions overlap" +"$meta_executable" \ + --input "../example.vcf" \ + --output "normalized.vcf" \ + --atomize \ + --regions_overlap "pos" \ + &> /dev/null + +# checks +assert_file_exists "normalized.vcf" +assert_file_not_empty "normalized.vcf" +assert_file_contains "normalized.vcf" "norm --atomize --regions-overlap pos -o normalized.vcf ../example.vcf" +echo "- test7 succeeded -" + +popd > /dev/null + +# Test 8: Strict filter and targets overlap +mkdir "$TMPDIR/test8" && pushd "$TMPDIR/test8" > /dev/null + +echo "> Run bcftools_norm with strict filter and targets overlap" +"$meta_executable" \ + --input "../example.vcf" \ + --output "normalized.vcf" \ + --atomize \ + --strict_filter \ + --targets_overlap "1" \ + &> /dev/null + +# checks +assert_file_exists "normalized.vcf" +assert_file_not_empty "normalized.vcf" +assert_file_contains "normalized.vcf" "norm --atomize -s --targets-overlap 1 -o normalized.vcf ../example.vcf" +echo "- test8 succeeded -" + +popd > /dev/null + +# Test 9: Do not normalize +mkdir "$TMPDIR/test9" && pushd "$TMPDIR/test9" > /dev/null + +echo "> Run bcftools_norm with do not normalize" +"$meta_executable" \ + --input "../example.vcf" \ + --output "normalized.vcf" \ + --do_not_normalize \ + --atomize \ + &> /dev/null + +# checks +assert_file_exists "normalized.vcf" +assert_file_not_empty "normalized.vcf" +assert_file_contains "normalized.vcf" "norm --atomize -N -o normalized.vcf ../example.vcf" +echo "- test9 succeeded -" + +popd > /dev/null + +echo "---- All tests succeeded! ----" +exit 0 + + From bd8ca889d13784c5a7502bb977c6659fe420d973 Mon Sep 17 00:00:00 2001 From: Theodoro Gasperin Terra Camargo <98555209+tgaspe@users.noreply.github.com> Date: Tue, 10 Sep 2024 16:17:22 +0200 Subject: [PATCH 07/13] Bcftools Concat (#145) * Initial Commint * Create help.txt * Update config.vsh.yaml * Update config.vsh.yaml * Update config.vsh.yaml * Update script.sh * add template for tests * Update test.sh * small changes in config file * adding more tests * adding more test * Update CHANGELOG.md --------- Co-authored-by: Jakub Majercik <57993790+jakubmajercik@users.noreply.github.com> --- CHANGELOG.md | 5 +- src/bcftools/bcftools_concat/config.vsh.yaml | 172 ++++++++++++++ src/bcftools/bcftools_concat/help.txt | 36 +++ src/bcftools/bcftools_concat/script.sh | 54 +++++ src/bcftools/bcftools_concat/test.sh | 227 +++++++++++++++++++ 5 files changed, 492 insertions(+), 2 deletions(-) create mode 100644 src/bcftools/bcftools_concat/config.vsh.yaml create mode 100644 src/bcftools/bcftools_concat/help.txt create mode 100644 src/bcftools/bcftools_concat/script.sh create mode 100644 src/bcftools/bcftools_concat/test.sh diff --git a/CHANGELOG.md b/CHANGELOG.md index 25850193..034e2422 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -42,8 +42,9 @@ * `rsem/rsem_prepare_reference`: Prepare transcript references for RSEM (PR #89). * `bcftools`: - - `bcftools_norm`: Left-align and normalize indels, check if REF alleles match the reference, split multiallelic sites into multiple rows; recover multiallelics from multiple rows (PR #144). - - `bcftools_annotate`: Add or remove annotations from a VCF/BCF file (PR #143). + - `bcftools/bcftools_concat`: Concatenate or combine VCF/BCF files (PR #145). + - `bcftools/bcftools_norm`: Left-align and normalize indels, check if REF alleles match the reference, split multiallelic sites into multiple rows; recover multiallelics from multiple rows (PR #144). + - `bcftools/bcftools_annotate`: Add or remove annotations from a VCF/BCF file (PR #143). - `bcftools/bcftools_stats`: Parses VCF or BCF and produces a txt stats file which can be plotted using plot-vcfstats (PR #142). - `bcftools/bcftools_sort`: Sorts BCF/VCF files by position and other criteria (PR #141). diff --git a/src/bcftools/bcftools_concat/config.vsh.yaml b/src/bcftools/bcftools_concat/config.vsh.yaml new file mode 100644 index 00000000..2bb32f1c --- /dev/null +++ b/src/bcftools/bcftools_concat/config.vsh.yaml @@ -0,0 +1,172 @@ +name: bcftools_concat +namespace: bcftools +description: | + Concatenate or combine VCF/BCF files. All source files must have the same sample + columns appearing in the same order. The program can be used, for example, to + concatenate chromosome VCFs into one VCF, or combine a SNP VCF and an indel + VCF into one. The input files must be sorted by chr and position. The files + must be given in the correct order to produce sorted VCF on output unless + the -a, --allow-overlaps option is specified. With the --naive option, the files + are concatenated without being recompressed, which is very fast. +keywords: [Concatenate, VCF, BCF] +links: + homepage: https://samtools.github.io/bcftools/ + documentation: https://samtools.github.io/bcftools/bcftools.html#concat + repository: https://github.com/samtools/bcftools + issue_tracker: https://github.com/samtools/bcftools/issues +references: + doi: https://doi.org/10.1093/gigascience/giab008 +license: MIT/Expat, GNU +requirements: + commands: [bcftools] +authors: + - __merge__: /src/_authors/theodoro_gasperin.yaml + roles: [author] + +argument_groups: + - name: Inputs + arguments: + - name: --input + alternatives: -i + type: file + multiple: true + description: Input VCF/BCF files to concatenate. + + - name: --file_list + alternatives: -f + type: file + description: Read the list of VCF/BCF files from a file, one file name per line. + + - name: Outputs + arguments: + - name: --output + alternatives: -o + direction: output + type: file + description: Output concatenated VCF/BCF file. + required: true + + - name: Options + arguments: + + - name: --allow_overlaps + alternatives: -a + type: boolean_true + description: | + First coordinate of the next file can precede last record of the current file. + + - name: --compact_PS + alternatives: -c + type: boolean_true + description: | + Do not output PS tag at each site, only at the start of a new phase set block. + + - name: --remove_duplicates + alternatives: -d + type: string + choices: ['snps', 'indels', 'both', 'all', 'exact', 'none'] + description: | + Output duplicate records present in multiple files only once: . + + - name: --ligate + alternatives: -l + type: boolean_true + description: Ligate phased VCFs by matching phase at overlapping haplotypes. + + - name: --ligate_force + type: boolean_true + description: Ligate even non-overlapping chunks, keep all sites. + + - name: --ligate_warn + type: boolean_true + description: Drop sites in imperfect overlaps. + + - name: --no_version + type: boolean_true + description: Do not append version and command line information to the header. + + - name: --naive + alternatives: -n + type: boolean_true + description: Concatenate files without recompression, a header check compatibility is performed. + + - name: --naive_force + type: boolean_true + description: | + Same as --naive, but header compatibility is not checked. + Dangerous, use with caution. + + - name: --output_type + alternatives: -O + type: string + choices: ['u', 'z', 'b', 'v'] + description: | + Output type: + u: uncompressed BCF + z: compressed VCF + b: compressed BCF + v: uncompressed VCF + + - name: --min_PQ + alternatives: -q + type: integer + description: Break phase set if phasing quality is lower than . + example: 30 + + - name: --regions + alternatives: -r + type: string + description: | + Restrict to comma-separated list of regions. + Following formats are supported: chr|chr:pos|chr:beg-end|chr:beg-[,…​]. + example: '20:1000000-2000000' + + - name: --regions_file + alternatives: -R + type: file + description: | + Restrict to regions listed in a file. + Regions can be specified either on a VCF, BED, or tab-delimited file (the default). + For more information check manual. + + - name: --regions_overlap + type: string + choices: ['pos', 'record', 'variant', '0', '1', '2'] + description: | + This option controls how overlapping records are determined: + set to 'pos' or '0' if the VCF record has to have POS inside a region (this corresponds to the default behavior of -t/-T); + set to 'record' or '1' if also overlapping records with POS outside a region should be included (this is the default behavior of -r/-R, + and includes indels with POS at the end of a region, which are technically outside the region); + or set to 'variant' or '2' to include only true overlapping variation (compare the full VCF representation "TA>T-" vs the true sequence variation "A>-"). + + #PS: Verbose seems to be broken in this version of bcftools + # - name: --verbose + # alternatives: -v + # type: integer + # choices: [0, 1] + # description: Set verbosity level. + +resources: + - type: bash_script + path: script.sh + +test_resources: + - type: bash_script + path: test.sh + +engines: + - type: docker + image: debian:stable-slim + setup: + - type: apt + packages: [bcftools, procps] + - type: docker + run: | + echo "bcftools: \"$(bcftools --version | grep 'bcftools' | sed -n 's/^bcftools //p')\"" > /var/software_versions.txt + test_setup: + - type: apt + packages: [tabix] + +runners: + - type: executable + - type: nextflow \ No newline at end of file diff --git a/src/bcftools/bcftools_concat/help.txt b/src/bcftools/bcftools_concat/help.txt new file mode 100644 index 00000000..fc0f1914 --- /dev/null +++ b/src/bcftools/bcftools_concat/help.txt @@ -0,0 +1,36 @@ +``` +bcftools concat -h +``` + +concat: option requires an argument -- 'h' + +About: Concatenate or combine VCF/BCF files. All source files must have the same sample + columns appearing in the same order. The program can be used, for example, to + concatenate chromosome VCFs into one VCF, or combine a SNP VCF and an indel + VCF into one. The input files must be sorted by chr and position. The files + must be given in the correct order to produce sorted VCF on output unless + the -a, --allow-overlaps option is specified. With the --naive option, the files + are concatenated without being recompressed, which is very fast. +Usage: bcftools concat [options] [ [...]] + +Options: + -a, --allow-overlaps First coordinate of the next file can precede last record of the current file. + -c, --compact-PS Do not output PS tag at each site, only at the start of a new phase set block. + -d, --rm-dups STRING Output duplicate records present in multiple files only once: + -D, --remove-duplicates Alias for -d exact + -f, --file-list FILE Read the list of files from a file. + -l, --ligate Ligate phased VCFs by matching phase at overlapping haplotypes + --ligate-force Ligate even non-overlapping chunks, keep all sites + --ligate-warn Drop sites in imperfect overlaps + --no-version Do not append version and command line to the header + -n, --naive Concatenate files without recompression, a header check compatibility is performed + --naive-force Same as --naive, but header compatibility is not checked. Dangerous, use with caution. + -o, --output FILE Write output to a file [standard output] + -O, --output-type u|b|v|z[0-9] u/b: un/compressed BCF, v/z: un/compressed VCF, 0-9: compression level [v] + -q, --min-PQ INT Break phase set if phasing quality is lower than [30] + -r, --regions REGION Restrict to comma-separated list of regions + -R, --regions-file FILE Restrict to regions listed in a file + --regions-overlap 0|1|2 Include if POS in the region (0), record overlaps (1), variant overlaps (2) [1] + --threads INT Use multithreading with worker threads [0] + -v, --verbose 0|1 Set verbosity level [1] + diff --git a/src/bcftools/bcftools_concat/script.sh b/src/bcftools/bcftools_concat/script.sh new file mode 100644 index 00000000..5614cd1b --- /dev/null +++ b/src/bcftools/bcftools_concat/script.sh @@ -0,0 +1,54 @@ +#!/bin/bash + +## VIASH START +## VIASH END + +# Exit on error +set -eo pipefail + +# Unset parameters +unset_if_false=( + par_allow_overlaps + par_compact_PS + par_ligate + par_ligate_force + par_ligate_warn + par_no_version + par_naive + par_naive_force +) + +for par in ${unset_if_false[@]}; do + test_val="${!par}" + [[ "$test_val" == "false" ]] && unset $par +done + +# Check to see whether the par_input or the par_file_list is set +if [[ -z "${par_input}" && -z "${par_file_list}" ]]; then + echo "Error: One of the parameters '--input' or '--file_list' must be used." + exit 1 +fi + +# Create input array +IFS=";" read -ra input <<< $par_input + +# Execute bcftools concat with the provided arguments +bcftools concat \ + ${par_allow_overlaps:+-a} \ + ${par_compact_PS:+-c} \ + ${par_remove_duplicates:+-d "$par_remove_duplicates"} \ + ${par_ligate:+-l} \ + ${par_ligate_force:+--ligate-force} \ + ${par_ligate_warn:+--ligate-warn} \ + ${par_no_version:+--no-version} \ + ${par_naive:+-n} \ + ${par_naive_force:+--naive-force} \ + ${par_output_type:+--O "$par_output_type"} \ + ${par_min_PQ:+-q "$par_min_PQ"} \ + ${par_regions:+-r "$par_regions"} \ + ${par_regions_file:+-R "$par_regions_file"} \ + ${par_regions_overlap:+--regions-overlap "$par_regions_overlap"} \ + ${meta_cpus:+--threads "$meta_cpus"} \ + -o $par_output \ + ${par_file_list:+-f "$par_file_list"} \ + ${input[@]} \ \ No newline at end of file diff --git a/src/bcftools/bcftools_concat/test.sh b/src/bcftools/bcftools_concat/test.sh new file mode 100644 index 00000000..3c1c7bb6 --- /dev/null +++ b/src/bcftools/bcftools_concat/test.sh @@ -0,0 +1,227 @@ +#!/bin/bash + +## VIASH START +## VIASH END + +# Exit on error +set -eo pipefail + +#test_data="$meta_resources_dir/test_data" + +############################################# +# helper functions +assert_file_exists() { + [ -f "$1" ] || { echo "File '$1' does not exist" && exit 1; } +} +assert_file_not_empty() { + [ -s "$1" ] || { echo "File '$1' is empty but shouldn't be" && exit 1; } +} +assert_file_contains() { + grep -q "$2" "$1" || { echo "File '$1' does not contain '$2'" && exit 1; } +} +assert_identical_content() { + diff -a "$2" "$1" \ + || (echo "Files are not identical!" && exit 1) +} +############################################# + +# Create directories for tests +echo "Creating Test Data..." +TMPDIR=$(mktemp -d "$meta_temp_dir/XXXXXX") +function clean_up { + [[ -d "$TMPDIR" ]] && rm -r "$TMPDIR" +} +trap clean_up EXIT + +# Create test data +cat < "$TMPDIR/example.vcf" +##fileformat=VCFv4.1 +##contig= +#CHROM POS ID REF ALT QUAL FILTER INFO FORMAT SAMPLE1 +1 752567 llama G C,A 15 . . . 1/2 +1 752752 . G A,AAA 20 . . . ./. +EOF + +bgzip -c $TMPDIR/example.vcf > $TMPDIR/example.vcf.gz +tabix -p vcf $TMPDIR/example.vcf.gz + +cat < "$TMPDIR/example_2.vcf" +##fileformat=VCFv4.1 +##contig= +#CHROM POS ID REF ALT QUAL FILTER INFO FORMAT SAMPLE1 +1 752569 cat G C,A 15 . . . 1/2 +1 752739 . G A,AAA 20 . . . ./. +EOF + +bgzip -c $TMPDIR/example_2.vcf > $TMPDIR/example_2.vcf.gz +tabix -p vcf $TMPDIR/example_2.vcf.gz + +cat < "$TMPDIR/file_list.txt" +$TMPDIR/example.vcf.gz +$TMPDIR/example_2.vcf.gz +EOF + +# Test 1: Default test +mkdir "$TMPDIR/test1" && pushd "$TMPDIR/test1" > /dev/null + +echo "> Run bcftools_concat default test" +"$meta_executable" \ + --input "../example.vcf" \ + --input "../example_2.vcf" \ + --output "concatenated.vcf" \ + &> /dev/null + +# checks +assert_file_exists "concatenated.vcf" +assert_file_not_empty "concatenated.vcf" +assert_file_contains "concatenated.vcf" "concat -o concatenated.vcf ../example.vcf ../example_2.vcf" +echo "- test1 succeeded -" + +popd > /dev/null + +# Test 2: Allow overlaps, compact PS and remove duplicates +mkdir "$TMPDIR/test2" && pushd "$TMPDIR/test2" > /dev/null + +echo "> Run bcftools_concat test with allow overlaps, and remove duplicates" +"$meta_executable" \ + --input "../example.vcf.gz" \ + --input "../example_2.vcf.gz" \ + --output "concatenated.vcf" \ + --allow_overlaps \ + --remove_duplicates 'none' \ + &> /dev/null + +# checks +assert_file_exists "concatenated.vcf" +assert_file_not_empty "concatenated.vcf" +assert_file_contains "concatenated.vcf" "concat -a -d none -o concatenated.vcf ../example.vcf.gz ../example_2.vcf.gz" +echo "- test2 succeeded -" + +popd > /dev/null + + +# Test 3: Ligate, ligate force and ligate warn +mkdir "$TMPDIR/test3" && pushd "$TMPDIR/test3" > /dev/null + +echo "> Run bcftools_concat test with ligate, ligate force and ligate warn" +"$meta_executable" \ + --input "../example.vcf.gz" \ + --input "../example_2.vcf.gz" \ + --output "concatenated.vcf" \ + --ligate \ + --compact_PS \ + &> /dev/null + + +# checks +assert_file_exists "concatenated.vcf" +assert_file_not_empty "concatenated.vcf" +assert_file_contains "concatenated.vcf" "concat -c -l -o concatenated.vcf ../example.vcf.gz ../example_2.vcf.gz" +echo "- test3 succeeded -" + +popd > /dev/null + +# Test 4: file list with ligate force and ligate warn +mkdir "$TMPDIR/test4" && pushd "$TMPDIR/test4" > /dev/null + +echo "> Run bcftools_concat test with file list, ligate force and ligate warn" +"$meta_executable" \ + --file_list "../file_list.txt" \ + --output "concatenated.vcf" \ + --ligate_force \ + &> /dev/null + +# checks +assert_file_exists "concatenated.vcf" +assert_file_not_empty "concatenated.vcf" +assert_file_contains "concatenated.vcf" "concat --ligate-force -o concatenated.vcf -f ../file_list.txt" +echo "- test4 succeeded -" + +popd > /dev/null + +# Test 5: ligate warn and naive +mkdir "$TMPDIR/test5" && pushd "$TMPDIR/test5" > /dev/null + +echo "> Run bcftools_concat test with ligate warn and naive" +"$meta_executable" \ + --input "../example.vcf.gz" \ + --input "../example_2.vcf.gz" \ + --output "concatenated.vcf.gz" \ + --ligate_warn \ + --naive \ + &> /dev/null + +bgzip -d concatenated.vcf.gz + +# checks +assert_file_exists "concatenated.vcf" +assert_file_not_empty "concatenated.vcf" +assert_file_contains "concatenated.vcf" "##fileformat=VCFv4.1" +echo "- test5 succeeded -" + +popd > /dev/null + +# Test 6: minimal PQ +mkdir "$TMPDIR/test6" && pushd "$TMPDIR/test6" > /dev/null + +echo "> Run bcftools_concat test with minimal PQ" +"$meta_executable" \ + --input "../example.vcf.gz" \ + --input "../example_2.vcf.gz" \ + --output "concatenated.vcf" \ + --min_PQ 20 \ + &> /dev/null + +# checks +assert_file_exists "concatenated.vcf" +assert_file_not_empty "concatenated.vcf" +assert_file_contains "concatenated.vcf" "concat -q 20 -o concatenated.vcf ../example.vcf.gz ../example_2.vcf.gz" +echo "- test6 succeeded -" + +popd > /dev/null + +# Test 7: regions +mkdir "$TMPDIR/test7" && pushd "$TMPDIR/test7" > /dev/null + +echo "> Run bcftools_concat test with regions" +"$meta_executable" \ + --input "../example.vcf.gz" \ + --input "../example_2.vcf.gz" \ + --output "concatenated.vcf" \ + --allow_overlaps \ + --regions "1:752569-752739" \ + &> /dev/null + +# checks +assert_file_exists "concatenated.vcf" +assert_file_not_empty "concatenated.vcf" +assert_file_contains "concatenated.vcf" "concat -a -r 1:752569-752739 -o concatenated.vcf ../example.vcf.gz ../example_2.vcf.gz" +echo "- test7 succeeded -" + +popd > /dev/null + +# Test 8: regions overlap +mkdir "$TMPDIR/test8" && pushd "$TMPDIR/test8" > /dev/null + +echo "> Run bcftools_concat test with regions overlap" +"$meta_executable" \ + --input "../example.vcf.gz" \ + --input "../example_2.vcf.gz" \ + --output "concatenated.vcf" \ + --allow_overlaps \ + --regions_overlap 'pos' \ + &> /dev/null + +# checks +assert_file_exists "concatenated.vcf" +assert_file_not_empty "concatenated.vcf" +assert_file_contains "concatenated.vcf" "concat -a --regions-overlap pos -o concatenated.vcf ../example.vcf.gz ../example_2.vcf.gz" +echo "- test8 succeeded -" + +popd > /dev/null + +echo "---- All tests succeeded! ----" +exit 0 + + + From 3f6a1b52f8aedb15ec3bd6e243de3267a94e4e2e Mon Sep 17 00:00:00 2001 From: Emma Rousseau Date: Fri, 13 Sep 2024 09:08:23 +0200 Subject: [PATCH 08/13] Umitools prepare for rsem (#148) --- CHANGELOG.md | 3 +- .../umi_tools_prepareforrsem/config.vsh.yaml | 107 +++++++ .../umi_tools_prepareforrsem/help.txt | 54 ++++ .../prepare-for-rsem.py | 271 ++++++++++++++++++ .../umi_tools_prepareforrsem/script.sh | 32 +++ .../umi_tools_prepareforrsem/test.sh | 55 ++++ .../test_data/log.log | 103 +++++++ .../test_data/test.bam | Bin 0 -> 11123 bytes .../test_data/test.sam | 119 ++++++++ .../test_data/test_dedup.bam | Bin 0 -> 18822 bytes .../test_data/test_dedup.sam | 201 +++++++++++++ 11 files changed, 944 insertions(+), 1 deletion(-) create mode 100644 src/umi_tools/umi_tools_prepareforrsem/config.vsh.yaml create mode 100644 src/umi_tools/umi_tools_prepareforrsem/help.txt create mode 100644 src/umi_tools/umi_tools_prepareforrsem/prepare-for-rsem.py create mode 100755 src/umi_tools/umi_tools_prepareforrsem/script.sh create mode 100644 src/umi_tools/umi_tools_prepareforrsem/test.sh create mode 100644 src/umi_tools/umi_tools_prepareforrsem/test_data/log.log create mode 100644 src/umi_tools/umi_tools_prepareforrsem/test_data/test.bam create mode 100644 src/umi_tools/umi_tools_prepareforrsem/test_data/test.sam create mode 100644 src/umi_tools/umi_tools_prepareforrsem/test_data/test_dedup.bam create mode 100644 src/umi_tools/umi_tools_prepareforrsem/test_data/test_dedup.sam diff --git a/CHANGELOG.md b/CHANGELOG.md index 034e2422..d88d0996 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -137,7 +137,8 @@ - `samtools/samtools_fastq`: Converts a SAM/BAM/CRAM file to FASTA (PR #53). * `umi_tools`: - -`umi_tools/umi_tools_extract`: Flexible removal of UMI sequences from fastq reads (PR #71). + - `umi_tools/umi_tools_extract`: Flexible removal of UMI sequences from fastq reads (PR #71). + - `umi_tools/umi_tools_prepareforrsem`: Fix paired-end reads in name sorted BAM file to prepare for RSEM (PR #148). * `falco`: A C++ drop-in replacement of FastQC to assess the quality of sequence read data (PR #43). diff --git a/src/umi_tools/umi_tools_prepareforrsem/config.vsh.yaml b/src/umi_tools/umi_tools_prepareforrsem/config.vsh.yaml new file mode 100644 index 00000000..ceac2052 --- /dev/null +++ b/src/umi_tools/umi_tools_prepareforrsem/config.vsh.yaml @@ -0,0 +1,107 @@ +name: "umi_tools_prepareforrsem" +namespace: "umi_tools" +description: Make the output from umi-tools dedup or group compatible with RSEM +keywords: [umi_tools, rsem, bam, sam] +links: + homepage: https://umi-tools.readthedocs.io/en/latest/ + documentation: https://umi-tools.readthedocs.io/en/latest/reference/extract.html + repository: https://github.com/CGATOxford/UMI-tools +references: + doi: 10.1101/gr.209601.116 +license: MIT + +argument_groups: +- name: "Input" + arguments: + - name: "--input" + alternatives: ["-I", "--stdin"] + type: file + required: true + example: $id.transcriptome.bam + +- name: "Output" + arguments: + - name: "--output" + alternatives: ["-S", "--stdout"] + type: file + direction: output + example: $id.transcriptome_sorted.bam + - name: "--log" + alternatives: ["-L"] + type: file + direction: output + description: File with logging information [default = stdout]. + - name: "--error" + alternatives: ["-E"] + type: file + direction: output + description: File with error information [default = stderr]. + - name: "--log2stderr" + type: boolean_true + description: Send logging information to stderr [default = False]. + - name: "--temp_dir" + type: string + description: | + Directory for temporary files. If not set, the bash environmental variable + TMPDIR is used. + - name: "--compresslevel" + type: integer + description: | + Level of Gzip compression to use. Default (6) matchesGNU gzip rather than python + gzip default (which is 9). + +- name: "Options" + arguments: + - name: "--tags" + type: string + description: | + Comma-seperated list of tags to transfer from read1 to read2 (Default: 'UG,BX') + example: "UG,BX" + - name: "--sam" + type: boolean_true + description: Input and output SAM rather than BAM. + - name: "--timeit" + type: string + description: | + Store timeing information in file [none]. + - name: "--timeit_name" + type: string + description: | + Name in timing file for this class of jobs [all]. + - name: "--timeit_header" + type: boolean_true + description: Add header for timing information [none]. + - name: "--verbose" + alternatives: ["-v"] + type: integer + description: | + Loglevel [1]. The higher, the more output. + - name: "--random_seed" + type: integer + description: | + Random seed to initialize number generator with [none]. + + +resources: + - type: bash_script + path: script.sh + # copied from https://github.com/nf-core/rnaseq/blob/3.12.0/bin/prepare-for-rsem.py + - path: prepare-for-rsem.py +test_resources: + - type: bash_script + path: test.sh + - type: file + path: test_data + +engines: + - type: docker + image: quay.io/biocontainers/umi_tools:1.1.5--py38h0020b31_3 + setup: + - type: docker + run: | + umi_tools -v | sed 's/ version//g' > /var/software_versions.txt + + +runners: +- type: executable +- type: nextflow \ No newline at end of file diff --git a/src/umi_tools/umi_tools_prepareforrsem/help.txt b/src/umi_tools/umi_tools_prepareforrsem/help.txt new file mode 100644 index 00000000..efaf4de6 --- /dev/null +++ b/src/umi_tools/umi_tools_prepareforrsem/help.txt @@ -0,0 +1,54 @@ +``` +umi_tools prepare-for-rsem --help +``` + +prepare_for_rsem - make output from dedup or group compatible with RSEM + +Usage: umi_tools prepare_for_rsem [OPTIONS] [--stdin=IN_BAM] [--stdout=OUT_BAM] + + note: If --stdout is ommited, standard out is output. To + generate a valid BAM file on standard out, please + redirect log with --log=LOGFILE or --log2stderr + +For full UMI-tools documentation, see https://umi-tools.readthedocs.io/en/latest/ + +Options: + --version show program's version number and exit + + RSEM preparation specific options: + --tags=TAGS Comma-seperated list of tags to transfer from read1 to + read2 + --sam input and output SAM rather than BAM + + input/output options: + -I FILE, --stdin=FILE + file to read stdin from [default = stdin]. + -L FILE, --log=FILE + file with logging information [default = stdout]. + -E FILE, --error=FILE + file with error information [default = stderr]. + -S FILE, --stdout=FILE + file where output is to go [default = stdout]. + --temp-dir=FILE Directory for temporary files. If not set, the bash + environmental variable TMPDIR is used[default = None]. + --log2stderr send logging information to stderr [default = False]. + --compresslevel=COMPRESSLEVEL + Level of Gzip compression to use. Default (6) + matchesGNU gzip rather than python gzip default (which + is 9) + + profiling options: + --timeit=TIMEIT_FILE + store timeing information in file [none]. + --timeit-name=TIMEIT_NAME + name in timing file for this class of jobs [all]. + --timeit-header add header for timing information [none]. + + common options: + -v LOGLEVEL, --verbose=LOGLEVEL + loglevel [1]. The higher, the more output. + -h, --help output short help (command line options only). + --help-extended Output full documentation + --random-seed=RANDOM_SEED + random seed to initialize number generator with + [none]. \ No newline at end of file diff --git a/src/umi_tools/umi_tools_prepareforrsem/prepare-for-rsem.py b/src/umi_tools/umi_tools_prepareforrsem/prepare-for-rsem.py new file mode 100644 index 00000000..b53d30ac --- /dev/null +++ b/src/umi_tools/umi_tools_prepareforrsem/prepare-for-rsem.py @@ -0,0 +1,271 @@ +#!/usr/bin/env python3 + +""" +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +Credits +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +This script is a clone of the "prepare-for-rsem.py" script written by +Ian Sudbury, Tom Smith and other contributors to the UMI-tools package: +https://github.com/CGATOxford/UMI-tools + +It has been included here to address problems encountered with +Salmon quant and RSEM as discussed in the issue below: +https://github.com/CGATOxford/UMI-tools/issues/465 + +When the "umi_tools prepare-for-rsem" command becomes available in an official +UMI-tools release this script will be replaced and deprecated. + +Commit: +https://github.com/CGATOxford/UMI-tools/blob/bf8608d6a172c5ca0dcf33c126b4e23429177a72/umi_tools/prepare-for-rsem.py + +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +prepare_for_rsem - make the output from dedup or group compatible with RSEM +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +The SAM format specification states that the mnext and mpos fields should point +to the primary alignment of a read's mate. However, not all aligners adhere to +this standard. In addition, the RSEM software requires that the mate of a read1 +appears directly after it in its input BAM. This requires that there is exactly +one read1 alignment for every read2 and vice versa. + +In general (except in a few edge cases) UMI tools outputs only the read2 to that +corresponds to the read specified in the mnext and mpos positions of a selected +read1, and only outputs this read once, even if multiple read1s point to it. +This makes UMI-tools outputs incompatible with RSEM. This script takes the output +from dedup or groups and ensures that each read1 has exactly one read2 (and vice +versa), that read2 always appears directly after read1,and that pairs point to +each other (note this is technically not valid SAM format). Copy any specified +tags from read1 to read2 if they are present (by default, UG and BX, the unique +group and correct UMI tags added by _group_) + +Input must to name sorted. + + +https://raw.githubusercontent.com/CGATOxford/UMI-tools/master/LICENSE + +""" + +from umi_tools import Utilities as U +from collections import defaultdict, Counter +import pysam +import sys + + +usage = """ +prepare_for_rsem - make output from dedup or group compatible with RSEM + +Usage: umi_tools prepare_for_rsem [OPTIONS] [--stdin=IN_BAM] [--stdout=OUT_BAM] + + note: If --stdout is omited, standard out is output. To + generate a valid BAM file on standard out, please + redirect log with --log=LOGFILE or --log2stderr """ + + +def chunk_bam(bamfile): + """Take in a iterator of pysam.AlignmentSegment entries and yield + lists of reads that all share the same name""" + + last_query_name = None + output_buffer = list() + + for read in bamfile: + if last_query_name is not None and last_query_name != read.query_name: + yield (output_buffer) + output_buffer = list() + + last_query_name = read.query_name + output_buffer.append(read) + + yield (output_buffer) + + +def copy_tags(tags, read1, read2): + """Given a list of tags, copies the values of these tags from read1 + to read2, if the tag is set""" + + for tag in tags: + try: + read1_tag = read1.get_tag(tag, with_value_type=True) + read2.set_tag(tag, value=read1_tag[0], value_type=read1_tag[1]) + except KeyError: + pass + + return read2 + + +def pick_mate(read, template_dict, mate_key): + """Find the mate of read in the template dict using key. It will retrieve + all reads at that key, and then scan to pick the one that refers to _read_ + as it's mate. If there is no such read, it picks a first one it comes to""" + + mate = None + + # get a list of secondary reads at the correct alignment position + potential_mates = template_dict[not read.is_read1][mate_key] + + # search through one at a time to find a read that points to the current read + # as its mate. + for candidate_mate in potential_mates: + if ( + candidate_mate.next_reference_name == read.reference_name + and candidate_mate.next_reference_start == read.pos + ): + mate = candidate_mate + + # if no such read is found, then pick any old secondary alignment at that position + # note: this happens when UMI-tools outputs the wrong read as something's pair. + if mate is None and len(potential_mates) > 0: + mate = potential_mates[0] + + return mate + + +def main(argv=None): + if argv is None: + argv = sys.argv + + # setup command line parser + parser = U.OptionParser(version="%prog version: $Id$", usage=usage, description=globals()["__doc__"]) + group = U.OptionGroup(parser, "RSEM preparation specific options") + + group.add_option( + "--tags", + dest="tags", + type="string", + default="UG,BX", + help="Comma-separated list of tags to transfer from read1 to read2", + ) + group.add_option( + "--sam", dest="sam", action="store_true", default=False, help="input and output SAM rather than BAM" + ) + + parser.add_option_group(group) + + # add common options (-h/--help, ...) and parse command line + (options, args) = U.Start( + parser, argv=argv, add_group_dedup_options=False, add_umi_grouping_options=False, add_sam_options=False + ) + + skipped_stats = Counter() + + if options.stdin != sys.stdin: + in_name = options.stdin.name + options.stdin.close() + else: + in_name = "-" + + if options.sam: + mode = "" + else: + mode = "b" + + inbam = pysam.AlignmentFile(in_name, "r" + mode) + + if options.stdout != sys.stdout: + out_name = options.stdout.name + options.stdout.close() + else: + out_name = "-" + + outbam = pysam.AlignmentFile(out_name, "w" + mode, template=inbam) + + options.tags = options.tags.split(",") + + for template in chunk_bam(inbam): + assert len(set(r.query_name for r in template)) == 1 + current_template = {True: defaultdict(list), False: defaultdict(list)} + + for read in template: + key = (read.reference_name, read.pos, not read.is_secondary) + current_template[read.is_read1][key].append(read) + + output = set() + + for read in template: + mate = None + + # if this read is a non_primary alignment, we first want to check if it has a mate + # with the non-primary alignment flag set. + + mate_key_primary = True + mate_key_secondary = (read.next_reference_name, read.next_reference_start, False) + + # First look for a read that has the same primary/secondary status + # as read (i.e. secondary mate for secondary read, and primary mate + # for primary read) + mate_key = (read.next_reference_name, read.next_reference_start, read.is_secondary) + mate = pick_mate(read, current_template, mate_key) + + # If none was found then look for the opposite (primary mate of secondary + # read or seconadary mate of primary read) + if mate is None: + mate_key = (read.next_reference_name, read.next_reference_start, not read.is_secondary) + mate = pick_mate(read, current_template, mate_key) + + # If we still don't have a mate, then their can't be one? + if mate is None: + skipped_stats["no_mate"] += 1 + U.warn( + "Alignment {} has no mate -- skipped".format( + "\t".join(map(str, [read.query_name, read.flag, read.reference_name, int(read.pos)])) + ) + ) + continue + + # because we might want to make changes to the read, but not have those changes reflected + # if we need the read again,we copy the read. This is only way I can find to do this. + read = pysam.AlignedSegment().from_dict(read.to_dict(), read.header) + mate = pysam.AlignedSegment().from_dict(mate.to_dict(), read.header) + + # Make it so that if our read is secondary, the mate is also secondary. We don't make the + # mate primary if the read is primary because we would otherwise end up with mulitple + # primary alignments. + if read.is_secondary: + mate.is_secondary = True + + # In a situation where there is already one mate for each read, then we will come across + # each pair twice - once when we scan read1 and once when we scan read2. Thus we need + # to make sure we don't output something already output. + if read.is_read1: + mate = copy_tags(options.tags, read, mate) + output_key = str(read) + str(mate) + + if output_key not in output: + output.add(output_key) + outbam.write(read) + outbam.write(mate) + skipped_stats["pairs_output"] += 1 + + elif read.is_read2: + read = copy_tags(options.tags, mate, read) + output_key = str(mate) + str(read) + + if output_key not in output: + output.add(output_key) + outbam.write(mate) + outbam.write(read) + skipped_stats["pairs_output"] += 1 + + else: + skipped_stats["skipped_not_read_12"] += 1 + U.warn( + "Alignment {} is neither read1 nor read2 -- skipped".format( + "\t".join(map(str, [read.query_name, read.flag, read.reference_name, int(read.pos)])) + ) + ) + continue + + if not out_name == "-": + outbam.close() + + U.info( + "Total pairs output: {}, Pairs skipped - no mates: {}," + " Pairs skipped - not read1 or 2: {}".format( + skipped_stats["pairs_output"], skipped_stats["no_mate"], skipped_stats["skipped_not_read12"] + ) + ) + U.Stop() + + +if __name__ == "__main__": + sys.exit(main(sys.argv)) diff --git a/src/umi_tools/umi_tools_prepareforrsem/script.sh b/src/umi_tools/umi_tools_prepareforrsem/script.sh new file mode 100755 index 00000000..d6b3775f --- /dev/null +++ b/src/umi_tools/umi_tools_prepareforrsem/script.sh @@ -0,0 +1,32 @@ +#!/bin/bash + +set -eo pipefail + +unset_if_false=( + par_sam + par_error + par_log2stderr + par_timeit_header ) + +for var in "${unset_if_false[@]}"; do + test_val="${!var}" + [[ "$test_val" == "false" ]] && unset $var +done + +umi_tools prepare-for-rsem \ + ${par_log:+--log "${par_log}"} \ + ${par_tags:+--tags "${par_tags}"} \ + ${par_sam:+--sam} \ + --stdin="${par_input}" \ + ${par_output:+--stdout "${par_output}"} \ + ${par_error:+--error "${par_error}"} \ + ${par_temp_dir:+--temp-dir "${par_temp_dir}"} \ + ${par_log2stderr:+--log2stderr} \ + ${par_verbose:+--verbose "${par_verbose}"} \ + ${par_random_seed:+--random-seed "${par_random_seed}"} \ + ${par_compresslevel:+--compresslevel "${par_compresslevel}"} + ${par_timeit:+--timeit "${par_timeit}"} \ + ${par_timeit_name:+--timeit-name "${par_timeit_name}"} \ + ${par_timeit_header:+--timeit-header} + + diff --git a/src/umi_tools/umi_tools_prepareforrsem/test.sh b/src/umi_tools/umi_tools_prepareforrsem/test.sh new file mode 100644 index 00000000..c94a202d --- /dev/null +++ b/src/umi_tools/umi_tools_prepareforrsem/test.sh @@ -0,0 +1,55 @@ +#!/bin/bash + +test_dir="$meta_resources_dir/test_data" +apt-get -q update && apt-get -q install -y samtools + +################################################################################ +echo ">>> Test 1: with --sam:" + +"${meta_executable}" \ + --input "$test_dir/test_dedup.sam" \ + --output "$test_dir/test_output.sam" \ + --sam + +echo ">>> Check if output is present" +[[ ! -f "$test_dir/test_output.sam" ]] && echo "Output file not found" && exit 1 +[[ ! -s "$test_dir/test_output.sam" ]] && echo "Output file is empty" && exit 1 + +echo ">>> Check if output is correct" +# use diff but ignoring the header lines (which start with @) as they may differ slightly +diff <(grep -v "^@" "$test_dir/test_output.sam") <(grep -v "^@" "$test_dir/test.sam") && echo "Output is correct" || (echo "Output is incorrect" && exit 1) + +################################################################################ +echo ">>> Test 2: without --sam:" + +"${meta_executable}" \ + --input "$test_dir/test_dedup.bam" \ + --output "$test_dir/test_output.bam" + +echo ">>> Check if output is present" +[[ ! -f "$test_dir/test_output.bam" ]] && echo "Output file not found" && exit 1 +[[ ! -s "$test_dir/test_output.bam" ]] && echo "Output file is empty" && exit 1 + +echo ">>> Check if output is correct" +diff <(samtools view "$test_dir/test_output.bam") <(samtools view "$test_dir/test.bam") || (echo "Output is incorrect" && exit 1) + +################################################################################ +echo ">>> Test 3: with --log:" + +"${meta_executable}" \ + --log "$test_dir/test_log.log" \ + --input "$test_dir/test_dedup.sam" \ + --output "$test_dir/test_output.sam" \ + --sam + +echo ">>> Check if output is present" +[[ ! -f "$test_dir/test_output.sam" ]] && echo "Output file not found" && exit 1 +[[ ! -s "$test_dir/test_output.sam" ]] && echo "Output file is empty" && exit 1 +[[ ! -f "$test_dir/test_log.log" ]] && echo "Log file not found" && exit 1 +[[ ! -s "$test_dir/test_log.log" ]] && echo "Log file is empty" && exit 1 + +echo ">>> Check if log file is correct" +diff <(grep -v '^#' "$test_dir/test_log.log" | sed 's/^[0-9-]* [0-9:]*,[0-9]\{3\} //') <(grep -v '^#' "$test_dir/log.log" | sed 's/^[0-9-]* [0-9:]*,[0-9]\{3\} //') || (echo "Log file is incorrect" && exit 1) + +echo ">>> All test succeeded" +exit 0 \ No newline at end of file diff --git a/src/umi_tools/umi_tools_prepareforrsem/test_data/log.log b/src/umi_tools/umi_tools_prepareforrsem/test_data/log.log new file mode 100644 index 00000000..e4b56e57 --- /dev/null +++ b/src/umi_tools/umi_tools_prepareforrsem/test_data/log.log @@ -0,0 +1,103 @@ +# UMI-tools version: 1.1.5 +# output generated by prepare-for-rsem.py --log test_data/log.log --sam --stdin test_data/test_dedup.sam --stdout jnfgioeurg.sam +# job started at Tue Sep 10 06:43:30 2024 on 4855b4607095 -- 07ae7548-56e8-4772-9b48-7406710fd838 +# pid: 28, system: Linux 6.10.0-linuxkit #1 SMP PREEMPT_DYNAMIC Wed Jul 17 10:54:05 UTC 2024 x86_64 +# compresslevel : 6 +# log2stderr : False +# loglevel : 1 +# random_seed : None +# sam : True +# short_help : None +# stderr : <_io.TextIOWrapper name='' mode='w' encoding='utf-8'> +# stdin : <_io.TextIOWrapper name='test_data/test_dedup.sam' mode='r' encoding='UTF-8'> +# stdlog : <_io.TextIOWrapper name='test_data/log.log' mode='a' encoding='UTF-8'> +# stdout : <_io.TextIOWrapper name='jnfgioeurg.sam' mode='w' encoding='UTF-8'> +# tags : UG,BX +# timeit_file : None +# timeit_header : None +# timeit_name : all +# tmpdir : None +2024-09-10 06:43:30,918 WARNING Alignment ERR5069949.114870 99 MT192765.1 642 has no mate -- skipped +2024-09-10 06:43:30,918 WARNING Alignment ERR5069949.147998 163 MT192765.1 673 has no mate -- skipped +2024-09-10 06:43:30,919 WARNING Alignment ERR5069949.114870 147 MT192765.1 747 has no mate -- skipped +2024-09-10 06:43:30,920 WARNING Alignment ERR5069949.147998 83 MT192765.1 918 has no mate -- skipped +2024-09-10 06:43:30,921 WARNING Alignment ERR5069949.184542 99 MT192765.1 1054 has no mate -- skipped +2024-09-10 06:43:30,921 WARNING Alignment ERR5069949.184542 147 MT192765.1 1254 has no mate -- skipped +2024-09-10 06:43:30,922 WARNING Alignment ERR5069949.376959 99 MT192765.1 4104 has no mate -- skipped +2024-09-10 06:43:30,924 WARNING Alignment ERR5069949.376959 147 MT192765.1 4189 has no mate -- skipped +2024-09-10 06:43:30,925 WARNING Alignment ERR5069949.532979 99 MT192765.1 5567 has no mate -- skipped +2024-09-10 06:43:30,926 WARNING Alignment ERR5069949.540529 163 MT192765.1 5569 has no mate -- skipped +2024-09-10 06:43:30,926 WARNING Alignment ERR5069949.532979 147 MT192765.1 5620 has no mate -- skipped +2024-09-10 06:43:30,927 WARNING Alignment ERR5069949.540529 83 MT192765.1 5658 has no mate -- skipped +2024-09-10 06:43:30,930 WARNING Alignment ERR5069949.856527 99 MT192765.1 10117 has no mate -- skipped +2024-09-10 06:43:30,931 WARNING Alignment ERR5069949.870926 99 MT192765.1 10117 has no mate -- skipped +2024-09-10 06:43:30,931 WARNING Alignment ERR5069949.856527 147 MT192765.1 10198 has no mate -- skipped +2024-09-10 06:43:30,931 WARNING Alignment ERR5069949.885966 99 MT192765.1 10229 has no mate -- skipped +2024-09-10 06:43:30,932 WARNING Alignment ERR5069949.870926 147 MT192765.1 10244 has no mate -- skipped +2024-09-10 06:43:30,932 WARNING Alignment ERR5069949.885966 147 MT192765.1 10276 has no mate -- skipped +2024-09-10 06:43:30,932 WARNING Alignment ERR5069949.937422 99 MT192765.1 10421 has no mate -- skipped +2024-09-10 06:43:30,933 WARNING Alignment ERR5069949.937422 147 MT192765.1 10590 has no mate -- skipped +2024-09-10 06:43:30,934 WARNING Alignment ERR5069949.1066259 99 MT192765.1 11336 has no mate -- skipped +2024-09-10 06:43:30,935 WARNING Alignment ERR5069949.1062611 163 MT192765.1 11426 has no mate -- skipped +2024-09-10 06:43:30,936 WARNING Alignment ERR5069949.1067032 163 MT192765.1 11433 has no mate -- skipped +2024-09-10 06:43:30,936 WARNING Alignment ERR5069949.1062611 83 MT192765.1 11453 has no mate -- skipped +2024-09-10 06:43:30,936 WARNING Alignment ERR5069949.1066259 147 MT192765.1 11479 has no mate -- skipped +2024-09-10 06:43:30,937 WARNING Alignment ERR5069949.1067032 83 MT192765.1 11480 has no mate -- skipped +2024-09-10 06:43:30,938 WARNING Alignment ERR5069949.1258508 163 MT192765.1 12424 has no mate -- skipped +2024-09-10 06:43:30,939 WARNING Alignment ERR5069949.1261808 99 MT192765.1 12592 has no mate -- skipped +2024-09-10 06:43:30,940 WARNING Alignment ERR5069949.1258508 83 MT192765.1 12637 has no mate -- skipped +2024-09-10 06:43:30,940 WARNING Alignment ERR5069949.1261808 147 MT192765.1 12653 has no mate -- skipped +2024-09-10 06:43:30,941 WARNING Alignment ERR5069949.1372331 163 MT192765.1 13010 has no mate -- skipped +2024-09-10 06:43:30,941 WARNING Alignment ERR5069949.1372331 83 MT192765.1 13131 has no mate -- skipped +2024-09-10 06:43:30,942 WARNING Alignment ERR5069949.1552198 99 MT192765.1 13943 has no mate -- skipped +2024-09-10 06:43:30,943 WARNING Alignment ERR5069949.1561137 163 MT192765.1 13990 has no mate -- skipped +2024-09-10 06:43:30,943 WARNING Alignment ERR5069949.1552198 147 MT192765.1 14026 has no mate -- skipped +2024-09-10 06:43:30,944 WARNING Alignment ERR5069949.1561137 83 MT192765.1 14080 has no mate -- skipped +2024-09-10 06:43:30,947 WARNING Alignment ERR5069949.2098070 99 MT192765.1 17114 has no mate -- skipped +2024-09-10 06:43:30,947 WARNING Alignment ERR5069949.2064910 99 MT192765.1 17122 has no mate -- skipped +2024-09-10 06:43:30,947 WARNING Alignment ERR5069949.2125592 99 MT192765.1 17179 has no mate -- skipped +2024-09-10 06:43:30,947 WARNING Alignment ERR5069949.2064910 147 MT192765.1 17179 has no mate -- skipped +2024-09-10 06:43:30,948 WARNING Alignment ERR5069949.2098070 147 MT192765.1 17269 has no mate -- skipped +2024-09-10 06:43:30,948 WARNING Alignment ERR5069949.2125592 147 MT192765.1 17288 has no mate -- skipped +2024-09-10 06:43:30,948 WARNING Alignment ERR5069949.2185111 163 MT192765.1 17405 has no mate -- skipped +2024-09-10 06:43:30,949 WARNING Alignment ERR5069949.2151832 163 MT192765.1 17415 has no mate -- skipped +2024-09-10 06:43:30,949 WARNING Alignment ERR5069949.2176303 99 MT192765.1 17441 has no mate -- skipped +2024-09-10 06:43:30,949 WARNING Alignment ERR5069949.2151832 83 MT192765.1 17452 has no mate -- skipped +2024-09-10 06:43:30,949 WARNING Alignment ERR5069949.2205229 99 MT192765.1 17475 has no mate -- skipped +2024-09-10 06:43:30,950 WARNING Alignment ERR5069949.2216307 163 MT192765.1 17503 has no mate -- skipped +2024-09-10 06:43:30,950 WARNING Alignment ERR5069949.2176303 147 MT192765.1 17518 has no mate -- skipped +2024-09-10 06:43:30,950 WARNING Alignment ERR5069949.2185111 83 MT192765.1 17536 has no mate -- skipped +2024-09-10 06:43:30,951 WARNING Alignment ERR5069949.2205229 147 MT192765.1 17584 has no mate -- skipped +2024-09-10 06:43:30,951 WARNING Alignment ERR5069949.2216307 83 MT192765.1 17600 has no mate -- skipped +2024-09-10 06:43:30,952 WARNING Alignment ERR5069949.2270078 163 MT192765.1 17969 has no mate -- skipped +2024-09-10 06:43:30,953 WARNING Alignment ERR5069949.2270078 83 MT192765.1 18102 has no mate -- skipped +2024-09-10 06:43:30,953 WARNING Alignment ERR5069949.2328704 163 MT192765.1 18285 has no mate -- skipped +2024-09-10 06:43:30,954 WARNING Alignment ERR5069949.2342766 99 MT192765.1 18396 has no mate -- skipped +2024-09-10 06:43:30,954 WARNING Alignment ERR5069949.2328704 83 MT192765.1 18411 has no mate -- skipped +2024-09-10 06:43:30,954 WARNING Alignment ERR5069949.2361683 99 MT192765.1 18425 has no mate -- skipped +2024-09-10 06:43:30,954 WARNING Alignment ERR5069949.2342766 147 MT192765.1 18468 has no mate -- skipped +2024-09-10 06:43:30,955 WARNING Alignment ERR5069949.2361683 147 MT192765.1 18512 has no mate -- skipped +2024-09-10 06:43:30,955 WARNING Alignment ERR5069949.2415814 99 MT192765.1 18597 has no mate -- skipped +2024-09-10 06:43:30,955 WARNING Alignment ERR5069949.2385514 99 MT192765.1 18602 has no mate -- skipped +2024-09-10 06:43:30,956 WARNING Alignment ERR5069949.2417063 99 MT192765.1 18648 has no mate -- skipped +2024-09-10 06:43:30,956 WARNING Alignment ERR5069949.2388984 99 MT192765.1 18653 has no mate -- skipped +2024-09-10 06:43:30,956 WARNING Alignment ERR5069949.2385514 147 MT192765.1 18684 has no mate -- skipped +2024-09-10 06:43:30,956 WARNING Alignment ERR5069949.2388984 147 MT192765.1 18693 has no mate -- skipped +2024-09-10 06:43:30,957 WARNING Alignment ERR5069949.2431709 99 MT192765.1 18748 has no mate -- skipped +2024-09-10 06:43:30,957 WARNING Alignment ERR5069949.2415814 147 MT192765.1 18764 has no mate -- skipped +2024-09-10 06:43:30,957 WARNING Alignment ERR5069949.2417063 147 MT192765.1 18765 has no mate -- skipped +2024-09-10 06:43:30,958 WARNING Alignment ERR5069949.2431709 147 MT192765.1 18776 has no mate -- skipped +2024-09-10 06:43:30,959 WARNING Alignment ERR5069949.2668880 99 MT192765.1 23124 has no mate -- skipped +2024-09-10 06:43:30,960 WARNING Alignment ERR5069949.2674295 163 MT192765.1 23133 has no mate -- skipped +2024-09-10 06:43:30,960 WARNING Alignment ERR5069949.2668880 147 MT192765.1 23145 has no mate -- skipped +2024-09-10 06:43:30,960 WARNING Alignment ERR5069949.2674295 83 MT192765.1 23203 has no mate -- skipped +2024-09-10 06:43:30,963 WARNING Alignment ERR5069949.2953930 99 MT192765.1 25344 has no mate -- skipped +2024-09-10 06:43:30,963 WARNING Alignment ERR5069949.2972968 163 MT192765.1 25425 has no mate -- skipped +2024-09-10 06:43:30,963 WARNING Alignment ERR5069949.2953930 147 MT192765.1 25464 has no mate -- skipped +2024-09-10 06:43:30,964 WARNING Alignment ERR5069949.2972968 83 MT192765.1 25518 has no mate -- skipped +2024-09-10 06:43:30,966 WARNING Alignment ERR5069949.3273002 163 MT192765.1 28442 has no mate -- skipped +2024-09-10 06:43:30,966 WARNING Alignment ERR5069949.3277445 99 MT192765.1 28508 has no mate -- skipped +2024-09-10 06:43:30,966 WARNING Alignment ERR5069949.3273002 83 MT192765.1 28543 has no mate -- skipped +2024-09-10 06:43:30,966 WARNING Alignment ERR5069949.3277445 147 MT192765.1 28573 has no mate -- skipped +2024-09-10 06:43:30,968 INFO Total pairs output: 56, Pairs skipped - no mates: 82, Pairs skipped - not read1 or 2: 0 +# job finished in 0 seconds at Tue Sep 10 06:43:30 2024 -- 4.44 0.25 0.00 0.00 -- 07ae7548-56e8-4772-9b48-7406710fd838 diff --git a/src/umi_tools/umi_tools_prepareforrsem/test_data/test.bam b/src/umi_tools/umi_tools_prepareforrsem/test_data/test.bam new file mode 100644 index 0000000000000000000000000000000000000000..7793c7e3635e9c6a428647be7d067194f1d9d522 GIT binary patch literal 11123 zcmV-(D~!}1iwFb&00000{{{d;LjnM70fmy^PQox4#fx{vm)Hx?@l^({R~wKZ$u_4G zZkO&4xP+~^Ry5xGYCe}?$QGBOcc=Zn{!Y$Gr?%VsxVY=k@0u*NBTVWES6tt4skknADPwV< z`eJ5>LjYrB7$bu~Xb0}k=>Xw2EkvHhWK-}q;zdu21``6QF3I-epG8_Po&!mqD520^6NV7@DeX#>$<_dF}pM*P-JgGE!-FF(?Tr{A67(_wKp=z#&5V}SPHNO>oDuliP znC$J!Zged{rgwP){h&b`-j?>%RqHM93U>o?!?_kC;4Hx~A| zb!)L;cfBm$THL$%Q8)JY@NfCV6HnZI&pij*hbJfdC(qwKIXpaG{KAckMWNe5G&*J? zVR4?9T^@IdR)x|!PTH=nN+n8Js#>e+Vj1CBMrBl*ziCQ8mZcev(k#nT{8?r=8D-@% zi^>cq{XcxgXjZ1~=g7V}!)W&4&ZU3%;O?cxJum$mJBuHD<)!PZdoR83Z~xejUw_N3 zFMjJ!U0S^HO_y$Z^P4X%-u9O2Rd0Im()-%2VQO}|zqEM&2!e0m`^t1*f6HPaC6}BD zreoE$Do(1rOk~`(f{RY&ji`0p=uQ`HtMamtN$ceA)md&b`cG`S|sVMeIb!WBECZy?81*I|m0R z`}+n=#kgd;$yvp;ieu3+)l|GK7}rYbF4l~9x#W2gOP$Ag(&&mMj3q+xMrhu&3DYGP z0KX`+6fmS2&5{6HM$088l$8!CI$;i)Iej(28gPs#Ib;nwB}C8u5V^Y&c`2mNUM`1Q zm1dJWw;b-J`Og66e?AS&TW7)iTo2|Tm;~^Dp9p;Yz~=Uz_E3~Cy=@yvg>7t%1K zk>E`+n5HRRK&S8`HJ@j$OO4!GMK70WMAw?D(lP{d0WhCOU=lVCOf~}Zd0XFfz`g7S z59S*euNjTKCIIvB9f^|)g$Vn4s z{hDCbmp(q7^ixKb?ct2*Je>NsPuRxmv+ z7ZsO6=0&WUTvu9FDp6GnI~4N^B(5o$-^^Oo%*bqJEeIqRqa?{B;@+zFC`DtqjDn@^ zl~u9jnF00C{e3{UJyw$210~rSDaq|yFWee{@)qa4iviI02CmS8_ep?P3mzxBin~VU zRU(;6bXh2++a|A+XmiP!&^0R(*(o?o9mlz7YOc6W0CbZ`Rm)tSS$VQ+bj`>(h3|9d zO-u~mU}E^zD;#=%_o;)4`G7<3wx=x?gzgBvz3r3zo$bY2V6Ur$Rk|T46i+&qGtsIx zk6BgLi9%e#VZUQZ3K=uqD#TK(=$K9rJuni)M7JF|#wt-442F@Bo@{BRW8}?82)$n# zO?L!=d)2G%;52&=N1;R1UvJ%IWJ41p`V@Ci+(K`)UXuPjxN~p372^5)0MFY;cs_sY z<^0v{fc2cjXuGe4EfrH$_h&1@CTxo9GuL=!hUH1ECuG+$&5;{%If?EO>2 zxOaGXa&)k`eA8k{X9O1^dGxPc7#%7m4JBEMg=+1!9c0N;F5* zu3|=4=0LoI^dVNM8rXgY%O&$T024-qLXj++2I*_S>;sXqoDKS~`pXbPq{;1Mx`x1`p(YGYZD&{dsCv=+z}+nBd`4ad;M zF}wpD1CNWC!5uVOLYQ#|v0Wv-O?Dz3)TUf&5oW0v~_RtX!29heEKY! zgj|?Q62d!yt`BIo;B)de!5597BhSkm(MCh~0b~hwxZgZ`9`()%s!;y~thx>U-saQ40($Q@>E1Jm!%}Wl==Pw7@{HG4hKLDC{PJQXY z_RjwC;UXi>s6k|$w3R~ehF}gq#JUBc!Rwqt;SMn}t6HI0soEB}31twb_zw{ZXrqMH z3Q)!_2%%{#M^jJT%!au~=>Wi4j%@bjDerN9_%k7l<<{Et6z4p_@dwGOP!hCXn0jId zqY3chBmoB`8U;86QVXSW0Y8)!QnXAaadKv$uZd93e84Fm>&T|mhTkTfwBfdW-yE2s zOvV$L^k81ReemWU%nxAf{{~Yo5f75|rt(q!W3?@x3i1C}G~} z4scZrh7(>1i~?+B4x=fF0OfGwttJZM0+C^9KPgWyVl*9DJD-h=l8(*jT)9j4TK?zY?by zP0*lVkmmQ2Fo^!(TF&Is2Pd<0Hf%h*SB7eL-r0GT>VNZWNHCYf(2z=5LYv6^G!5om zo?&*kj%QamJ$ThKJcD@e;%64a*gu@!yThHGo!z~~XTN8$;1U82z9(mrCmJypOg5_; z-5?T6I6^eoZK)*kfljta)wzV`U>Hf1NN`W5Vw$DH#KJkPvlH2sNV=EtU7+;#F-qT_ zqIB~JrEgDAaxySR>1U=gaP3eUeAm?IZkl(xerT=ji2RrfeT~5ZN^cKkDL^S2qf~hZ z^=ymMb&#c>b141xUqTof$P&2dz2ilLvzKqiJHQ)Oh|UX+yb?O2G-^Lh2PY1ISb`*p z4o>3&cp*crYE7qL;)uXUI22RV2veWsaIAPZR_)0G?m4VrMhT zsAmX!bb5)I@L;Yi8M(Ja7En)dd+IywFE=D2-JH|~?Z(Oz--rP!4llE19Otmi z0!Hk@2+ORGm@a?cYKD0S#=betFb|Ipw@)m;h%x{O#Y7-(qH3B_9myPrhIUeb{6M9* z?iwmmD%1zSfJ?+pEa5GFq^lTEhu7~^%zJ?dH7_%0jB}VQu<`4TYy$szZJ-y-@1904 z-Zr5Z|Jv37oEJ<_CK13FrU|A4(~#d?f=s+^!71s6M6LwUQ72$<6@(2%bY$f*93szS ztyw8S*PrQIa4s<6#(1GbjM&40gq*_Lp zRwxv=v64+**02W^!yQ3d)wU0r77NRITlG%;jNFus+Gb-3&Q6!QE4x8G%uCAHFI-_Q zfv^ixO~Uv`rsC=FBwepLj3M~9R_jEeKnTnL9^lR4d=Rq(x>%XYAkS;vwwio4v>bFW ztSWfY4sWURHV(>L+1Vi2z-IcNQ~z=r?)CY?GIPAA{pwAv|KwRTH}fA}*>8C6s&sx; zEEX@Fy7H5qlf$DOBb_|qQpGTsorY~hHV#-6%BkRwnndUDP9T+uweEc-wyk#Ps-VqUW zp>Pt!pO`vz2PX+9I3d+`yoFzfWP-Z9a(vIggJgZ=Gm+x(bvFGnbi!m~k+r9M=ZC#_ z?sv$t-h$478OkP~$mW-QNO|XwZvK!P`?nbTg+Mls_D=S;7vH273EhFss1ph*!Li3p zOIj|dN+@!$M$qycpj}K&0W=yaosoHku+n#5+4Fv~Ygmktul=!row)R~nBgB)pSf!$ z;xCzL(-z*V_OPU>Mm(9EnP4&YSc=FB69);h`JgOro2#8GYEF!bEND_?kri1zj4JAT$!4!~__*YZ^i{?roh;xOj@lB5W9A=KxmB@!l z2iis=WSXLuro3(?%(n2E>r=|-aS6cr%^)mJ5Mfn{0J|m7 z2MZ1v7v8j7(wsZ66bf`d;;{})AgDreiUKqhtOsxrLP2U|jk3GsfKEzPfJM{|(u@*# zRXO<}@1s&X_sn&D**+PYU`0M~wPmZJWv>S|>YEBiC9UBu_*c<1$XGR3Uh{tQG~;sn zY#HT$0?gY2l9k5ZA9Rs{u|7Q6vn;FNycRV=Rs^{@&th5@Jcgi1g) zA$NI~h?X)i$qNFphFMY=w=Anu$farA^qqwC2ND(i2Mm(3h#>Tw1pf(-2X5t?G_*$&Om z5}MBOy>l^){i6`g-R+~Jqs3*I!K(t(N`qx7qs+WAseRhF#HKDq$eiDgvME_GZciE= z%Lzgb(xn~t7`+)7J$LLU{&+oC`^OW6oaY=P^v+OwuMtA$D=}r~JUy~>VI!AEXt4pI z?;RuLoyCtlf3Rx(0PDYaB~>imH1{~g?DRQAY{|9}NdEw2>W z#`)ea8xv*=_-X%Edzoo^H{R9JROY`mz8ItAjg7bh%8v!h3hsR>DdeXf{qS9@SmK;F zZ|~{B9LyU5{K8NnEts~2VKZon1x-276fj5ds`93-i7l2aPf(|;62@z9kW3)qx)!og zuX>u5V?Ua@ws%`i$yUT}o@&pc-p#DYXPCiy)&1CM&LWz17I(hDqdBm3ImYe}(A?SE z-8-<(f)yyS)Kb&{86> zfx0RXE9W&-IBp?xNXU32st!`C;)3xO75Jox3GfGsU_Ws-r({mU4B)HdKQmw1 z=<^ecLpoZIF>QO>L=_K%X*o*&C+k0gtE15edinh7X?U_(JYPE_pM>$Bh4MMzc`gZT zU}@Lm|Ik{`y~4$%HcpjhZ9_%hM>Uu>LEtfEbvQ&ziL{LGBHB!L5CMSiO5zE1mB(SGEk#JmB*-_&& zxDBT<|NFwQ=CSn)VIG^ze&2fH@$5eoYUfqV{t8U`*$=NxH2ZYJ?5`Wo{-sWrj(3K8 z|2{YSdocDNf_uNae|UhR;YI3Zp$0$Rq2#P(&6}bF`5|NYP1e*HrgR9F)e)Uk!SwjUfgs5ddP0Wtm5_U%hB&nD!H0@p@0nF$xczU_N@9S-NS~%>T39 zRkvATEDrG8+dbYnwq~9+9APdblP5_F#T2SVBm&stM3x;zCynIDM5+X2RKYX~g;Jw} zEn13E5G5<*bBOz5lnqU|i_|_fHrLiuOe!Bap-Qu58VwQiw2Cu0#8En%&7oADGB`SY zR=0WEES}Zws@o)t|9_zBBRty}#cP>^6sh98fJFnnhaW)*OKR-{Pt01bd5rQlODfG` zh7+=CBuX+G^#{mioukaviul4<**$FQm^k;-Y-BNjF{FIi$UkJG5mTge(YUK6q^aX{ z?C)>|c)q|d!R_&U<3cTlF%{yuw{v`avbcCN@qIBBQ5hl%1RJ#=jb{SXL5F;e2vtE& zLjhQ7qO7Q)MY3D-$ajX2-MpbT zc=Hr8N9v6sdsnE^*A}uVAlYOxn~rF(nNfKbi#ey@UTdET$lkdLvg;?1d9D50j1~Od z-PP1~Z-4v1t;9K#eFK0J+*Q+5d5kg-oO#PAth_fVs+Xu71n;1@HP{wD**ICalf zU-dNSsbH-ut)4RK@%$&}^C~{z`Q4Ef^Z^fH945C8PYQU>>@!ZqbaTKHZJc(w@fgK2 z6P6(jE*q_MaK{{!n**L7a8P!8gJ?eDpu8VruLwkQe`oi2?_{xUdi3KGL={5_iDZmu zzP14^ib8d#kf4#*4ex_at_^IKa1w()nVRbUf3TTvp1P{W9+v=G^y{}20Dng z3(+><>vJOb5VHWZvQuRXIshO^1x<$@bCB3&$%iR= zt1!^XHeEJ0rqRd(N1j`a9RHp636GQgNty#ZfAutznarBZyL&tz=}jhKd=%=r#gj~? z0$b8(&?Sh!Td2Ay>MAZkN9Ao^qgoQP4s~J{1M#|1G6Bs}w6toyt(zPIEl_Yr%G2`L z&LYi9ANfRX;!9f%h^Cb_N4KtN^pD&Xwt{!=_64(>(9($JubyTyZ=IM-ujt=8@Bz?3 zRR2E4?hE(8**iXQI|>nrCleIcm02|hIP)APv0}(>x|o~A|2U;kmhTv3w}DCEq}~aG zoK{lmxCV|j3}+^I1}mZIXGUB`Q371e5SoilQcommp@Kp0iSfW6 z>N>5Ewc7-7Y3DQNV0x7v*1g!FrM)=n#k61W7>$K4Ez_(gLu@_I7WDf8+n;5_T{1WH zfe-H7yWBc$ZPRkHw#hd_K6lSa)m9uu+pHkoy`%X6;zey)S_IWHVq802CYd-oF|oF{vSM~e%{ z9UJo6njtwXSc@nV7#666GB|FOlnbt**s5t+BNC!8px!yJ;Oi3#P7w(_WVb%++Y-!< zb8fNAxZfsnJ<+xgsn^%Y>8tfo&K!hI4l74CilQ>Np61SRW7oHUR{4P^UNqCq@U010 z&N7a{l8aE2uO(P3$<3-7pB87%8^>(&Eg0>OaPiR$Sl>Ga>(xGwxyw4`>qjZ}yD;{K zP_mAW!AzJX+cB-;q79G)io{4q5>(q|46Ovk*ybg|aN0#C)<_g%su0AC>AHlj^0uhK zo9A&%D5^wtAlTq^5`MAwi&nJKY%LC}KDPI~lX2&j*CsTl&>y`B9s$*dH|=>b4t9n; zF9^extOjEr3Wd<(Y8KHH@wI|r!Xf0rigpm zIu^8a2I+T_pl;ks+Rm&IQWn4!0nkG$xcFIR&S&B}76r3BH+Tn*=#DwT9HRK07XxmX>aV38Jk+Xx)<3 zRJ4gXNR1p6X2B`b0f!-y%}mghBC_tsNlD+g)29x@)UAt-%zC*`U0J*4Z`Wi)Up`4} z{!6`**z9ia?Hz90ekPRUbyLV1>MG(wi@K|!^e8vf8e_aJA-+&}sZ_!mip`}~1vDGj zO4`|z$D&5y3G#^0Ox^Nsn|!Bcr>vz*RHC7?*m&iM-Al7RtsY17Zod!+mfe z&Y75T@JpjMe`>qSB4-K_+T%ht_#t$CSsCzhdm6J*9y#~%U>N&@rb7Qh?$lj`)HtjZi?ACrdXKYF||{4Q%l_n zKu-olo+McMZ2;ZY!S=3}k?kN{8jiu}#mX!{yng%65Br4b*X|xXptmvG82j?$(^>BB z?``kecmiVHao)j$#i*CFyv9onON=UUr6s(fp#4me9EQEgrO-8JV4_hyjs^9DC)z|7 zNvA93oA}H&o@La#FSm|v6s|cSERC7_|{1N$G3RB>)$tCjQV>gcRB6 zYq>@^=gzuqFeenX?a%IYfVBOw)9{REBeb9H@f-*g!uSP&RdaZnu3qY;sRX#Cgu|Vo zaD*~k)5^RP!_62NCBb`wE+0Tev+B@R%2Aw??rjd zqq_csT}<)VY3lxz*$C~uuNb7$qxpvz`%buj@9vK8krpI}u{I6T)O?P_3~?>^Jk~(u zbY-?7YZMr$9DE)}Fd<{mc@Rr#r&NHMQ$*2flZ{yy;J2r7&cybly1sie&E=RyiL4)q zCVRZtGuD^(Xs3NYL*#Z)Ht%s;+b>uUM5$LiomTE9n7ENMfryq^*FZfyAM74BAeaZSh#HV}bm(rS@}$+FdP^GN)Z? zi-a6h3u8cO7}rhPHnG|9!tNAnw~}@%0;yXG(S%tYv2@r_s!EX+MH!i22H|!OO6|{? zi@v-s&s>;dv9gR4&(yE zMb{=18$KbBQ@0R*Za;*yHzQ_Kugu(!Xufm5+`thGjLVl_y(*Q2?frpNIvhzvQbE%OhfIm@9oKyQ8Z94>p8Ku!e5(8)i7fEdmUxoAmM1C$gE)uBJ>n0oO?2|mH zuacC#>W}frR{hT!XwQ3$*Fik{>{*yc^H=U$jW@gdR_dO+hX>ooN46>tr^0vz00k}b zq9RkmJ5DB(bQqR0Cq}jI@}`FGso>X94W(y>;Ei#PY7>lSBWh6obvep^X884iY6_3> z(%1lx@@fu`<|j|{KTn(Gue$qI!%af?w?ZAaXqxs|M3hAhzXEYX4T%=fRj3{n5KLlN zxByj%*Q%o>4?5QhK+4Z zMn@MW(a}pBo?ri)1LNYi%%ZWk2GV)7x4n0~yZDOPFAKqU+fuBPgPzJ8DOD+G2jNQK zFVg~vU)dnIM@lG?j?t1RMSK+kcG%b&wNK=WhB5MFi*Z1iFLtK3%;9u?9>9>|Pd!Fw z)c>si950dEK=9|s2tI2=`S@8=1f4{V5&XG8BA)~V*9hYoRy<{+B{yq&nrwh-?u6Or z*`=noJ!-k^H+!+=K@*|>d?sq!Fib_u@;9V!mx&faDstV&dFQE1{anh*96VeEPI%G>a?&$eS0-#>7)w#%% znC0ME&Dt5O!`T!SUxD+jou0%PKWS;Y{CdM44xa09SNkVTy>Prndyc1i%7mx#Z8qQO z#qnSCZ8ik(`&Pu!fJtpOS0#i>NFgXhZB*Kq@*_SHoLi5KFLG|OM1i?g`;zu2Z#-FDf$&ne?GE}Zy1ezXjLwckM_)( zo^vBvp6$p^8(*_{)>^h_gpaui*7V#kx1#CKM|N#I`!}o#(iPACNvU5FEL=60cF17}&RAaLh-?w##)#HfPV@|m=d}g1Z6XD@AR}4PK&SEP%&IxZt9L89XKbNS@3;wEN9>0Wa>9g zYYrO6Z&@D=FJ8M^FC(nK6iSuF)$#`=_${gGg!atIb4q7vhiF8;9JHZgU>6v`Em%cY zp!P@)v*8+pc@fj)L?Y5I3j9p+cJ-S~&&15mta&>&`7P$q+`J{?O%Bbc|E(e8^$!}M zu?IplkB<%x4$YHBY9)x5E@@wLB3NYH6gsA5{#1+8tFoS{-KeR_h#uFYA;NW4GP*Q-QgQpPhgG`&&w=W_gQ<@I@R zusU`c5?f;={1dkBd)*)t{_$iq_HZB*$9ubnM~CM5WNjYjjM{0TRV*)sqBU3o#lM_P z6}aUNSrbkv&I{PAR;ZY$xkgkdL^IGMjjp<(^KLSg(=bTOsL$}%r0}Db7B}D?7d2u) zJT-5PAh1)R&%b>GTRc3U%%Hbz8$9TVX2I-tUU+lsZHqJI6=>6=gIx6-6g;B`o-eb5WQn0=lk}7KeYY;)CN?r zp?xhCD`31qWYP-3m^yb_(ZJS&1V&0{e%$sn zVbXHcw%)gLI4)w^PBWSKCJ?icmXXzZ{f1|hjr##U@7JF_IFbs#{*1;R38i9pcXw~c ztUudu?;#~I?d~YR-Il7Qr@;V*TqFgj;Gz^Aim)v%QUWpUxvQ<`Vq@0@VAKC`{2S1V zx!%G%HnghWu{A-`$l-bj>%WJa`uCQ;w?=rLG*(I9?R zbxynUCZKPW^s64%Ky$T+iRE7!+NO`Jf7r!GCbQq8**`Lw{l5w2=;~%)K52))`!xG} z!|bmc&;AatMSo{to!-3|#$+)2o!#A&qwU2v=~*9nQo+yDGn6_cFtyA(=v9rP9H)NW zrmA(B*PvhWMu8}8k=@ihPC8uJa8+MG@pkEYEr!j4GT)IkYIku_r(dyPwsPx3_Hn3h z^Icb-3$lW2Bx7zl$EAHB^^P%8kFV!w9xum8x#;L0jF5VFpd}8euUtJ+PGOd#nUpiH zM`Ukbo$Wy_%kicOhPJdHOJ(=*Q{T5TKJN(i>AP#Uy1MjTA4UGJ2UaVP zAH`S|O4RZGp{wVj_?6J=^Csb_EV7d23T!WIFwAisqnMkE5)4%fykr+kP%ka&YNkg# zq32YvgcZn~HL7Md6r#k2C|T+WSlTC&xeXYp`5^5V45niU({kud7_Es3V$w77UO|_^ zCr*nZZ=S8^e)NIWn4A#)Z=uu;XaXad04Az-OD44dDMgP-sTnRTrkA}>Hs#Is@oKLvYae(jk6<|{PWPAiLx6PJ8`)`*rM~&yD-0Z z{MYY$@l0^}uI1J+mD}#H!t%D!l)oMbi<{m>W1>{t(q0BygDK9tinI@ZOiL-m1+zwe zRubb-l*p4CaGO;jib}igC^r#vV@#2RnuwRS&ScKpCkokHow>&gndc0-&B4tpb9HK; zSQOb!Ps@Hgdfzz}Ekj^#tnR(^F#vP&qAxbJ(YnI72H z^#yD(5NW{4bsZ4lNd8ewP)Sa!BRVGX28LbasPfS>S?Tfcw3joZC%n`s{{dC-lgVcYWLvu4z@E19ArW&A#001A02m}BC000301^_}s0stET0{{R300000 F000^eh;IM@ literal 0 HcmV?d00001 diff --git a/src/umi_tools/umi_tools_prepareforrsem/test_data/test.sam b/src/umi_tools/umi_tools_prepareforrsem/test_data/test.sam new file mode 100644 index 00000000..6465827d --- /dev/null +++ b/src/umi_tools/umi_tools_prepareforrsem/test_data/test.sam @@ -0,0 +1,119 @@ +@HD VN:1.6 SO:coordinate +@SQ SN:MT192765.1 LN:29829 +@RG ID:1 LB:lib1 PL:ILLUMINA SM:test PU:barcode1 +@PG ID:minimap2 PN:minimap2 VN:2.17-r941 CL:minimap2 -ax sr tests/data/fasta/sarscov2/GCA_011545545.1_ASM1154554v1_genomic.fna tests/data/fastq/dna/sarscov2_1.fastq.gz tests/data/fastq/dna/sarscov2_2.fastq.gz +@PG ID:samtools PN:samtools PP:minimap2 VN:1.11 CL:samtools view -Sb sarscov2_aln.sam +@PG ID:samtools.1 PN:samtools PP:samtools VN:1.11 CL:samtools sort -o sarscov2_paired_aln.sorted.bam sarscov2_paired_aln.bam +@PG ID:samtools.2 PN:samtools PP:samtools.1 VN:1.20 CL:samtools view -h test_data/test_dedup.bam +ERR5069949.29668 83 MT192765.1 267 60 89M = 121 -235 CCTTGTCCCTGGTTACAACTAGAAACCACACGTCCAACTCAGTTTGCCTGTTTTACAGGTTCGCGACGTGCTCGTACGTGGCTTTGGAG E////6/E/EE/EE/<&k`Z3G%vZz z`gZt$L!Wt1VA1i&B!FF?QR|jkefx2a7c&Ofov>{l0}foaaNVW1@Esg_Ol4BxV98=$ zB_hqBZwK{*A}{lT?StqS(p4A@nWQ9i-~~Lmt^zt&mwIWPk|&aln`5a z2-ND}S2!S0=Q_=`eV)B(dKhaBm~TrWd2vo9aL#RX7gvf(X}V~!JDyirdZdd=Whsey z(K(Jg%;_a_0!Mo~=!o@PcNojD4(ABzYC000000RIL6LPG)o{71EY3$!d*c~+l!F*6L6TQ$|ghC)(RR>sZ9 z8qWIx1Kz4z14V>5nZYiIG1Uf21QWVN7_FN z5?aSA34KY^LSGjI^h8}6(8c;Wy+q6O6I~zRvIH+DiTX`+b&9pL2$t$QJ)^%Xp|5z3 zC3-tn@C4VsY3I^Ee$(!y#l0_h>CWOCUVLe*yzjnz>(9LQTYuv1zxJH({Ps(W*S!AH zliu)#ON;M%V|mZ(--Op5b8ypG-naPX#enbfD_{A_1j~0`zgXk|h1W6)c^pPrR;#I_}cQmOYg@0UVGEc&o%dZ!*IWs-u*%&I_))!MWk-`HNEIOhPAJZMQ3Mc z|M+NI$xaxTRoB!xiy@>=b0osgpI*+>B!DAxBj>R^L;&-4=#Shfcc(l7mNG0#$fIrotzvm-Uu)m595}} zl8ZWz%1YK5FIz6Uu#_^3!;sfu#>0rkSyOj1%9}E3S=Hdmuqf*~6B#c!YYG--7gF^o zmIgp2Wa&~rR}!UEO{=Q`-Rf0+v`NjB7Ip}Q8faV zdN8Dgz41?aH-70uH5o_#wwIN9GveAq(c?jtp5a3E<~? z3ORt;g$%&d85{vz1avIy>XJ1%K2;6_-@C99h_lm+7m zOmf@+8JLtJ0176S!4k$sn`|XGctKmh-WO=K(gGQnPeK_pKw{9e_y-nv`rXUr@S!AQ z)m#prl9kJ!*%WBpJPI_dvv~S-gXgu*;E6<#gx@g7a^=a9#sApQ@e3{dXBSZ*ky!>elBBbT4$^e3`TMGEdPDPY!nW z77xL`dCFRbVI|=cr08lXG--2m9tdjv+FaMv&MjoZ(zHf}#Vy60`} zAl-ZZF;5@Qd(W%?*;j;wRh$&}{IWIoe64%W|H)Ai8x&NFL6v^j^J7|VQo6dS5KT2A ztTNseWUnd8DkR2G?$;7FJIlW5P6jR4;-v{je zqVJB_8h6CDPPPW5q8;sy-o6+R{Vp$6)9C#a(94B@aEH8TqBv||4w0X6B)MG`v#M)Z z7)G*#A8HscncyPJqpr?mDMg7ev20*hBX)uO4`m!hs?*O@xz<#>lSCQ7Brt}~q`y5# zfs2ANl!Zu7v~a=^Q7>qQ2<@3>{^er|ozV)5*;``Oz*+27e# z?x&2AmDD6h9gEvA3whHu@H=IJ)D9E^L!cB1vMy05n+~~en75&fK@@}pWh|SPTvZu& zI#)3X$x$e8%{A7hN{`X_MWbmZeNHPg!6&%PlEJB^{P*hvAGAAxo$k>ONG!x40fq7pn*4jTV{2pkXWD_i#SAc->-Hdz}a8;o-^A!Qz1% z77G$UqIohNA%?F+RU%U85M4G^7R69-#=;!#1$RfJLXpEj`C8;|HL4r5`bU zVvNL1j>G-L2#Dusm*E2Op0UfgQXo!yPs`b>lC(%qfrWJdn}E2^1;VCsFTXG~SEO`H z4+yPbid#AD&3B@#VjnhH7 z2B*z0SSTOelox-^C@;3Y^qZoowR!|=PaXTx{lkO(1D(o6oT4?6RSgTfMzo03cq<`m zSq|%IgtbULFn9`n>LO^W( zZv6==H?Ca@tpD{}XCwZsFMTAM3MXOwTVr2p@Fd~v(J0{?83`R@VEkgia*;ZXfAx-tXeEuu8cBXelg^`)2tTjO2I)c}OX2^ko!K2L^#5l7l|#D@G7Ze>(FAveF${+1!lglV=&l--)NPdFOJm zcz7I`9BhN*Kh#lk6vyBR5zB>D$C2!I9pbiD#<}24DByKOjiflHI1s8GQHC7J1_T+a zGRlHmq{m^|@GiV6vN`h$HvC#cNG3X`Q8D_!bF0wimo|YJjljILH2`Nt^RF)kfbSUl zbOUBeb>P!NRiJhUmFAsNeoFL^iXg;}@gYFVBa>~=?JXUxOza)^Y8lw2v%`DpkIH)U z35>UB4H3r6i)5G|h#osV6ejaPT=*i^X?#a=~;vT@5K{+!HH{FH64XheG3A7cJ zVd#7(t;9`POY=dI z1<|RY$?Y_+FDgJy9=DYeZlZ18O({C-96lNA3%9&9J$}eOL z(GYaEY^a*mh>ko?gm9CC?J7}-sTe3lDWKvUbPj^2S}>8v_zf9nVRS(e{Rw%QK$T(= zgNe>o0v*q5^-j!<2YNk8hakQWiUJ2=k5JaKzH}ndw3rirtLGotloY#qerK=eB!|{q zT0K|t>cOdHll1(&*da?obs56UQ7h~+Q5F|dTP@L@2e!>+X{vO_bgJdTEUB`el{70^ z=}f`Y0TRF1?XOrm*O*=zTLm*%etMIhUzpYNi?>bNNPYxs|HISsqrIbT?Z=zwRKPJ0 zxTeC@5a%Xu;}G8|DN_ITWpTA0!TjgZ1w`bJ;;KbXOU35fiJv!%iURE$4-Xxe$8VTkj2DiW8WBK6X#m?G` zeZf53J3d}~4CSKJfq2;pon@dM)=tY}|MRl2mnbGW&`^$0!TmkYXK{XXe0xq2`O=S$ z?rv9`F!%2Mxi2Yay*t~SmyAA%Wd0NCzmkO+2P#Alq^G#ME%)wnqX+jqW%%s=(qdTq zOXJV(aA#*{cW?2rCoC4AaX~rdaTW>|OJq{W7C;5d8u?btkcc5=5a6dWh5!Q8I}^|x zEQ846vH47$kI?&+W*^DwrIlLIyET)#DOCbWZ}w68(io+{LFr2)l(w}DxG4RVu?$>2 zl)RfslR>y=?snFirsuH-))MRjr8loZ=|&eNYg2#m4-K-^+tjBo7LR+fghKh=v9hV) z+Q7Xb&Sa7aq)1`e6>*IG2uv`c7DpKiK`-$#k3rputZh&i>wqOLMc$q>@ORL&j+vld zJK~@(RvFK&fpDrSh0T@%!>6gY=+aDT>xH(CnDm#cdKQ`J#4sL8g;UjfL4T5Ex(rNX z5!rPh`RhJCmwLq+*V(6A<<@{3VR;QC?uQlwzSnpXr?IT@>H|089j%9pLNXoDfs2G+=*|QLwQiG~C zs*5p4&;@GVO*7A^h?6j=UkK(dR9{_MZbDEw*Abjw-W1FyqhQ_!pbvcQl$IliKlk`p zCN-QY(HtQXc3n$i$6)YUfog?BMp?BrwWM{jL=Fmr7(y0VgCFTC)VMPQQEeA7KRF!T zeNwojWaaAc|B~h9s9|IUsLV_tpEu_U;s;r5;-jwuo zH@~K0;iOEded7J{RM#;~Z9{*x_SODoaGV#;$Qaw|5Fuwp(`q^){13;XY0xB14?)0J zc?bBAYa+=6$zD<6tVM}~qeKKc1PnU>j$=YPk`Pc1h&xedMFkgChAdVoEP=lWvyeFo3J`bRXL*E-dK5WB%Pwl*$2sa4W!80!R}tqW{7@ysoQG-$gW#DgMBEC7 zx`yN+m8cdBdz)Q254q%au^7Xcx*So^zoI#nK(}m=BU8@{<=IpZnrcZ`n_-kwSJVMH zDk%Lg%zE_|MYpuKpcYn__PmZS7PpO+{`hG7WLIaMXNFToEo+;w;bF_DT8G@fri{O_ zk+WLB{IGy^KEG z%jiJW5x`F#he$?77h76J-$-Fn4hDqwf2_i)g%@fXA!|fTq64wlbqMEU{EzrEit`XA zqJ)iz#bt_>s9SoO^o=@+3J?)KB|0pm=5UQ(YJ;6rg-VHwJ>FpGyg+#`-PlI(>&x5^ zkHRgVI06;tmIN=kf=&&p=ySujxiFFz9gV~993Qs=>K$XVuIyFZ`2zy z-#~}lnknwP#-X z@eCwqV{?77ceKB|ySV3iq-WPGR6n#t;@VUKwdfp*p&B2c6o`J2oWXK8$YEEG^5BoXDl>O|cz?aR|s|Ywh@Fjp>FjhVTrrHj9 zZtUVLL%s%ESW{;O6WK)vGILAIaS)K2=pilE zn1CtKDbg~vk0kB4Vks=u7D*FqjigdPg1~C{5b6!mPiA1b21qNgtNNnW`-Z^a`QvvC zy8cFk=P%x&t-p)s(ca14_G0k_(shKnsQ-Z`K&2Tjlgm7$e3$lJWei+_$Y{$z1KDjU zwJmuITNzKH+w(nw73=8`tm(f6GWJ@ZN`7=@wdB&HpK|WrmJ@7vMJT<~Q&^1%g;$Ym zQH!f$qVb}zjp~MSeoj5>JVOB=vXenmFD8MBnQY&^39e`-joAwy1kPs2U?m7viC()Q zymBiFKkyX0fwBL{U$c0NxxsJwql?A=^FH$9!~K0&Bz)xK`vC3Jk=Wv5oJ=jJjc@xY5`lrq+ozfhz`Z`SW>rs0ldmAT@L-eiP;=&wzisX6qS_(eT#}} z%N%xp_P_@HhK|u_fx!mpXWrC-sW-K9_8&L-zT0No1phO@ywxj+)7pEy?TFhu`-dlc zx+2aQt9S*j7VK^XWWg+=D8ulj6aWh-!(P+YkdnjpK<=_O<`k|)LQP!ADkx!xbzKoR z7KZ{+o3H4USMLcnNsVP00|%6JB&hZmnVB1&^qS!Sf$H2&^w18ouO~4+kMx|DfTlcK zXWIzP?XhUm+UvZHjt-jjRYcPaAW?Uy_%z&_y4zH3Z&i*gHP|i#BVV>~le&GX%Otf@ z?l``_qTnj~*)r~kM*+o;Np80!`k=wvbno`bab?M z0A}!tz&u-$ke=~nN00q2VuaBfd`I~Cl^y+;K0iXpI6@bp@Ac*FDj}r)u_~BaQYJ~y zY}#WwFcL%2+E%I{9oG-t&2!FU>AqxzZmE~86cIwtSc8!3Nk9JrgHZF#!M?rAAoNMB z-M;0?(Ba|k!O3D8U!}dY$^bDElry!B%;tp0zMELT*ip0a)y#&Q&+8I zVM|2?nf8u#UKnOmBMXELF~?hPj28(~*CC{VQO=|%Y5T9KTNTth2d8}Y?Q7}-xA$@V z-+ShujyA6UFxLK=ua1YiDB`QYejPeI+E_^)9=6_puO0`WBG4E$myB77<$8{k|n zmu7EoFa#b0*=17Bz)X?R0+$Xln)G64MlR@n07KQ961NJW6Fpo(H|v1wP0YUm*U^M2 zs*3~7AK&D9FU(f2KHQ@@B=ID%-{70VK{~G(L*B+6-VSSex#uFRqKz%SBQP`hdFox3 zx{WRF5k#xRzYcP_xdRaUf(?=fn@l=LWQM^|Kp~O|Kj4Nn-ClpcU z)}WM(E$s&OH0_HbCcaS&o7>r=1vXe?V@DrAy85Lto~I}W#oQZ(; z>V8%m8L}uMik#8TVM$kUQ3GxeCrWj-vsUxmw5%dPRFtqCX=f29jkHa-dg@kc!gDi` zCLJ{krhRxG1jC>J^D&G){m@=y#Pi2DmHuxSmHurK_^0k1{JO0IejnEUy>C2^kB^QI zbQ!RTTN$-&jPy688bmBgmZ8i-rIi|~a)X?inobcCwU9(=qM&Wqh_bto_PHaq29E&F z*<{RmvY)Q2B*rtF9_U^WgQ9m@lotZLL^t>ZW=xI!qf!%%WzxS@U#42wRn~0!tJ)rUp}Q7F!QmtU>DxXpr2B zoE9ot6?N>X4HBsT$Dx{FrqMCQJS7^=IHM$WOsTqMjaf2M&-IjyF&6VPVf?NPE1DnN zwZH3+D%{)MJJ{2nD2nSW;sT(sIEQ3{!fA6vOrVvsis%|%wKQ%4 z)ew{in~non>ZPDr6%w*3fn>v};b+)pyPBI?|CkN368BZ_3N@9m=c&Ed%%(rW^WB>a z=W^C?-tnxd;UtV-<&W3Xc&a_%ISghcM8&hRk+IBq3q@xYWUk{;$Fmq^GFDb7NCI1i zRH*Gju%T)ncn6?cNN?9wB5Wt}l8iif9~7lyACh|CXI?t(!(*7ObiR9&bY4FzoiCq} z&JRCpLM`s>9PAwJ>C)|))&Uqtg36&Z1q3D`fyk>8+T`a zZ(?4hpAmQ7vN7(wbp+^_40ylL1Nxf9tDUu1`wD-2vU{+*NNJV`3$skX7K)k=aLzagb(2ZeGSu798F2Kb)vgZ*4+Iy?%mrcd3$FXCBGhP@Ap1?q{PSDdOHDXgfgbt38<3s5Or5p zL#tRPA~?^mt}?hv-ee(u4h3AOj9d5v1bJmvf*V74m)FQzqA*gn-Xs?)Xw`$7EN8E& zgym8%1$w+zpu=M`2TNT*om{3*KClBE&5&T#XsBlVs{NjcsmdtbQT&1$kCz(GKe3Y= zzZXzGYh9Fl{U}PdRN}D%=DQ7;_hRjz`e5!K9_$@2F4BHC0+OurSM`OY*5AxHY|$2~j;rzsmaN-+I(9!g%J z9Lhvif;a#~O^&EI5=9iFI^1=w>x(+v2u!%|6pSo~ zV4Cafji$30tiz``#XCm7)sQu}oRO@t^S55_25-ka+nuW!_@?O?*8}s{`2E2dDTlFx0+h+ zPQ$UkxjPw*fZXeN=U9igxJMMuGt^JYD9hm}QC`Q7S(Xdh1|7yowU}fP2&ynHWf+Bc zL4eglY$j_EfibVLtce;Pjk?!ukm-f4zNXGbtbwq@V9>nrz zrwVUxolO`1=I&He3FF8UEC|3Vb1Mr6eOH1Umm3D{L8T1i_e!4Obv zhZGkYoP{NACgpKg*N{`hL;^9OifUHf>N?}5K~CJ}jSNxpYBl|b&lgFPR{MC8_Ivwtt){CPnPlH&NFBG-Rf@VM{?FutWW%tQtY+YxTQ2LasaSzaIj+hZIB^!E2!(7L#ffpnN1MRyN`*J zTK}+-^w!q-*!x%QO-*NuwSVF1`u@)D@!p9&+2o2)M7^^FlOKauO;0^>$0D*^d&Ssx zn7&UV)2S&sr-f}_w1lXh+ifj`63_RaZM4?&w+7^_c-nR*Lik0Vx*IgN)gZ#CKshc1 z6#)vLikVFf@)O15wgatSNCey9nZar?3ecJCx(JkhEkIttnTs509`b3yF6s=~rf`Xx z;;oC2w%M!b2B8il&-tXHK~DPhXY&l3`|`*>8T#`X^jXT;Q67mWz5#6-`(T+uHL5;#z($n zYCOrOcc^;Q5@iqMGYy23=T#ZyC~IU*R&}h3!nTpf4}o}9cOpi)ByVU!e^b>NOaqSv zux%R_>AOf&DaVMPzVW5bgM)D=t0~*97!wqjD|E-~nfF7elkS#EG(WV-r{6s5(|_w= znsa{g4#PiIn-=UR9nQ* z&mZ-BMQ-D~2(uAeR+Pr`|!S=Gz5kw^6dR z^Qg%^1xr(>>oqw8GIO|w1jZ#kUq z82BjGerIx0+3wDXMw2m76hagSOvdV* zvbII7IYh+SF~O&`s}9mJfv zLPNfXCX#TJGhr_2oCskxi+5fddHwZ&-bkI+)t)_^X9FV7*6AlT|yf|x|cewJ;q z)9aGiIDY=u4TXCrMsS;abh9!QaVE2h=B63AVBXOi&W|2W%L*i}w@uD{Q=+Lll+Otr zY7s(a>c|rg@s4;G=@U(qf~!r4`lQsQZaYv!u9`2D^h}7^d78F7{6fQ}xpnu2x zFpR4I<|d#1HM6lh38LS5Ljrrj$5RWZirvo$T2{l*(tx8T%R1_;N92z5GnWz5yEn}l zm3j#d-?A=1)9IUN!6b;YdCK_nHFO|fVD%9L^|5uKRDKtOG<={ zsA*Apo;9GwBNSaw$ddFkHH3zF0p}!9geiF|XmAm48Xh8Gq9Pgoqv1Ox=wr>KFAV@Q zcC(L#hEb7vX{j#`dVkap%@}QhPt>`NCXUn%in=qmU(lrt%@(K%hMruq+&H;pIns4Y z$$j8t+OtRqf7DCV4VucSXEhX?cuJJ=K~k0+m5PS?zoQJO!U@@gr_J&6f72ZxGGYr#}3h z^iFi&|8jef(ejLkFFeD*`E}18{E%1-YhvLG=gHpT(c%I~;hNGN3FiZnw?RS)40Az! z1K>TS%$Y=B)YVN`^O$NqFbo+hktD^Gha!cV^FC9_+~niekQ`G#?Oo-MvZNiTUj$mk zk9_4hGuy7dI0DOTyK=#L+gE(qyP9APW>1BbL1Bk-+L~%=yol+Tb?OKDy>`4^VnTR- zW7aH+sblyEtUK0#wdI0k&E8w@njwI&_FiAIj*d}s&?E2ih-#!zH5yb&vnpf*HoeIT zkU-=;Xs`#5I&#w?9q9`-Wldg%bS_Fn0CsWLq6k^eot<#DjLxHb3U`;JsTB5mKN9=h zDOgy8H+U?)@%`;*)6=-K(F!m z3~~7BA1jUDs0Y-JQArGji5Z`)XSDtN+4e!(K>SyFGzTdqgbzI_HE62MLk*o9SW>$k zox(_`SR*Wlmj)k+WRQ~X7-EVnys08iqe8)3#&O0Wy>td<&4tJzsz}E#Pj0%eUOD}i z>0sW>rS9G>AHcKjzVaScI)8psAb!gz5Vu+4zpMu7w4viKRue{OcYAN|a9i(oA^JQ- zR87s>vd%>X<>gT>dDT`h0YYSz#2Tv#AP@ByT^ENnksLyHIrJPtE9hiqI~I6!|6?$c zx##{{hTc8%8-{lcy6=n#q0X2zFrQbD^aosqB-#BZ@#oxY;Sp@N#{&$nuNR zBJn3}ooz!wKibOy)arTyK!4KL^~3#>oo%J-rv@`y3np?(I}64V5GKdd16+#n220(q zX0_C)xRy_~^-{I#((8d9+Nd>KouW=+Z(vLZLWG{D`rH;Z3RX{l#2TdY2RDrXcO6mM z>pB5v!L+(AvGxTI%$?nXgA?u6SyPrYz&d11f~Eio$(op4jL5sJ=s4KRk_H1~y`^3@ zI)+M=a86xI8!bEFo8gURSwv@a>%*nV#w$iY_7du@j$NPL#A%82|9LjD^2sim%5uJI z@#e)KuJ`(u(}?C16q?N$(2NE&FRL^816|Qc^ia#xZerCBC0V+>l@23cGzV8K)yTud zUY+Q4eQ70CpWCC4II|{Ik2$;@;QXLFvFTx8^Wx<@=EfX8Yz6Jjy`T+ep^$0(*yA2F z+6=10P}a&$OjU#i>nJmXC^B>icS6bw5r-pmMyWIY^Si z+19Hz;s?5*qFqa(wN@$n(&W)5anUCy4Kk#A^3y?_T$4NVu$y~lY~cE_vv#z5n6S#}{PNml0+jf0(p%yqytkC}iF zhRRHeOz*bUo0!e`P?~b0Bay)DQFR{4#u~S;up6y`^~Mtoy|6|@FZBHZf6xQ>VBa5b zxn3-u;(6npnhZ@t2UJ68L}dSD2hnLbkJEGW^pJ4UdtLdj0l1l zMncpr2(wP+Z5?tmWYY{AwT^Vy0Z!kRW(=4*03y)yifB}?9$chnLj-!}iAtFhJwPlm zN~&l3nUOQL8(PhcNc8rP)c9lgW&O7T$uIhc8%2cV_ka8qcg^hLje5`VgnF6~J`-#I z%J&=xJI8z0b8G@J2Z(Ylh;3cSCigc~1slcY4A)vg8@<)}+su*2iF1hxbyPz5Gw$A* zGwg}3r%~b?u=X21nn#Cw+j>v*l>x|MG@CApI+)85)3}NmOuc3Ig86Ny&UJ|`Wd-(P zOUb1}4|U?aMPCFw|I7J?Nj$U1bI4&xV(<11xi&ssk0W=Q7GlMkcY1Vfob2 zCL!E&lgANjT3(cqY>Em-l2{Ff zoUJlrXB1yDXkP0?i9dz4Ydx#EyM3^`d$M@vjha;Z7zy^dg;Lhw)L5t{qSFB#r7NU? zi-dW$l7jFzh>yN43p@3C^6s9tOJ|F_dTs7>j;F6_yLkH6uNwqk=;-`C&f2}c&L2{D z>SA#{?b|;clqyTGai0!MObC+(&XA_1)$E|3-mM3ErutiwnBm~K$-DjCF%PWFelI=n zY4;Y7bewXpv-Xp|8tPk2&v1*m`Zp$K{Exo1#2k@f`$0@vR@mJ_hV?g2lWt56%UG&5 z6Qv-NyZo`0?eCX<%%0c#EsiVvVRM%cVeRwY2Z`I<+eW!VZOPuK2<40-AP{3=j2JD0 z2gNJl1+{spKVq{FD%3<)l_#2gqCsEsP-y}+*d%@Ss&YwHp;JK!#T|h{ELFOpd&cy@ z8#T#&sUS9e@2Sc{=AnR@R8+sUriY58QKjaIG_JL}S z#i>oQU^Mt$8xmDjq9Fmux*93qJ3Gmop>TtlG4qfULxWH{KOXQg>n-QZkTFl`G@MxV z9&5Az%lWG5muEa6MZceY+(T1Gx{9zr6^LRDhzLhI{FJq+CFc~Ap;E123Hb=z+tYKD zOul6}qQQ#mV^WMRCuXRynhmdbkW>qD-wu{ow~!>xhtgBCOM~VYH;q$rr>Esx-?%k+ zJM967^p(8gA9!dQ5BQKd2;`l}@I4XqH5jE2$><1f(MA3tTrTqxrbi6!V8u(FJF`8bJx*|GpfKOX< zJk`_sWb7eFL~Sxgw)7TH;f>v-WNfn@_afM3Otdkv0-`R6>o4hSgscz|>}z+?_QDc{ zw$u?CrA*6oX*cSs9)&<{Y*f;f=$_bg*k`Iqxn5v(il#47BAsZjyHrbV1E}7)ujSRq zxJL4kStP&zWx4pY(M^BCsDa#P`PL6^Ss&x$h z7XdJ8@)f5IYqBn>38W1pb%qcZPs_(I?!ABfP48nL-I*PiG0(pWE+p$1j{PH>)cuLG>i)s!Pum;F1boJ~o{qYo9<4Y+h{Rb5@uctxq(AIL z*fk>OArdKZc>pgBoU}+iOY}f{Lr7_t?ryzyXq(kU;ig`jRd}94-Jd+`j$Z$QDcSlM z)?&|o?(H5QZSO1b%(_z6ye+%7tZR6tN^r!HRVQna!w%%o`$_=8R%8g9!W^{}-au$+ zn-(39l_Sy;@LaOJ;@LVbvFBhiT}uaPAAFoo_1I4oaNf9ezV@5{`huzTESkmQzj@Yk zZ-4jbQ15NaY6%z4=-4&EqL@huUoMfI!`tT-uVa=m8IlZ(Ji?_>2;0yiszN!hD}*wu zi02y-cjq~BGEL8^?c)snhvnUb_n03aqx@xOKigFTeMKX@|o-5YNYa@jTo)Jl@ld@K*uHw(TN&c)sZ} z7(yA-o1P3%xv9Lnsd6|+UblabE^}koL>H&*$EDIQ^ZS?j@9t0gBH8?nv-QDS&rW*n zdv?4*Ht&u>GT*L3;qYLh?juA5E8{`hT;LMQBQ>%$KAKuIX&39caEqxI6j5nhqEKCAv^|D1 zBgn|IIEj!SglKLI6_ApHQ7zTX|y9^BX zVxjzvO}kjb(JoeNH2;1xZM}KX3m1#$c|LuA=iqop8%+>NF#@7is0mRWN>ZYZcv&IX z1Y?J2iNz7MeS%kNGhWF|9mxUk>a1n0kTpZ{9sx-h&2#vmm9HclRy|kmD}iQQ_3**_ zuGwfbm$Q85KbnE_%UJtu-)J5m92hv)3R_bmE~mi>60kuE?3UV(US<2C4*4#NkzRkxID2d36eH>E_Z90iK7i&I&bBGRqWP&AG_MDmAD+nO@yW54%`>WVK_w#ao^6=1 z8c{vMc!cy>ScN$otNPjjKEqPF{`Da@oenq)oS)K>rmHW?P$wjt@7siCIE&_&XFNJ- zKl6Ripy}t9#0X?EZ&=rcO|JIFMW)9t>0_b57KD<-2(6Q_rw*8X2c8RV>aV2U_x`bp zOjXgEujzA@jhjZ54I6Bd9QuuUB(IkyXe!xM2SuDu4$HO^w5g`e)TFcw-124RMVE%i z>8X#vOw6~`*$5A{OEBs0-M*&I*s^EVt&WOL86LfFQ#n;Bkt*>7dawo({+lT{;ii@nn+0>7_m{25zup*tAaXF)+!s|%Li&nbEj;6;eJ%4gOF`cJ%rz z*mF%dRWl-~4mF_zcq$Y!5tgEyUc+a>yq0v546>&vYY+ltG89qU;Z1aWMA-pSScOnF zQf-XYy+KQJ*TYtiz;x4_X6L05UIT?{FbayopI#LNLFzU`n<=M(lKVdXSC`?hU7AlW zvHMC?m71caf8wHf9nidv(EP&BJ#yE~2F~kxH2?RDraDguU++n&gC^=3jcg-2@`ZsT zE;~`Ojy7_NkPeAJ(kaM@(gs9XS1_5>tsmwwl{qP&qP>Pp)=g7K)aQ`|x)~`k#Juj^ z&?R=-^|CbBs4YlA+IQ%yu5%S7=ps7(F0=iuFu7>^24hLSM}B<=6yi2lnmZUr6T5nrXvuq!hH8;QC7l;-d2C;q4tf>ju- zjz^n2ZKqh184aH3Q!klHC1Lw2Pbv+Lik)kbSwgELS<4({4Uu(VF6)Sf((tgXX|GHU zLn&xzfD&A!_Z20hbOcYBfp;!zj+j-}Mra0VxUbfxBOeot@ko7dv=Z90K3t3W^waM( zv~`X4{%M*0r|+N4?Atv&*giheXA&}oVlX8{00`wYI?}d;r^x{&ZOaBrS4ljlbQENL zCOg4IS<}=nqXH%uyd>?0vy!~xun?%n)md3) z062;+UlpTy`d^?22Iw#>8Siqi!$n9g-+Q)`hOJz_{H4=!@Pk-;#7FXIfA>UN&74kd z;}Q8|nkok-z5zfPVkL$;JM}BX4SA@J+zcI9)^<5uRRq+lIHsKu2%sXG-QZKlBT$xW zK^grtQ)y4ewl%dYJ-umm#rChK10@PHfB9_XHjC!By>#k>NLK%(ujN`Ub+6Y|z_I99 z9>w-BerwbcJCD~6OBsRZ+dP*yy7HJn|01L1@7;vv4YL)Z2YWQ{>nlVguebYXYS~m3 zqB8*o7OyNNr*U*y&hS>6cHCs}Tq5!jvJG?kt4%C*jx=--CZlgx5l{=|FK?<41)~bl z^9+=q{nLY*|IlJs`yo%wkM_3rj&~Q2(zctHG5Bw4n#q6*vP;nwpgCApa{M1kp0{;T zXN72p{%Auw_pzfHE3C;7lvZ&_J1cm@*v~W7vyzedI489z z8XpM$H6OvJtx@f#jS)1XEL;S?(-X$80)mO2qhR>iB(;SabZ6tu-JyB={ zCfUX-62cBq5hsd6$VwH}TmwaUL4p$o3j%>7z&$-IdUn{{n%uhb3K+WxxW#oy206M|63q z;ta4dr;>^*pS(PiyczYg`ZoGkoTdPH>!<)|>FY}dO?P4*A^df|ENL_sW@6qOX<{C7 zS>Tg_SOLWnA-P2Yh6ENsW;qh!801w$UAmHJL`M4`$m)kt7H3F@!EwnlW_12}IHX6@ z^RsZ|`R13Vp;QHiO4C(Ic7Q@=>d?`IXIyg97JOCZ7 zI!@a`3NWgQkrq*%BQgfIG(eAM;0Hi1$O^n_#@i0`tf=a9S*f%0E_!R{H~8Z~KME+` ze9cp5xaUhB9U-x+6D${r_xKWag-E!9qrKA5fH=b$ozJvesn~?XmW#x*ZK>#62AO#K zVpw~-Clkke$d3*eKSf82p)wTGSr4?$Eh{)mKQsUk3=%0QKgZ$Cg%GVAuIR|F$)4+qZ3v z&@Q#W|%&e%!=MAPIIz*)V~hsb?eZZ zFQ@(VE2myFe$`^}yT0rlgUi%yGXY2mxAtvLo-hnc&>LCQ@w@}4j`Ad{!>~dgj)=Ys zQ8-6EvJF|S4!(kWji_lB{5mTmC~wOIk7kM%=jpRPg~}%W8xVsoQ$03*h~std4x#zk zvmMc9sdhrn-{L|D-|WkxMpK=GMTa)gv9eiM!y}3=q#eGfqtW4?5e=MeIL*~Z4H7In z5CqRjvt_BJ21YN7cnR*EXEKW~rfD&?`i}gisqZ9D*Y~yNvEgcK7e9M8s@?W@z#vYx zWc&rJ-QqPIo*e9*?CA|{4Xl4$sxyv31!W>EpxM;bMYDLr79z{KGJ-gwWLZH>fpuJV zJZgDMfr%^%@Jd-!ioCpXdN`e5oBy45rVNOn(-cIi-;Z51S!h|5s9ZNr-O3+0mbXYjVd^C~P@g#W{eXFTVk zD5<#Ki5hWrB&jn&;_3=@wlEwWOPdBR8{}YRb8-}Zm&wwUZ}kR#wJjhW-m)|_K%fsv zhb}MmTQzDys$S4ZX%H+^wN3*rkbdHq+ZyHQxS{LKc<%RHdg60u2G@PD*&2ktH4SyaDhc*8cohsrR;bj*fSaRf`crq@0&UM29(4O$Bs93Wrd^ZK6``aA{fF$f88N z9XC+}&m-;F*ky6pLY}%!VZB{||ZKFY(RS-UZV=PP-3(6{R z(-Dv;BEjPVWrmJsnb7%ju~5E4z_}Js(-39DS`;!AfFpH>2`BCthjgf6Db9O5i+HSb z8$hyhL9yK{T_irSdj8NSY+?0H-#RU<(%QRxso34!-P1!2*L?Pn5*T#YeUz|^u3=C+ zz>x7cXOx1$T9t@Ya8U=RAkOSL(^IVUkCk&?)A3K^{Zn(N2i(496UpscBP5MXxk$d= zm#MD?lFra6ZiPHH4n zhDC9U3v1R@MhvVZuVHy?HN!i+uJ4$&+Xc;eTJ&pGTk>-JI}?PUYAzV!2*T+tzLb1< zMmoo}n;Cto*hYa@h9U+xAp85|oObY91- zW0YxU9V(3QXmD;-)Uhz*NO)ue`4a#gWt66AggUJx790dy9OhtaC9gvrSQ42=Wk8C_ zm#GS>)U>!jWpzn^;%DMwsw42!KfcN1l}=q9@LGX4qIu&8&80!}gxP_bcZOhI%~>Y{rr3erT1+=v-Vu9GFZ+ohXp#oG39b+Hs(&V?9^` znpgRm^9O;<1K+>>+?lZSgTLm0aO>A;F?;c10P889m>FO`cHLr8FzTtQ+ZqVek~%O# zP!A1tOGgC<^N>Zz_AaP@q8f4+=yv&`+umehW`e2nLF>N#-X^Jc*-w6Sd)JotU3x2S z?=AoOYv*q7tyYVk`W;i7^fs*hp8vTY?QL)C5yM>t=C90U$lAOuYusresZNGsnW*9x zMU9AzI5M>y`Ud+`79mBBu;sNty@L*W#bwBN#fq@oo6zR~vm7afqa%s^qCS0lTtK1m zA(O6tkpf#Qp+1N+)e5ZOB$ZPqNdwEr&bCF?@~JQTjw%25cd+*U37SXy`v-c0-KoKn zbe;;2K?c@2&A=&nn+wqYbdqL7n;0*ogE#c?oB-y%Vni(MGdXM6gYkQNYn9rKBjy|t zvtKS9{T=#5I{!@PIr`7F@xyWBZ~KmE8y-pQ6MW^<(n+*c$!dVtltn{RmFSq)26h~I zizJL`JZ)WNWz1=g2%Y0a-L_>>fr#m9Vq8i@AG`}imZybbTg>!Xb9xqyS1gu_jY||S zuU=D4Q2H!!HSf+0ATCTBi8;E$%LV@MnqV)T(Pbh3idKgD z4^hdeza23s_+wtt-hqyq+j$QAjvJ=#ud<3ej(P^-)QX2lpsN`5qAaeUvXu~sGNddJ zniW?}&U^CUeP8()@AlOH3LS*eS77&p+Zfk2ZQ`X zWh0BiHtXVyPRwgVp4CVScm|+zl;3K&lA0rXj2hX}BnEJxB_M53{kS}`V#(k!oSF5T z(Oz*ynih%wdOF<;H+fe&kLR6hDw}t1{eRFwe8D&I001A02m}BC000301^_}s0stET N0{{R300000007o0qmlps literal 0 HcmV?d00001 diff --git a/src/umi_tools/umi_tools_prepareforrsem/test_data/test_dedup.sam b/src/umi_tools/umi_tools_prepareforrsem/test_data/test_dedup.sam new file mode 100644 index 00000000..e9487b8b --- /dev/null +++ b/src/umi_tools/umi_tools_prepareforrsem/test_data/test_dedup.sam @@ -0,0 +1,201 @@ +@HD VN:1.6 SO:coordinate +@SQ SN:MT192765.1 LN:29829 +@RG ID:1 LB:lib1 PL:ILLUMINA SM:test PU:barcode1 +@PG ID:minimap2 PN:minimap2 VN:2.17-r941 CL:minimap2 -ax sr tests/data/fasta/sarscov2/GCA_011545545.1_ASM1154554v1_genomic.fna tests/data/fastq/dna/sarscov2_1.fastq.gz tests/data/fastq/dna/sarscov2_2.fastq.gz +@PG ID:samtools PN:samtools PP:minimap2 VN:1.11 CL:samtools view -Sb sarscov2_aln.sam +@PG ID:samtools.1 PN:samtools PP:samtools VN:1.11 CL:samtools sort -o sarscov2_paired_aln.sorted.bam sarscov2_paired_aln.bam +@PG ID:samtools.2 PN:samtools PP:samtools.1 VN:1.20 CL:samtools view -h test_data/test_dedup.bam +ERR5069949.29668 163 MT192765.1 121 60 150M = 267 235 TATAATTAATAACTAATTACTGTCGTTGACAGGACACGAGTAACTCGTCTATCTTCTGCAGGCTGCTTACGGTTTCTTCCGTGTTGCAGCCGATCATCAGCACATCTAGGTTTTGTCCGGGTGTGACCGAAAGGTAAGATGGAGAGCCTT AAA/E/EEEEEEEEEAEEEEEEEEE/ Date: Fri, 13 Sep 2024 09:10:30 +0200 Subject: [PATCH 09/13] Add Kallisto index (#149) --- CHANGELOG.md | 6 +- src/kallisto/kallisto_index/Kallisto | Bin 0 -> 2439 bytes src/kallisto/kallisto_index/config.vsh.yaml | 94 ++++++++++++++++++ src/kallisto/kallisto_index/help.txt | 21 ++++ src/kallisto/kallisto_index/script.sh | 34 +++++++ src/kallisto/kallisto_index/test.sh | 35 +++++++ .../kallisto_index/test_data/d_list.fasta | 5 + .../test_data/transcriptome.fasta | 23 +++++ 8 files changed, 217 insertions(+), 1 deletion(-) create mode 100644 src/kallisto/kallisto_index/Kallisto create mode 100644 src/kallisto/kallisto_index/config.vsh.yaml create mode 100644 src/kallisto/kallisto_index/help.txt create mode 100644 src/kallisto/kallisto_index/script.sh create mode 100644 src/kallisto/kallisto_index/test.sh create mode 100644 src/kallisto/kallisto_index/test_data/d_list.fasta create mode 100644 src/kallisto/kallisto_index/test_data/transcriptome.fasta diff --git a/CHANGELOG.md b/CHANGELOG.md index d88d0996..846007d8 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -149,7 +149,11 @@ * `sortmerna`: Local sequence alignment tool for mapping, clustering, and filtering rRNA from metatranscriptomic data. (PR #146) -* `fq_subsample`: Sample a subset of records from single or paired FASTQ files (PR #147). +* `fq_subsample`: Sample a subset of records from single or paired FASTQ files (PR #147). + +* `kallisto`: + - `kallisto_index`: Create a kallisto index (PR #149). + ## MINOR CHANGES diff --git a/src/kallisto/kallisto_index/Kallisto b/src/kallisto/kallisto_index/Kallisto new file mode 100644 index 0000000000000000000000000000000000000000..3c7b5b2bff962965d99ca3f9a4a6b6af6da1f3f0 GIT binary patch literal 2439 zcmeHJTTBx{6rFCj(iW7(R4PH@f{iwcTCEBOksY_VO)N&jMkQb@MkU0z5b%pZJItq>6y3i5QJC02jb;7mKSNlWm{Po|xmnK|d)x%cj5cE^Hf zo5Ms=gP>q-=Dx`Y&8Tam%b*(*sAYc)aw*sH`%Qmb#irI!wJd#g^%N{jJ942Y*B;7v zR8myiWp1c$UHLL&=A`AC^o$*g(yPKxE9&k2TL(+#tPc&Aci!%O@N%%KUe}u^mgT!# zi%Y`WO!oXjbAkQw#f0xZP+kZo&F8eXH{r?boldZcY~7r&DBNCpW#D;g<%>%V>88S* zn(fP&_poKg&FeY4ul8ovgjcuEYhCLuEry9tsbgo; zfcbNEMVB@;wf0@jF3-`vqIs3Vln>G5z~*-Q*|O7R4?+L_8<&^;{;|Iv=yw^0x?1Fq zKKH{t-pk+KZ7B%++|bB;H`mVIyPQk@FmSG}@6usnuL2HvP5$1gzh1RfdlA=oM8O*p9pU%UxMoXtZa4MiI;e6lN9)Zj9SsczoL4F-nxW7gjS;lL#ITXLZTe7Tm{6y<^@7*BFcmyOu`fe-N>GMM_MNm8n3s15~-A-du zs>S(M$mE>7gVQsBMnsW@M&}f>b(D#qkcNO}MG=t0Wgt?+7*11VW2Q|{QbnyveDt-4O(8?_>}HQ+|nsHK}JD> zEI}_&#snCR!wU`=ImWmY2!IuUp@YzBuGIbjA?Q;}NCbS6Ej{38{Q(Ed(9~7CHlh~@ z(zu?LfHTHk>o~Hk>ib8~11>`F@%qmr>8X$)4UE=ZAnP=qIJp|ns6M_j(fMdSqjeZP zKmX@E#Gf*HzUVyzWeLinBtr;M7o$H(mPA>GqA1d9hM{`wuLpH{uWG16io*!3&Oq!~ zY>JwOK3Z%+$HT0KEnpYTsK*f41@5@T5O@KlCdndB3}+_eA<9%le@sX@NTS*o#e2qq z{mZk6y~(I%5^AViqJ%~m(IWP&+TTT!n9y(~NA!$eA9;u!LWn;?@K-_t>bR9cmu5}*J8 literal 0 HcmV?d00001 diff --git a/src/kallisto/kallisto_index/config.vsh.yaml b/src/kallisto/kallisto_index/config.vsh.yaml new file mode 100644 index 00000000..2c4f65c7 --- /dev/null +++ b/src/kallisto/kallisto_index/config.vsh.yaml @@ -0,0 +1,94 @@ +name: kallisto_index +namespace: kallisto +description: | + Build a Kallisto index for the transcriptome to use Kallisto in the mapping-based mode. +keywords: [kallisto, index] +links: + homepage: https://pachterlab.github.io/kallisto/about + documentation: https://pachterlab.github.io/kallisto/manual + repository: https://github.com/pachterlab/kallisto + issue_tracker: https://github.com/pachterlab/kallisto/issues +references: + doi: https://doi.org/10.1038/nbt.3519 +license: BSD 2-Clause License + +argument_groups: +- name: "Input" + arguments: + - name: "--input" + type: file + description: | + Path to a FASTA-file containing the transcriptome sequences, either in plain text or + compressed (.gz) format. + required: true + - name: "--d_list" + type: file + description: | + Path to a FASTA-file containing sequences to mask from quantification. + +- name: "Output" + arguments: + - name: "--index" + type: file + direction: output + example: Kallisto_index + +- name: "Options" + arguments: + - name: "--kmer_size" + type: integer + description: | + Kmer length passed to indexing step of pseudoaligners (default: '31'). + example: 31 + - name: "--make_unique" + type: boolean_true + description: | + Replace repeated target names with unique names. + - name: "--aa" + type: boolean_true + description: | + Generate index from a FASTA-file containing amino acid sequences. + - name: "--distiguish" + type: boolean_true + description: | + Generate index where sequences are distinguished by the sequence names. + - name: "--min_size" + alternatives: ["-m"] + type: integer + description: | + Length of minimizers (default: automatically chosen). + - name: "--ec_max_size" + alternatives: ["-e"] + type: integer + description: | + Maximum number of targets in an equivalence class (default: no maximum). + - name: "--tmp" + alternatives: ["-T"] + type: string + description: | + Path to a directory for temporary files. + example: "tmp" + +resources: + - type: bash_script + path: script.sh + +test_resources: + - type: bash_script + path: test.sh + - path: test_data + +engines: + - type: docker + image: ubuntu:22.04 + setup: + - type: docker + run: | + apt-get update && \ + apt-get install -y --no-install-recommends wget && \ + wget --no-check-certificate https://github.com/pachterlab/kallisto/releases/download/v0.50.1/kallisto_linux-v0.50.1.tar.gz && \ + tar -xzf kallisto_linux-v0.50.1.tar.gz && \ + mv kallisto/kallisto /usr/local/bin/ +runners: + - type: executable + - type: nextflow diff --git a/src/kallisto/kallisto_index/help.txt b/src/kallisto/kallisto_index/help.txt new file mode 100644 index 00000000..28778ac0 --- /dev/null +++ b/src/kallisto/kallisto_index/help.txt @@ -0,0 +1,21 @@ +``` +kallisto index +``` +kallisto 0.50.1 +Builds a kallisto index + +Usage: kallisto index [arguments] FASTA-files + +Required argument: +-i, --index=STRING Filename for the kallisto index to be constructed + +Optional argument: +-k, --kmer-size=INT k-mer (odd) length (default: 31, max value: 31) +-t, --threads=INT Number of threads to use (default: 1) +-d, --d-list=STRING Path to a FASTA-file containing sequences to mask from quantification + --make-unique Replace repeated target names with unique names + --aa Generate index from a FASTA-file containing amino acid sequences + --distinguish Generate index where sequences are distinguished by the sequence name +-T, --tmp=STRING Temporary directory (default: tmp) +-m, --min-size=INT Length of minimizers (default: automatically chosen) +-e, --ec-max-size=INT Maximum number of targets in an equivalence class (default: no maximum) diff --git a/src/kallisto/kallisto_index/script.sh b/src/kallisto/kallisto_index/script.sh new file mode 100644 index 00000000..59a5d3de --- /dev/null +++ b/src/kallisto/kallisto_index/script.sh @@ -0,0 +1,34 @@ +#!/bin/bash + +## VIASH START +## VIASH END + +set -eo pipefail + +unset_if_false=( par_make_unique par_aa par_distinguish ) + +for var in "${unset_if_false[@]}"; do + temp_var="${!var}" + [[ "$temp_var" == "false" ]] && unset $var +done + +if [ -n "$par_kmer_size" ]; then + if [[ "$par_kmer_size" -lt 1 || "$par_kmer_size" -gt 31 || $(( par_kmer_size % 2 )) -eq 0 ]]; then + echo "Error: Kmer size must be an odd number between 1 and 31." + exit 1 + fi +fi + +kallisto index \ + -i "${par_index}" \ + ${par_kmer_size:+--kmer-size "${par_kmer_size}"} \ + ${par_make_unique:+--make-unique} \ + ${par_aa:+--aa} \ + ${par_distinguish:+--distinguish} \ + ${par_min_size:+--min-size "${par_min_size}"} \ + ${par_ec_max_size:+--ec-max-size "${par_ec_max_size}"} \ + ${par_d_list:+--d-list "${par_d_list}"} \ + ${meta_cpus:+--cpu "${meta_cpus}"} \ + ${par_tmp:+--tmp "${par_tmp}"} \ + "${par_input}" + diff --git a/src/kallisto/kallisto_index/test.sh b/src/kallisto/kallisto_index/test.sh new file mode 100644 index 00000000..2646dcd8 --- /dev/null +++ b/src/kallisto/kallisto_index/test.sh @@ -0,0 +1,35 @@ +#!/bin/bash + +echo ">>>Test 1: Testing $meta_functionality_name with non-default k-mer size" + +"$meta_executable" \ + --input "$meta_resources_dir/test_data/transcriptome.fasta" \ + --index Kallisto \ + --kmer_size 21 + + +echo ">>> Checking whether output exists and is correct" +[ ! -f "Kallisto" ] && echo "Kallisto index does not exist!" && exit 1 +[ ! -s "Kallisto" ] && echo "Kallisto index is empty!" && exit 1 + +kallisto inspect Kallisto 2> test.txt +grep "number of k-mers: 989" test.txt || { echo "The content of the index seems to be incorrect." && exit 1; } + +################################################################################ + +echo ">>>Test 2: Testing $meta_functionality_name with d_list argument" + +"$meta_executable" \ + --input "$meta_resources_dir/test_data/transcriptome.fasta" \ + --index Kallisto \ + --d_list "$meta_resources_dir/test_data/d_list.fasta" + +echo ">>> Checking whether output exists and is correct" +[ ! -f "Kallisto" ] && echo "Kallisto index does not exist!" && exit 1 +[ ! -s "Kallisto" ] && echo "Kallisto index is empty!" && exit 1 + +kallisto inspect Kallisto 2> test.txt +grep "number of k-mers: 959" test.txt || { echo "The content of the index seems to be incorrect." && exit 1; } + +echo "All tests succeeded!" +exit 0 diff --git a/src/kallisto/kallisto_index/test_data/d_list.fasta b/src/kallisto/kallisto_index/test_data/d_list.fasta new file mode 100644 index 00000000..ad5e05bf --- /dev/null +++ b/src/kallisto/kallisto_index/test_data/d_list.fasta @@ -0,0 +1,5 @@ +>YAL067W-A CDS=1-228 +ATGCCAATTATAGGGGTGCCGAGGTGCCTTATAAAACCCTTTTCTGTGCCTGTGACATTTCCTTTTTCGG +TCAAAAAGAATATCCGAATTTTAGATTTGGACCCTCGTACAGAAGCTTATTGTCTAAGCCTGAATTCAGT +CTGCTTTAAACGGCTTCCGCGGAGGAAATATTTCCATCTCTTGAATTCGTACAACATTAAACGTGTGTTG +GGAGTCGTATACTGTTAG diff --git a/src/kallisto/kallisto_index/test_data/transcriptome.fasta b/src/kallisto/kallisto_index/test_data/transcriptome.fasta new file mode 100644 index 00000000..94c06163 --- /dev/null +++ b/src/kallisto/kallisto_index/test_data/transcriptome.fasta @@ -0,0 +1,23 @@ +>YAL069W CDS=1-315 +ATGATCGTAAATAACACACACGTGCTTACCCTACCACTTTATACCACCACCACATGCCATACTCACCCTC +ACTTGTATACTGATTTTACGTACGCACACGGATGCTACAGTATATACCATCTCAAACTTACCCTACTCTC +AGATTCCACTTCACTCCATGGCCCATCTCTCACTGAATCAGTACCAAATGCACTCACATCATTATGCACG +GCACTTGCCTCAGCGGTCTATACCCTGTGCCATTTACCCATAACGCCCATCATTATCCACATTTTGATAT +CTATATCTCATTCGGCGGTCCCAAATATTGTATAA +>YAL068W-A CDS=1-255 +ATGCACGGCACTTGCCTCAGCGGTCTATACCCTGTGCCATTTACCCATAACGCCCATCATTATCCACATT +TTGATATCTATATCTCATTCGGCGGTCCCAAATATTGTATAACTGCCCTTAATACATACGTTATACCACT +TTTGCACCATATACTTACCACTCCATTTATATACACTTATGTCAATATTACAGAAAAATCCCCACAAAAA +TCACCTAAACATAAAAATATTCTACTTTTCAACAATAATACATAA +>YAL068C CDS=1-363 +ATGGTCAAATTAACTTCAATCGCCGCTGGTGTCGCTGCCATCGCTGCTACTGCTTCTGCAACCACCACTC +TAGCTCAATCTGACGAAAGAGTCAACTTGGTGGAATTGGGTGTCTACGTCTCTGATATCAGAGCTCACTT +AGCCCAATACTACATGTTCCAAGCCGCCCACCCAACTGAAACCTACCCAGTCGAAGTTGCTGAAGCCGTT +TTCAACTACGGTGACTTCACCACCATGTTGACCGGTATTGCTCCAGACCAAGTGACCAGAATGATCACCG +GTGTTCCATGGTACTCCAGCAGATTAAAGCCAGCCATCTCCAGTGCTCTATCCAAGGACGGTATCTACAC +TATCGCAAACTAG +>YAL067W-A CDS=1-228 +ATGCCAATTATAGGGGTGCCGAGGTGCCTTATAAAACCCTTTTCTGTGCCTGTGACATTTCCTTTTTCGG +TCAAAAAGAATATCCGAATTTTAGATTTGGACCCTCGTACAGAAGCTTATTGTCTAAGCCTGAATTCAGT +CTGCTTTAAACGGCTTCCGCGGAGGAAATATTTCCATCTCTTGAATTCGTACAACATTAAACGTGTGTTG +GGAGTCGTATACTGTTAG \ No newline at end of file From fe56ee7c53ca30f25aa31cb9a025e17cd75b636e Mon Sep 17 00:00:00 2001 From: Sai Nirmayi Yasa <92786623+sainirmayi@users.noreply.github.com> Date: Fri, 13 Sep 2024 09:15:19 +0200 Subject: [PATCH 10/13] change output quant file to an optional argument (#151) --- src/salmon/salmon_quant/config.vsh.yaml | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/src/salmon/salmon_quant/config.vsh.yaml b/src/salmon/salmon_quant/config.vsh.yaml index 1f96f0c9..5fa3d48f 100644 --- a/src/salmon/salmon_quant/config.vsh.yaml +++ b/src/salmon/salmon_quant/config.vsh.yaml @@ -24,7 +24,7 @@ argument_groups: description: | Format string describing the library. The library type string consists of three parts: - 1. Relative orientation of the reads: This part is only provided if the library is paired-end, THe possible options are + 1. Relative orientation of the reads: This part is only provided if the library is paired-end, The possible options are I = inward O = outward M = matching @@ -118,7 +118,7 @@ argument_groups: direction: output description: | Salmon quantification file. - required: true + required: false example: quant.sf - name: Basic options @@ -327,7 +327,7 @@ argument_groups: If this option is provided, then the selective-alignment results will be written out in SAM-compatible format. By default, output will be directed to stdout, but an alternative file name can be provided instead. - name: --mapping_sam type: file - description: Path to file that should output the selective-alignment results in SAM-compatible format. THis option must be provided while using --write_mappings + description: Path to file that should output the selective-alignment results in SAM-compatible format. This option must be provided while using --write_mappings required: false direction: output example: mappings.sam From 124d50ce5318b612e4a1e4da1be705523cd6eab7 Mon Sep 17 00:00:00 2001 From: Robrecht Cannoodt Date: Mon, 16 Sep 2024 09:50:24 +0200 Subject: [PATCH 11/13] update changelog for viash 0.2.0 --- CHANGELOG.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 846007d8..1f733203 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -1,4 +1,4 @@ -# biobox x.x.x +# biobox 0.2.0 ## BREAKING CHANGES From c3b40a15350235b00144f9f6735090d45bc24963 Mon Sep 17 00:00:00 2001 From: Robrecht Cannoodt Date: Mon, 16 Sep 2024 09:53:23 +0200 Subject: [PATCH 12/13] update to viash 0.9.0 --- CHANGELOG.md | 6 ++++++ _viash.yaml | 2 +- 2 files changed, 7 insertions(+), 1 deletion(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 1f733203..0370a216 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -1,3 +1,9 @@ +# biobox x.x.x + +## MINOR CHANGES + +* Upgrade to Viash 0.9.0. + # biobox 0.2.0 ## BREAKING CHANGES diff --git a/_viash.yaml b/_viash.yaml index ab4f3828..d08f2fb2 100644 --- a/_viash.yaml +++ b/_viash.yaml @@ -7,7 +7,7 @@ links: issue_tracker: https://github.com/viash-hub/biobox/issues repository: https://github.com/viash-hub/biobox -viash_version: 0.9.0-RC7 +viash_version: 0.9.0 config_mods: | .requirements.commands := ['ps'] From 38f635ad57ef05550bba3a0864c81627f84f5ad2 Mon Sep 17 00:00:00 2001 From: Leila011 Date: Mon, 16 Sep 2024 10:44:10 +0200 Subject: [PATCH 13/13] Add agat convert genscan2gff (#100) * add config * add help * add test data and expected output adn the script to obtain them * add running script * add test script * update changelog * cleanup * fix tests * format description * remove unused argument --inflate-off * update --config description * add requirements * create temporary directory and clean up on exit * add GENSCAN in keywords * add set -e to test * fix create temporary directory * add set -eo pipefail to test * add set -eo pipefail to script * fix create temporary directory * update --config description * cleanup changelog * cleanup changelog * Update deprecated variable --------- Co-authored-by: Robrecht Cannoodt --- CHANGELOG.md | 5 + .../agat_convert_genscan2gff/config.vsh.yaml | 95 +++++++++++++ src/agat/agat_convert_genscan2gff/help.txt | 94 +++++++++++++ src/agat/agat_convert_genscan2gff/script.sh | 21 +++ src/agat/agat_convert_genscan2gff/test.sh | 35 +++++ .../test_data/agat_convert_genscan2gff_1.gff | 25 ++++ .../test_data/script.sh | 11 ++ .../test_data/test.genscan | 127 ++++++++++++++++++ 8 files changed, 413 insertions(+) create mode 100644 src/agat/agat_convert_genscan2gff/config.vsh.yaml create mode 100644 src/agat/agat_convert_genscan2gff/help.txt create mode 100644 src/agat/agat_convert_genscan2gff/script.sh create mode 100644 src/agat/agat_convert_genscan2gff/test.sh create mode 100644 src/agat/agat_convert_genscan2gff/test_data/agat_convert_genscan2gff_1.gff create mode 100755 src/agat/agat_convert_genscan2gff/test_data/script.sh create mode 100644 src/agat/agat_convert_genscan2gff/test_data/test.genscan diff --git a/CHANGELOG.md b/CHANGELOG.md index 0370a216..b31f43d9 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -1,5 +1,10 @@ # biobox x.x.x +## NEW FUNCTIONALITY + +* `agat`: + - `agat/agat_convert_genscan2gff`: convert a genscan file into a GFF file (PR #100). + ## MINOR CHANGES * Upgrade to Viash 0.9.0. diff --git a/src/agat/agat_convert_genscan2gff/config.vsh.yaml b/src/agat/agat_convert_genscan2gff/config.vsh.yaml new file mode 100644 index 00000000..2adce1da --- /dev/null +++ b/src/agat/agat_convert_genscan2gff/config.vsh.yaml @@ -0,0 +1,95 @@ +name: agat_convert_genscan2gff +namespace: agat +description: | + The script takes a GENSCAN file as input, and will translate it in gff + format. The GENSCAN format is described [here](http://genome.crg.es/courses/Bioinformatics2003_genefinding/results/genscan.html). + + **Known problem** + + You must have submited only DNA sequence, without any header!! Indeed the tool expects only DNA + sequences and does not crash/warn if an header is submited along the + sequence. e.g If you have an header ">seq" s-e-q are seen as the 3 first + nucleotides of the sequence. Then all prediction location are shifted + accordingly. (checked only on the [online version](http://argonaute.mit.edu/GENSCAN.html). + I don't know if there is the same problem elsewhere.) +keywords: [gene annotations, GFF conversion, GENSCAN] +links: + homepage: https://github.com/NBISweden/AGAT + documentation: https://agat.readthedocs.io/en/latest/tools/agat_convert_genscan2gff.html + issue_tracker: https://github.com/NBISweden/AGAT/issues + repository: https://github.com/NBISweden/AGAT +references: + doi: 10.5281/zenodo.3552717 +license: GPL-3.0 +requirements: + - commands: [agat] +authors: + - __merge__: /src/_authors/leila_paquay.yaml + roles: [ author, maintainer ] + +argument_groups: + - name: Inputs + arguments: + - name: --genscan + alternatives: [-g] + description: Input genscan bed file that will be converted. + type: file + required: true + direction: input + - name: Outputs + arguments: + - name: --output + alternatives: [-o, --out, --outfile, --gff] + description: Output GFF file. If no output file is specified, the output will be written to STDOUT. + type: file + direction: output + required: true + example: output.gff + - name: Arguments + arguments: + - name: --source + description: | + The source informs about the tool used to produce the data and is stored in 2nd field of a gff file. Example: Stringtie, Maker, Augustus, etc. [default: data] + type: string + required: false + example: Stringtie + - name: --primary_tag + description: | + The primary_tag corresponds to the data type and is stored in 3rd field of a gff file. Example: gene, mRNA, CDS, etc. [default: gene] + type: string + required: false + example: gene + - name: --inflate_type + description: | + Feature type (3rd column in gff) created when inflate parameter activated [default: exon]. + type: string + required: false + example: exon + - name: --verbose + description: add verbosity + type: boolean_true + - name: --config + alternatives: [-c] + description: | + AGAT config file. By default AGAT takes the original agat_config.yaml shipped with AGAT. The `--config` option gives you the possibility to use your own AGAT config file (located elsewhere or named differently). + type: file + required: false + example: custom_agat_config.yaml +resources: + - type: bash_script + path: script.sh +test_resources: + - type: bash_script + path: test.sh + - type: file + path: test_data +engines: + - type: docker + image: quay.io/biocontainers/agat:1.4.0--pl5321hdfd78af_0 + setup: + - type: docker + run: | + agat --version | sed 's/AGAT\s\(.*\)/agat: "\1"/' > /var/software_versions.txt +runners: + - type: executable + - type: nextflow \ No newline at end of file diff --git a/src/agat/agat_convert_genscan2gff/help.txt b/src/agat/agat_convert_genscan2gff/help.txt new file mode 100644 index 00000000..8a9e9f52 --- /dev/null +++ b/src/agat/agat_convert_genscan2gff/help.txt @@ -0,0 +1,94 @@ +```sh +agat_convert_genscan2gff.pl --help +``` + ------------------------------------------------------------------------------ +| Another GFF Analysis Toolkit (AGAT) - Version: v1.4.0 | +| https://github.com/NBISweden/AGAT | +| National Bioinformatics Infrastructure Sweden (NBIS) - www.nbis.se | + ------------------------------------------------------------------------------ + +Name: + agat_convert_genscan2gff.pl + +Description: + The script takes a genscan file as input, and will translate it in gff + format. The genscan format is described here: + http://genome.crg.es/courses/Bioinformatics2003_genefinding/results/gens + can.html /!\ vvv Known problem vvv /!\ You must have submited only DNA + sequence, wihtout any header!! Indeed the tool expects only DNA + sequences and does not crash/warn if an header is submited along the + sequence. e.g If you have an header ">seq" s-e-q are seen as the 3 first + nucleotides of the sequence. Then all prediction location are shifted + accordingly. (checked only on the online version + http://argonaute.mit.edu/GENSCAN.html. I don't know if there is the same + pronlem elsewhere.) /!\ ^^^ Known problem ^^^^ /!\ + +Usage: + agat_convert_genscan2gff.pl --genscan infile.bed [ -o outfile ] + agat_convert_genscan2gff.pl -h + +Options: + --genscan or -g + Input genscan bed file that will be convert. + + --source + The source informs about the tool used to produce the data and + is stored in 2nd field of a gff file. Example: + Stringtie,Maker,Augustus,etc. [default: data] + + --primary_tag + The primary_tag corresponf to the data type and is stored in 3rd + field of a gff file. Example: gene,mRNA,CDS,etc. [default: gene] + + --inflate_off + By default we inflate the block fields (blockCount, blockSizes, + blockStarts) to create subfeatures of the main feature + (primary_tag). Type of subfeature created based on the + inflate_type parameter. If you don't want this inflating + behaviour you can deactivate it by using the option + --inflate_off. + + --inflate_type + Feature type (3rd column in gff) created when inflate parameter + activated [default: exon]. + + --verbose + add verbosity + + -o , --output , --out , --outfile or --gff + Output GFF file. If no output file is specified, the output will + be written to STDOUT. + + -c or --config + String - Input agat config file. By default AGAT takes as input + agat_config.yaml file from the working directory if any, + otherwise it takes the orignal agat_config.yaml shipped with + AGAT. To get the agat_config.yaml locally type: "agat config + --expose". The --config option gives you the possibility to use + your own AGAT config file (located elsewhere or named + differently). + + -h or --help + Display this helpful text. + +Feedback: + Did you find a bug?: + Do not hesitate to report bugs to help us keep track of the bugs and + their resolution. Please use the GitHub issue tracking system available + at this address: + + https://github.com/NBISweden/AGAT/issues + + Ensure that the bug was not already reported by searching under Issues. + If you're unable to find an (open) issue addressing the problem, open a new one. + Try as much as possible to include in the issue when relevant: + - a clear description, + - as much relevant information as possible, + - the command used, + - a data sample, + - an explanation of the expected behaviour that is not occurring. + + Do you want to contribute?: + You are very welcome, visit this address for the Contributing + guidelines: + https://github.com/NBISweden/AGAT/blob/master/CONTRIBUTING.md diff --git a/src/agat/agat_convert_genscan2gff/script.sh b/src/agat/agat_convert_genscan2gff/script.sh new file mode 100644 index 00000000..38afb084 --- /dev/null +++ b/src/agat/agat_convert_genscan2gff/script.sh @@ -0,0 +1,21 @@ +#!/bin/bash + +set -eo pipefail + +## VIASH START +## VIASH END + +# unset flags +[[ "$par_inflate_off" == "true" ]] && unset par_inflate_off +[[ "$par_verbose" == "false" ]] && unset par_verbose + +# run agat_convert_genscan2gff +agat_convert_genscan2gff.pl \ + --genscan "$par_genscan" \ + --output "$par_output" \ + ${par_source:+--source "${par_source}"} \ + ${par_primary_tag:+--primary_tag "${par_primary_tag}"} \ + ${par_inflate_off:+--inflate_off} \ + ${par_inflate_type:+--inflate_type "${par_inflate_type}"} \ + ${par_verbose:+--verbose} \ + ${par_config:+--config "${par_config}"} \ No newline at end of file diff --git a/src/agat/agat_convert_genscan2gff/test.sh b/src/agat/agat_convert_genscan2gff/test.sh new file mode 100644 index 00000000..b666dacf --- /dev/null +++ b/src/agat/agat_convert_genscan2gff/test.sh @@ -0,0 +1,35 @@ +#!/bin/bash + +set -eo pipefail + +## VIASH START +## VIASH END + +test_dir="${meta_resources_dir}/test_data" + +# create temporary directory and clean up on exit +TMPDIR=$(mktemp -d "$meta_temp_dir/$meta_name-XXXXXX") +function clean_up { + [[ -d "$TMPDIR" ]] && rm -rf "$TMPDIR" +} +trap clean_up EXIT + +echo "> Run $meta_name with test data" +"$meta_executable" \ + --genscan "$test_dir/test.genscan" \ + --output "$TMPDIR/output.gff" + +echo ">> Checking output" +[ ! -f "$TMPDIR/output.gff" ] && echo "Output file output.gff does not exist" && exit 1 + +echo ">> Check if output is empty" +[ ! -s "$TMPDIR/output.gff" ] && echo "Output file output.gff is empty" && exit 1 + +echo ">> Check if output matches expected output" +diff "$TMPDIR/output.gff" "$test_dir/agat_convert_genscan2gff_1.gff" +if [ $? -ne 0 ]; then + echo "Output file output.gff does not match expected output" + exit 1 +fi + +echo "> Test successful" diff --git a/src/agat/agat_convert_genscan2gff/test_data/agat_convert_genscan2gff_1.gff b/src/agat/agat_convert_genscan2gff/test_data/agat_convert_genscan2gff_1.gff new file mode 100644 index 00000000..695fb46c --- /dev/null +++ b/src/agat/agat_convert_genscan2gff/test_data/agat_convert_genscan2gff_1.gff @@ -0,0 +1,25 @@ +##gff-version 3 +unknown genscan gene 2223 4605 75.25 + . ID=gene_1 +unknown genscan mRNA 2223 4605 75.25 + . ID=mrna_1;Parent=gene_1 +unknown genscan exon 2223 3020 75.25 + . ID=exon_1;Parent=mrna_1 +unknown genscan exon 4249 4605 13.03 + . ID=exon_2;Parent=mrna_1 +unknown genscan CDS 2223 3020 75.25 + 0 ID=cds_1;Parent=mrna_1 +unknown genscan CDS 4249 4605 13.03 + 0 ID=cds_2;Parent=mrna_1 +unknown genscan gene 6829 8789 20.06 - . ID=gene_2 +unknown genscan mRNA 6829 8789 20.06 - . ID=mrna_2;Parent=gene_2 +unknown genscan exon 6829 7297 20.06 - . ID=exon_3;Parent=mrna_2 +unknown genscan exon 7730 7888 12.78 - . ID=exon_4;Parent=mrna_2 +unknown genscan exon 8029 8185 7.45 - . ID=exon_5;Parent=mrna_2 +unknown genscan exon 8278 8546 17.45 - . ID=exon_6;Parent=mrna_2 +unknown genscan exon 8647 8789 18.65 - . ID=exon_7;Parent=mrna_2 +unknown genscan CDS 6829 7297 20.06 - 1 ID=cds_3;Parent=mrna_2 +unknown genscan CDS 7730 7888 12.78 - 1 ID=cds_4;Parent=mrna_2 +unknown genscan CDS 8029 8185 7.45 - 2 ID=cds_5;Parent=mrna_2 +unknown genscan CDS 8278 8546 17.45 - 1 ID=cds_6;Parent=mrna_2 +unknown genscan CDS 8647 8789 18.65 - 0 ID=cds_7;Parent=mrna_2 +unknown genscan gene 10209 11924 16.18 + . ID=gene_3 +unknown genscan mRNA 10209 11924 16.18 + . ID=mrna_3;Parent=gene_3 +unknown genscan exon 10209 11313 16.18 + . ID=exon_8;Parent=mrna_3 +unknown genscan exon 11850 11924 3.27 + . ID=exon_9;Parent=mrna_3 +unknown genscan CDS 10209 11313 16.18 + 0 ID=cds_8;Parent=mrna_3 +unknown genscan CDS 11850 11924 3.27 + 2 ID=cds_9;Parent=mrna_3 diff --git a/src/agat/agat_convert_genscan2gff/test_data/script.sh b/src/agat/agat_convert_genscan2gff/test_data/script.sh new file mode 100755 index 00000000..c1693653 --- /dev/null +++ b/src/agat/agat_convert_genscan2gff/test_data/script.sh @@ -0,0 +1,11 @@ +#!/bin/bash + +# clone repo +if [ ! -d /tmp/agat_source ]; then + git clone --depth 1 --single-branch --branch master https://github.com/NBISweden/AGAT /tmp/agat_source +fi + +# copy test data +cp -r /tmp/agat_source/t/scripts_output/in/test.genscan src/agat/agat_convert_genscan2gff/test_data/test.genscan +cp -r /tmp/agat_source/t/scripts_output/out/agat_convert_genscan2gff_1.gff src/agat/agat_convert_genscan2gff/test_data/agat_convert_genscan2gff_1.gff + diff --git a/src/agat/agat_convert_genscan2gff/test_data/test.genscan b/src/agat/agat_convert_genscan2gff/test_data/test.genscan new file mode 100644 index 00000000..a88037db --- /dev/null +++ b/src/agat/agat_convert_genscan2gff/test_data/test.genscan @@ -0,0 +1,127 @@ +GENSCAN 1.0 Date run: 7-Mar-120 Time: 14:46:49 + + + +Sequence /tmp/03_07_20-14:46:49.fasta : 12217 bp : 42.83% C+G : Isochore 1 ( 0 - 43 C+G%) + + + +Parameter matrix: HumanIso.smat + + + +Predicted genes/exons: + + + +Gn.Ex Type S .Begin ...End .Len Fr Ph I/Ac Do/T CodRg P.... Tscr.. + +----- ---- - ------ ------ ---- -- -- ---- ---- ----- ----- ------ + + + + 1.01 Init + 2223 3020 798 2 0 55 2 924 0.940 75.25 + + 1.02 Term + 4249 4605 357 0 0 26 38 307 0.976 13.03 + + 1.03 PlyA + 4711 4716 6 -0.45 + + + + 2.06 PlyA - 4852 4847 6 -0.45 + + 2.05 Term - 7297 6829 469 0 1 13 42 387 0.281 20.06 + + 2.04 Intr - 7888 7730 159 0 0 85 93 144 0.998 12.78 + + 2.03 Intr - 8185 8029 157 2 1 65 60 144 0.787 7.45 + + 2.02 Intr - 8546 8278 269 1 2 36 65 287 0.946 17.45 + + 2.01 Init - 8789 8647 143 2 2 94 96 176 0.550 18.65 + + 2.00 Prom - 9720 9681 40 -6.55 + + + + 3.00 Prom + 10160 10199 40 -11.84 + + 3.01 Init + 10209 11313 1105 2 1 66 57 269 0.512 16.18 + + 3.02 Intr + 11850 11924 75 1 0 80 86 57 0.507 3.27 + + + +Suboptimal exons with probability > 1.000 + + + +Exnum Type S .Begin ...End .Len Fr Ph B/Ac Do/T CodRg P.... Tscr.. + +----- ---- - ------ ------ ---- -- -- ---- ---- ----- ----- ------ + + + +NO EXONS FOUND AT GIVEN PROBABILITY CUTOFF + + + + + +Predicted peptide sequence(s): + + + + + +>/tmp/03_03_20-07:33:11.fasta|GENSCAN_predicted_peptide_1|384_aa + +MSSKNKVSKQDIDSIVESLMKKQKSYFEPRLAQIQQVGMENVQKLSAIHAELALLTASIS + +TVKSDVDKLKCKVENNFSAIDGHDQAFGELELKMADMEDRSRRCNIRVIGLKERLEGFNA + +IQYLTHSLPKWFPALADVPVEVMSAHRIYSDAKRGDNRTLIFNVLRYTTRQAILRAAKKD + +PLSVDDRKVRFSPDYSNFTVKRCQAFHQAKDAARNKCLDFFLLYPATLKIKEGAQYRSFT + +SPKEAEDYVNSAASNHAATPASPRQHGTILTIYRRIHSLYDGERARKIQLLEQAASVALT + +GDNWTSVRNDNYLGVTAHFIDNVWKLRCFALEVKKKKKHSRHTAEDCAEEFIDVSNRWEI + +NGKLTTLGTDSALIMLAAARLLPF + + + +>/tmp/03_03_20-07:33:11.fasta|GENSCAN_predicted_peptide_2|398_aa + +MASTMPSSSSTEDEENTPECLNKDHYHFHHYTMEYIQDKPTNVARVGGFTDKKSIAKVER + +CLARERQEATEDHEAIPSTSGATSLTKKLRSRSGLPIAGSGLVLPALCIICQKKEKFINR + +AGKRQRDPLSKAETLTVGQLQKAAELKDDQSILLHIKDKDCVALEVQYHKGCYNQYTRFM + +TRPEKPEKEQNEPTFDVGYKILCERIIRQRLLVNQEVLRMGQLRMAFIELVKANEGLDAS + +NYSIKNLERSRRADAGSQRIQIFDPDQRTPTQWKKFLSEGTKKEALAEFLYVAWKNADLT + +IVGKNLCLYIAHTNQCHCVTVKEGVQSVRVVEDLLLFLHAQHAAREHKAVIIKSSDTDVA + +VIAVSVQTDLPCSLYVFTGTGNRTRIIDITKVSSANKI + + + +>/tmp/03_03_20-07:33:11.fasta|GENSCAN_predicted_peptide_3|394_aa + +MQRGRAAGINGIPPEFYVAFWEQLSPFFLHMINFSIEKGGFLRDVNTALISLLMKKDKNP + +TDCSSYRPLSLLNSDVKIFAKLLPLRLEPHMPELVSSDQTGFIKSRTAADNIRRLLHIIA + +AAPGCETPMSVLSLDAMKAFDRLEWSFLWSVLEAMGFISTFIGMVKVLYSNPSARVLTGQ + +TFSSLFPVSRSSRQGCPLSPALFVLSLEPLAQAVRLSNLVLPICICDTQHKLSLFADDVI + +VFLEHPTQSLPHFLSICEEFRKLSGFKMNWSKSALMHLNDNARKSVTPVNIPLVGQLKYL + +GIEVFPSLNQIVKHNYSLAFTNVLKDMDRWISLPMSIQARISIIKMNGLPRIHFVSSMVP + +LPPPSDYWIKISAQGVRCPLAKPFTHSPYSKTKX