From 923a6da3898a832df96a0e17c8a3b74c2806d939 Mon Sep 17 00:00:00 2001 From: Theodoro Gasperin Terra Camargo <98555209+tgaspe@users.noreply.github.com> Date: Wed, 14 Aug 2024 22:55:45 +0200 Subject: [PATCH 01/28] Bug Fixed (#136) --- src/bedtools/bedtools_intersect/script.sh | 1 + 1 file changed, 1 insertion(+) diff --git a/src/bedtools/bedtools_intersect/script.sh b/src/bedtools/bedtools_intersect/script.sh index 04a8d854..3a28ba57 100644 --- a/src/bedtools/bedtools_intersect/script.sh +++ b/src/bedtools/bedtools_intersect/script.sh @@ -24,6 +24,7 @@ unset_if_false=( par_sortout par_bed par_no_buffer_output + par_header ) for par in ${unset_if_false[@]}; do From 766ab6c9c3059004c7c3f205621909b2d8b0b26d Mon Sep 17 00:00:00 2001 From: Toni Verbeiren Date: Wed, 21 Aug 2024 13:32:48 +0200 Subject: [PATCH 02/28] Qualimap rnaseq (#74) * first version * complete script for qualimap * add escaping character before leading hashtag (#50) * add escaping character before leading hashtag * update changelog * Update CHANGELOG.md Co-authored-by: Robrecht Cannoodt * replace escaping \ by \\ --------- Co-authored-by: Robrecht Cannoodt * Samtools collate (#49) * initial commit dedup * Revert "initial commit dedup" This reverts commit 38f586bec0ac9e4312b016e29c3aa0bd53f292b2. * Initial commit, whole component is functional * Update viash (#51) * update viash * update readme * update changelog * update changelog * fix incorrect heading detection * update again * clean up readme * Samtools view (#48) * initial commit dedup * Revert "initial commit dedup" This reverts commit 38f586bec0ac9e4312b016e29c3aa0bd53f292b2. 
* initial version with a few tests, script, and config file * update changelog, add one test * add a 4th test, fix option names in the script * Fix name of component in config * remove option named with a number * add must_exist to input file argument * removed "default: null" from one of the arguments in config * remove utf8 characters from config * Update CHANGELOG.md --------- Co-authored-by: Robrecht Cannoodt * Samtools fastq (#52) * initial commit dedup * Revert "initial commit dedup" This reverts commit 38f586bec0ac9e4312b016e29c3aa0bd53f292b2. * Initial commit, config, script, help and test_data * Update changelog, add tests, fix argument naming errors, add test data * update changelog, remove gffread namespace field --------- Co-authored-by: Robrecht Cannoodt * format URL in the description (#55) * format URL in the description * update changelog * Change name in _viash.yaml (#60) * Update operational code (#63) * update readme * switch ci to toolbox * update to viash 0.9.0-RC6 * edit keywords * fix version * update biobox * cutadapt (#7) * First commit, clone of cutadapt in htrnaseq + help.txt * Add config * Don't allow multiple: true when providing a FASTA file with adapters * First version of script * Updates and fixes - se/pe * Add tests and fix --json argument * Add software version * Better consistency in using snake_case * Update src/cutadapt/config.vsh.yaml Co-authored-by: Robrecht Cannoodt * Update src/cutadapt/config.vsh.yaml Co-authored-by: Robrecht Cannoodt * Update src/cutadapt/config.vsh.yaml Co-authored-by: Robrecht Cannoodt * Specify --input and --input_r2 as separate arguments * Avoid specifying default arg values * Add more information to `--minimum_length` and `maximum_length` * Add --cpus by means of $meta_cpus and set proper default * Allow multiple for adapters/fasta and add test * change multiple_sep to ';' * add example * simplify code with a helper function * create directories in test * use a different output extension if --fasta 
is provided * decrease code duplication by separating optional outputs from paired/unpaired output arguments * write custom tests for cutadapt * fix _r2 arguments * add debug flag as not to always print the cli command * remove comment * Update to Viash 0.9.0-RC4 * Ability to specify output globbing patterns * Avoid the need for both output_dir and output * Move fields from `info` to `links` Co-authored-by: Robrecht Cannoodt * Move references back to the info field * apologies, I proposed a wrong syntax --------- Co-authored-by: Robrecht Cannoodt * update changelog * update readme * Update salmon quant arguments (#57) * Make index an optional argument * FIx argument type and add optional argument * FEAT: add bedtools getfasta. (#59) * FEAT: add bedtools getfasta. * Add PR number to CHANGELOG * Add star genomegenerate component (#58) * Add star genomegenerate component * Update changelog * Rename component * Update test * Update CHANGELOG.md --------- Co-authored-by: Robrecht Cannoodt * fix package config (#65) * Delete src/bgzip directory (#64) It was moved to toolbox * Output alignments to the transcriptome (#56) * Output alignments to the transcriptome * Change argument name * BUG: pear component failure is ignored (#70) * FEAT + BUG: cutadapt; allowing disabling demultiplexing and fix par_quality_cutoff_r2 (#69) * FEAT: Disable cutadapt demultiplexing by default * Cutadapt: fix --par_quality_cutoff_r2 * FEAT: update busco to 5.7.1 (#72) * FEAT: update busco to 5.7.1 * Typo * Samtools fasta (#53) * initial commit dedup * Revert "initial commit dedup" This reverts commit 38f586bec0ac9e4312b016e29c3aa0bd53f292b2. 
* Fasta component * change script resource to samtools_fastq script, with dummy argument to specify the command * add dummy argument to samtools_fastq to share the script with samtools_fasta * fix path to script in config * Update src/samtools/samtools_fastq/script.sh Co-authored-by: Robrecht Cannoodt * Change default fields to examples * Two more default fields changed to examples * Minor formatting changes * Markdown formatting changes in configs --------- Co-authored-by: Robrecht Cannoodt * Umi tools dedup (#54) * initial commit dedup * Revert "initial commit dedup" This reverts commit 38f586bec0ac9e4312b016e29c3aa0bd53f292b2. * inital commit dedup * Working component with one test * Update test 1 and test data, fix some arg types in config and script * test data files and changes to script * Add third test and test data * Fix typo in script * remove utf8 characters in config * Add choices fields and change default fields to exampels * Minor formatting changes * md formatting changes in config * Fix typo (#79) * add vscode to gitignore * update multiple separator (#81) * update multiple separator * update changelog * Update src/multiqc/config.vsh.yaml Co-authored-by: Robrecht Cannoodt * Update src/multiqc/config.vsh.yaml Co-authored-by: Robrecht Cannoodt * Update src/multiqc/config.vsh.yaml Co-authored-by: Robrecht Cannoodt * Update src/multiqc/config.vsh.yaml Co-authored-by: Robrecht Cannoodt * update ifs --------- Co-authored-by: Robrecht Cannoodt * add test data * add tests * update changelog * remove unrequired test data * update descriptions * update changelog * update help text * Update src/qualimap/qualimap_rnaseq/script.sh Co-authored-by: Robrecht Cannoodt * update unit tests * update unit tests * addres pr changes request * add version * remove whitespace multiqc * Apply suggestions from code review Co-authored-by: Robrecht Cannoodt * address pr comments * Update CHANGELOG.md * fix doi * Fix name * update version and container image * write software 
version to file --------- Co-authored-by: dorien-er Co-authored-by: Leila011 Co-authored-by: Robrecht Cannoodt Co-authored-by: emmarousseau Co-authored-by: Sai Nirmayi Yasa <92786623+sainirmayi@users.noreply.github.com> Co-authored-by: Dries Schaumont <5946712+DriesSchaumont@users.noreply.github.com> Co-authored-by: Dorien <41797896+dorien-er@users.noreply.github.com> --- CHANGELOG.md | 2 + src/qualimap/qualimap_rnaseq/config.vsh.yaml | 103 ++++++++++++++++ src/qualimap/qualimap_rnaseq/help.txt | 52 ++++++++ src/qualimap/qualimap_rnaseq/script.sh | 50 ++++++++ src/qualimap/qualimap_rnaseq/test.sh | 112 ++++++++++++++++++ src/qualimap/qualimap_rnaseq/test_data/a.bam | Bin 0 -> 2447 bytes .../qualimap_rnaseq/test_data/annotation.gtf | 10 ++ .../qualimap_rnaseq/test_data/script.sh | 10 ++ 8 files changed, 339 insertions(+) create mode 100644 src/qualimap/qualimap_rnaseq/config.vsh.yaml create mode 100644 src/qualimap/qualimap_rnaseq/help.txt create mode 100644 src/qualimap/qualimap_rnaseq/script.sh create mode 100755 src/qualimap/qualimap_rnaseq/test.sh create mode 100644 src/qualimap/qualimap_rnaseq/test_data/a.bam create mode 100644 src/qualimap/qualimap_rnaseq/test_data/annotation.gtf create mode 100755 src/qualimap/qualimap_rnaseq/test_data/script.sh diff --git a/CHANGELOG.md b/CHANGELOG.md index 6bd21a1e..2f4c0c71 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -31,6 +31,8 @@ - `bedtools/bedtools_sort`: Sorts a feature file (bed/gff/vcf) by chromosome and other criteria (PR #98). - `bedtools/bedtools_bamtofastq`: Convert BAM alignments to FASTQ files (PR #101). - `bedtools/bedtools_bedtobam`: Converts genomic feature records (bed/gff/vcf) to BAM format (PR #111). + +* `qualimap/qualimap_rnaseq`: RNA-seq QC analysis using qualimap (PR #74). 
## MINOR CHANGES diff --git a/src/qualimap/qualimap_rnaseq/config.vsh.yaml b/src/qualimap/qualimap_rnaseq/config.vsh.yaml new file mode 100644 index 00000000..ffc807ab --- /dev/null +++ b/src/qualimap/qualimap_rnaseq/config.vsh.yaml @@ -0,0 +1,103 @@ +name: qualimap_rnaseq +namespace: qualimap +keywords: [RNA-seq, quality control, QC Report] +description: | + Qualimap RNA-seq QC reports quality control metrics and bias estimations + which are specific for whole transcriptome sequencing, including reads genomic + origin, junction analysis, transcript coverage and 5’-3’ bias computation. +links: + homepage: http://qualimap.conesalab.org/ + documentation: http://qualimap.conesalab.org/doc_html/analysis.html#rna-seq-qc + issue_tracker: https://bitbucket.org/kokonech/qualimap/issues?status=new&status=open + repository: https://bitbucket.org/kokonech/qualimap/commits/branch/master +references: + doi: 10.1093/bioinformatics/btv566 +license: GPL-2.0 +authors: + - __merge__: /src/_authors/dorien_roosen.yaml + roles: [ author, maintainer ] +argument_groups: + - name: "Input" + arguments: + - name: "--bam" + type: file + required: true + example: alignment.bam + description: Path to the sequence alignment file in BAM format, produced by a splicing-aware aligner. + - name: "--gtf" + type: file + required: true + example: annotations.gtf + description: Path to genomic annotations in Ensembl GTF format. + + - name: "Output" + arguments: + - name: "--qc_results" + direction: output + type: file + required: true + example: rnaseq_qc_results.txt + description: Text file containing the RNAseq QC results. + - name: "--counts" + type: file + required: false + direction: output + description: Output file for computed counts. + - name: "--report" + type: file + direction: output + required: false + example: report.html + description: Report output file. Supported formats are PDF or HTML. 
+ + - name: "Optional" + arguments: + - name: "--num_pr_bases" + type: integer + required: false + min: 1 + description: Number of upstream/downstream nucleotide bases to compute 5'-3' bias (default = 100). + - name: "--num_tr_bias" + type: integer + required: false + min: 1 + description: Number of top highly expressed transcripts to compute 5'-3' bias (default = 1000). + - name: "--algorithm" + type: string + required: false + choices: ["uniquely-mapped-reads", "proportional"] + description: Counting algorithm (uniquely-mapped-reads (default) or proportional). + - name: "--sequencing_protocol" + type: string + required: false + choices: ["non-strand-specific", "strand-specific-reverse", "strand-specific-forward"] + description: Sequencing library protocol (strand-specific-forward, strand-specific-reverse or non-strand-specific (default)). + - name: "--paired" + type: boolean_true + description: Setting this flag for paired-end experiments will result in counting fragments instead of reads. + - name: "--sorted" + type: boolean_true + description: Setting this flag indicates that the input file is already sorted by name. If flag is not set, additional sorting by name will be performed. Only requiredfor paired-end analysis. + - name: "--java_memory_size" + type: string + required: false + description: maximum Java heap memory size, default = 4G. 
+ +resources: + - type: bash_script + path: script.sh +test_resources: + - type: bash_script + path: test.sh + - path: test_data/ + +engines: + - type: docker + image: quay.io/biocontainers/qualimap:2.3--hdfd78af_0 + setup: + - type: docker + run: | + echo QualiMap: $(qualimap 2>&1 | grep QualiMap | sed 's/^.*QualiMap//') > /var/software_versions.txt +runners: + - type: executable + - type: nextflow diff --git a/src/qualimap/qualimap_rnaseq/help.txt b/src/qualimap/qualimap_rnaseq/help.txt new file mode 100644 index 00000000..c6493ed9 --- /dev/null +++ b/src/qualimap/qualimap_rnaseq/help.txt @@ -0,0 +1,52 @@ +QualiMap v.2.3 +Built on 2023-05-19 16:57 + +usage: qualimap [options] + +To launch GUI leave empty. + +Available tools: + + bamqc Evaluate NGS mapping to a reference genome + rnaseq Evaluate RNA-seq alignment data + counts Counts data analysis (further RNA-seq data evaluation) + multi-bamqc Compare QC reports from multiple NGS mappings + clustering Cluster epigenomic signals + comp-counts Compute feature counts + +Special arguments: + + --java-mem-size Use this argument to set Java memory heap size. Example: + qualimap bamqc -bam very_large_alignment.bam --java-mem-size=4G + +usage: qualimap rnaseq [-a ] -bam -gtf [-npb ] [-ntb + ] [-oc ] [-outdir ] [-outfile ] [-outformat ] + [-p ] [-pe] [-s] + -a,--algorithm Counting algorithm: + uniquely-mapped-reads(default) or + proportional. + -bam Input mapping file in BAM format. + -gtf Annotations file in Ensembl GTF format. + -npb,--num-pr-bases Number of upstream/downstream nucleotide bases + to compute 5'-3' bias (default is 100). + -ntb,--num-tr-bias Number of top highly expressed transcripts to + compute 5'-3' bias (default is 1000). + -oc Output file for computed counts. If only name + of the file is provided, then the file will be + saved in the output folder. + -outdir Output folder for HTML report and raw data. + -outfile Output file for PDF report (default value is + report.pdf). 
+ -outformat Format of the output report (PDF, HTML or both + PDF:HTML, default is HTML). + -p,--sequencing-protocol Sequencing library protocol: + strand-specific-forward, + strand-specific-reverse or non-strand-specific + (default) + -pe,--paired Setting this flag for paired-end experiments + will result in counting fragments instead of + reads + -s,--sorted This flag indicates that the input file is + already sorted by name. If not set, additional + sorting by name will be performed. Only + required for paired-end analysis. \ No newline at end of file diff --git a/src/qualimap/qualimap_rnaseq/script.sh b/src/qualimap/qualimap_rnaseq/script.sh new file mode 100644 index 00000000..351e5159 --- /dev/null +++ b/src/qualimap/qualimap_rnaseq/script.sh @@ -0,0 +1,50 @@ +#!/bin/bash + +set -eo pipefail + +tmp_dir=$(mktemp -d -p "$meta_temp_dir" qualimap_XXXXXXXXX) + +# Handle output parameters +if [ -n "$par_report" ]; then + outfile=$(basename "$par_report") + report_extension="${outfile##*.}" +fi + +if [ -n "$par_counts" ]; then + counts=$(basename "$par_counts") +fi + +# disable flags +[[ "$par_paired" == "false" ]] && unset par_paired +[[ "$par_sorted" == "false" ]] && unset par_sorted + +# Run qualimap +qualimap rnaseq \ + ${meta_memory_mb:+--java-mem-size=${meta_memory_mb}M} \ + ${par_algorithm:+--algorithm $par_algorithm} \ + ${par_sequencing_protocol:+--sequencing-protocol $par_sequencing_protocol} \ + -bam $par_bam \ + -gtf $par_gtf \ + -outdir "$tmp_dir" \ + ${par_num_pr_bases:+--num-pr-bases $par_num_pr_bases} \ + ${par_num_tr_bias:+--num-tr-bias $par_num_tr_bias} \ + ${par_report:+-outformat $report_extension} \ + ${par_paired:+--paired} \ + ${par_sorted:+--sorted} \ + ${par_report:+-outfile "$outfile"} \ + ${par_counts:+-oc "$counts"} + +# Move output files +mv "$tmp_dir/rnaseq_qc_results.txt" "$par_qc_results" + +if [ -n "$par_report" ] && [ $report_extension = "html" ]; then + mv "$tmp_dir/qualimapReport.html" "$par_report" +fi + +if [ -n "$par_report" ] 
&& [ $report_extension = "pdf" ]; then + mv "$tmp_dir/$outfile" "$par_report" +fi + +if [ -n "$par_counts" ]; then + mv "$tmp_dir/$counts" "$par_counts" +fi diff --git a/src/qualimap/qualimap_rnaseq/test.sh b/src/qualimap/qualimap_rnaseq/test.sh new file mode 100755 index 00000000..2e1b647b --- /dev/null +++ b/src/qualimap/qualimap_rnaseq/test.sh @@ -0,0 +1,112 @@ +set -e + +############################################# +# helper functions +assert_file_exists() { + [ -f "$1" ] || { echo "File '$1' does not exist" && exit 1; } +} +assert_file_doesnt_exist() { + [ ! -f "$1" ] || { echo "File '$1' exists but shouldn't" && exit 1; } +} +assert_file_not_empty() { + [ -s "$1" ] || { echo "File '$1' is empty but shouldn't be" && exit 1; } +} +assert_file_contains() { + grep -q "$2" "$1" || { echo "File '$1' does not contain '$2'" && exit 1; } +} +############################################# + + +test_dir="$meta_resources_dir/test_data" + +mkdir "run_qualimap_rnaseq_html" +cd "run_qualimap_rnaseq_html" + +echo "> Running qualimap with html output report" + +"$meta_executable" \ + --bam $test_dir/a.bam \ + --gtf $test_dir/annotation.gtf \ + --report report.html \ + --counts counts.txt \ + --qc_results output.txt + +echo ">> Checking output" +assert_file_exists "report.html" +assert_file_exists "counts.txt" +assert_file_exists "output.txt" +assert_file_doesnt_exist "report.pdf" + +echo ">> Checking if output is empty" +assert_file_not_empty "report.html" +assert_file_not_empty "counts.txt" +assert_file_not_empty "output.txt" + +echo ">> Checking output contents" +assert_file_contains "output.txt" ">>>>>>> Input" +assert_file_contains "output.txt" ">>>>>>> Reads alignment" +assert_file_contains "output.txt" ">>>>>>> Reads genomic origin" +assert_file_contains "output.txt" ">>>>>>> Transcript coverage profile" +assert_file_contains "output.txt" ">>>>>>> Junction analysis" +assert_file_contains "output.txt" ">>>>>>> Transcript coverage profile" + +assert_file_contains 
"counts.txt" "ENSG00000125841.12" + +assert_file_contains "report.html" "Qualimap report: RNA Seq QC" +assert_file_contains "report.html" "

Input

" +assert_file_contains "report.html" "

Reads alignment

" +assert_file_contains "report.html" "

Reads genomic origin

" +assert_file_contains "report.html" "

Transcript coverage profile

" +assert_file_contains "report.html" "

Junction analysis

" + + +cd .. +rm -r run_qualimap_rnaseq_html + +mkdir "run_qualimap_rnaseq_pdf" +cd "run_qualimap_rnaseq_pdf" + +echo "> Running qualimap with pdf output report" + +"$meta_executable" \ + --bam $test_dir/a.bam \ + --gtf $test_dir/annotation.gtf \ + --report report.pdf \ + --counts counts.txt \ + --qc_results output.txt + +echo ">> Checking output" +assert_file_exists "report.pdf" +assert_file_exists "counts.txt" +assert_file_exists "output.txt" +assert_file_doesnt_exist "report.html" + +echo ">> Checking if output is empty" +assert_file_not_empty "report.pdf" +assert_file_not_empty "counts.txt" +assert_file_not_empty "output.txt" + +cd .. +rm -r run_qualimap_rnaseq_pdf + +mkdir "run_qualimap_rnaseq" +cd "run_qualimap_rnaseq" + +echo "> Running qualimap without report and counts output" + +"$meta_executable" \ + --bam $test_dir/a.bam \ + --gtf $test_dir/annotation.gtf \ + --qc_results output.txt + +echo ">> Checking output" +assert_file_doesnt_exist "report.pdf" +assert_file_doesnt_exist "report.html" +assert_file_doesnt_exist "counts.txt" +assert_file_exists "output.txt" + +echo ">> Checking if output is empty" +assert_file_not_empty "output.txt" + +cd .. +rm -r run_qualimap_rnaseq \ No newline at end of file diff --git a/src/qualimap/qualimap_rnaseq/test_data/a.bam b/src/qualimap/qualimap_rnaseq/test_data/a.bam new file mode 100644 index 0000000000000000000000000000000000000000..c8ea1065e89ca06cf12711850c36f85fba0d31b3 GIT binary patch literal 2447 zcmV;A32^owiwFb&00000{{{d;LjnL`0CRHmWi(=7U~uqo;SBS$GSoBU4EDE5&d)DO z$;?YEN#$|~4&)5>vr5h=GBV)w@v|~B0Rlrab1p`pE;b+r%P_EqUuOmY%wr|6OaK4? zABzYC000000RIL6LPG)o5edy%O^9Si9e+KOiG~T1sx7fvv$$2fVM}u9e*H1sjo?;I z5lhI2uzOI^AnR^U8Z~?JpzbRQErbQl3SK-2f`LWULr$WbKo(sQKk$-+AUU{t5Q(5D zM$GuDneLvqJ^fzq?2aM-nVz0kJw5dA_y79+|8Jv}?b(+vZe*u-pQ7v8ce9K8N7*A! 
zZ)60o>rQ7p=uNssf8ri39&`5WM?N|!Cf(k!nDjfN-lQ1!yQAzocnihjM;lqDESFJA zg9vtGth8%dO8P7oi?Y}Hu%w&%^tE1R2pm+h@(w*2%7x6^VT z+G**tJ2#$qe5due{aCm2)Xun@y_1ba&wcj( zOzobhtx6cYh_mOi`Y?b^y2D~Po)o=e)ZK$0ERW6yTUu_l6Utgfxm6}arFE&5V%7*9 zbPyInz;dQ*{NHZ4T59c@-}kKo|K<|-FV5i~I6qPH%x=B&m%>*7&p4yLv{o{1IU=4( zVYFq&E6Y4Z*juYZutr%bbZz(_tF3_lmorZS{#Q;^z5jOeWajIOK+zox@ngF`niPor z@gDr>Y1EXt2pZsU>zNH!TkZf_BmILGA{wHFBPP!g2I|z-QEb7lE7GDpe_8=QLEVRc z;Qd6&m+Di0DSRc6)mjPQS0P&X*{BdCB{GO$ohhx0gbEg%$cg1X@>7RD?lK7XXf^O> zmG9{vTxgx%{r$UI*{8A#)3a~=@IvcKLkk#o`;+dVKO9U(-F_!K1N1^ljsPIU86K^& z(J`caz)>7-A=Wu^( zM!P7rpLa(kpt~-AQk-U5uv5Z+tYuiyU<>7tZE6SGn{w2&z0)GT10s&OP|y z!YFAU+B$42BK{5&qWWh*zXtScHOYT|4*%*LexhVtU;3BAx2!biexAyaQe^B}pB}etX&$U%a z)iB}>dZ1+524bUOff=p6F+v&R1z6x#FqV^t4t>;>b{Dx(1Al$0{@?lX$u+h6(z~nb z{}aDHxpv{f2K8T{=Z%YgZ_?`zI@{T0RKJ(6W!Z1>UcnFtP8jAB6^=>nyrRS?dYn2N4%Ox}<$on)LL_wnF+kbD+{tGm`x|U^^HnZ$=>siKt z64hNY29*m96a-2GV@`pTkOIXh2Bi=*N4$V6zO)n`ztzw%*Fhf`frj_Jr&-GozeeY97$|Ugt-vz>mg7S}wsu&HW z0xJQo3Aa(93kEC;+>wTW<{H@a>I(P?>OTAf9hfNjX~Ph(41P(dvWiQVdY?|SZxVlm zDHk*&%*sfJ$HRdgC<7|U<*hrmdzc4S3<0OETwgoAyK}Od`>){a;fCBVa8PUyN23WS z*dKwk4$el*eoo^X1x^Paf+dj#fxK6O3X5d`bfQFrPz(x7o!@}7y<9Ua6Uh}@f&P^_ z`h(O@jQp`7^_QUMvJ~8urA5h8GBrBQOCL;*vhQi>koXRLfU3R9&xo~edi$%QpN&BF zneMvUeaCyNng1-#u4YXG;CRyQY(oN)l8m4_>&H@Y}0aUw2{^_&=M@fW6+i;r2ZDpPdtMko$?0&+InMfQJcaKq?g1 z%3&NpKt!CC9>L+&vc`8jLx0s42$=bwOy$p?yDzMt-ffHfD*nGUd13w02kQMl7;I0v zqak)+gYI~k{R40LdQw1FAO!Jja-}i!M2U)zmWN7@K}dRyOPc3M0&Ih$mY4!Vo|nj# zcy|)us9aT;xB@0Nz(mnI*V!qGozc$VgGDcEwGIeKq@=yTY%7w8`td*OV`71n8~9h3 z@5?garz`PWqA?)l-eO!}`R7s3skIwiqRek{n0!z{&4Fi664dN zS{WYs)6MlSY}9Ll{&rf9bbDhoMXX2nhhuaF8Zjg}NwEOI04#Fu0(aKoR)Go<(lANn z5U?K(m=k1eWw`t7*gSk$;ouib=7Iz@!C$Y`g+$3$U$55$i}3%3dEnAxS@ua}g?3I` ztj2OnrL{ppD4-qUD9uBGgCCQpDFG{Tj7l~)<~aL9yY{$-hFKK6|6jiW4e_=pxPi03 zHtYzBAtdp5G#XBNy}_W^-w~w!uTh?9$qX?RpmAqpc4`k!6GxOX8IKEoa~D~m3~$Wg zA4EZ-SVif#U^c0D*-tP@+*Lq=Xrul{vE_Z3xoRTri0x$+5w2PMaF|uTG&p zaN>7>{|%9p;knA$xdwI6>!oT~bnsGl8_n^|MjDDupcsTu39Q8J0Mx)d;0__a@_??1 
zBNebIP-g4@U=@(xn}Qtmvo{tnu2uA3I{SWu09eGxP&5)w=dcHr+T&)Y(jq7ubZ+uM zvHy_HVC+>A&rx6=^wDA*3diW?23yX+{{U)jjNb?z001A02m}BC000301^_}s0stET N0{{R300000002N$thE3D literal 0 HcmV?d00001 diff --git a/src/qualimap/qualimap_rnaseq/test_data/annotation.gtf b/src/qualimap/qualimap_rnaseq/test_data/annotation.gtf new file mode 100644 index 00000000..976de753 --- /dev/null +++ b/src/qualimap/qualimap_rnaseq/test_data/annotation.gtf @@ -0,0 +1,10 @@ +chr20 HAVANA transcript 347024 354868 . + . gene_id "ENSG00000125841.12"; transcript_id "ENST00000382291.7"; gene_type "protein_coding"; gene_name "NRSN2"; transcript_type "protein_coding"; transcript_name "NRSN2-202"; level 2; protein_id "ENSP00000371728.3"; transcript_support_level "2"; tag "basic"; tag "appris_principal_1"; tag "CCDS"; ccdsid "CCDS12996.1"; havana_gene "OTTHUMG00000031628.5"; havana_transcript "OTTHUMT00000077446.1"; +chr20 HAVANA exon 347024 347142 . + . gene_id "ENSG00000125841.12"; transcript_id "ENST00000382291.7"; gene_type "protein_coding"; gene_name "NRSN2"; transcript_type "protein_coding"; transcript_name "NRSN2-202"; exon_number 1; exon_id "ENSE00001831391.1"; level 2; protein_id "ENSP00000371728.3"; transcript_support_level "2"; tag "basic"; tag "appris_principal_1"; tag "CCDS"; ccdsid "CCDS12996.1"; havana_gene "OTTHUMG00000031628.5"; havana_transcript "OTTHUMT00000077446.1"; +chr20 HAVANA exon 349249 349363 . + . gene_id "ENSG00000125841.12"; transcript_id "ENST00000382291.7"; gene_type "protein_coding"; gene_name "NRSN2"; transcript_type "protein_coding"; transcript_name "NRSN2-202"; exon_number 2; exon_id "ENSE00001491647.1"; level 2; protein_id "ENSP00000371728.3"; transcript_support_level "2"; tag "basic"; tag "appris_principal_1"; tag "CCDS"; ccdsid "CCDS12996.1"; havana_gene "OTTHUMG00000031628.5"; havana_transcript "OTTHUMT00000077446.1"; +chr20 HAVANA exon 349638 349832 . + . 
gene_id "ENSG00000125841.12"; transcript_id "ENST00000382291.7"; gene_type "protein_coding"; gene_name "NRSN2"; transcript_type "protein_coding"; transcript_name "NRSN2-202"; exon_number 3; exon_id "ENSE00003710328.1"; level 2; protein_id "ENSP00000371728.3"; transcript_support_level "2"; tag "basic"; tag "appris_principal_1"; tag "CCDS"; ccdsid "CCDS12996.1"; havana_gene "OTTHUMG00000031628.5"; havana_transcript "OTTHUMT00000077446.1"; +chr20 HAVANA CDS 349644 349832 . + 0 gene_id "ENSG00000125841.12"; transcript_id "ENST00000382291.7"; gene_type "protein_coding"; gene_name "NRSN2"; transcript_type "protein_coding"; transcript_name "NRSN2-202"; exon_number 3; exon_id "ENSE00003710328.1"; level 2; protein_id "ENSP00000371728.3"; transcript_support_level "2"; tag "basic"; tag "appris_principal_1"; tag "CCDS"; ccdsid "CCDS12996.1"; havana_gene "OTTHUMG00000031628.5"; havana_transcript "OTTHUMT00000077446.1"; +chr20 HAVANA start_codon 349644 349646 . + 0 gene_id "ENSG00000125841.12"; transcript_id "ENST00000382291.7"; gene_type "protein_coding"; gene_name "NRSN2"; transcript_type "protein_coding"; transcript_name "NRSN2-202"; exon_number 3; exon_id "ENSE00003710328.1"; level 2; protein_id "ENSP00000371728.3"; transcript_support_level "2"; tag "basic"; tag "appris_principal_1"; tag "CCDS"; ccdsid "CCDS12996.1"; havana_gene "OTTHUMG00000031628.5"; havana_transcript "OTTHUMT00000077446.1"; +chr20 HAVANA exon 353210 354868 . + . gene_id "ENSG00000125841.12"; transcript_id "ENST00000382291.7"; gene_type "protein_coding"; gene_name "NRSN2"; transcript_type "protein_coding"; transcript_name "NRSN2-202"; exon_number 4; exon_id "ENSE00001822456.1"; level 2; protein_id "ENSP00000371728.3"; transcript_support_level "2"; tag "basic"; tag "appris_principal_1"; tag "CCDS"; ccdsid "CCDS12996.1"; havana_gene "OTTHUMG00000031628.5"; havana_transcript "OTTHUMT00000077446.1"; +chr20 HAVANA CDS 353210 353632 . 
+ 0 gene_id "ENSG00000125841.12"; transcript_id "ENST00000382291.7"; gene_type "protein_coding"; gene_name "NRSN2"; transcript_type "protein_coding"; transcript_name "NRSN2-202"; exon_number 4; exon_id "ENSE00001822456.1"; level 2; protein_id "ENSP00000371728.3"; transcript_support_level "2"; tag "basic"; tag "appris_principal_1"; tag "CCDS"; ccdsid "CCDS12996.1"; havana_gene "OTTHUMG00000031628.5"; havana_transcript "OTTHUMT00000077446.1"; +chr20 HAVANA stop_codon 353633 353635 . + 0 gene_id "ENSG00000125841.12"; transcript_id "ENST00000382291.7"; gene_type "protein_coding"; gene_name "NRSN2"; transcript_type "protein_coding"; transcript_name "NRSN2-202"; exon_number 4; exon_id "ENSE00001822456.1"; level 2; protein_id "ENSP00000371728.3"; transcript_support_level "2"; tag "basic"; tag "appris_principal_1"; tag "CCDS"; ccdsid "CCDS12996.1"; havana_gene "OTTHUMG00000031628.5"; havana_transcript "OTTHUMT00000077446.1"; +chr20 HAVANA UTR 347024 347142 . + . gene_id "ENSG00000125841.12"; transcript_id "ENST00000382291.7"; gene_type "protein_coding"; gene_name "NRSN2"; transcript_type "protein_coding"; transcript_name "NRSN2-202"; exon_number 1; exon_id "ENSE00001831391.1"; level 2; protein_id "ENSP00000371728.3"; transcript_support_level "2"; tag "basic"; tag "appris_principal_1"; tag "CCDS"; ccdsid "CCDS12996.1"; havana_gene "OTTHUMG00000031628.5"; havana_transcript "OTTHUMT00000077446.1"; diff --git a/src/qualimap/qualimap_rnaseq/test_data/script.sh b/src/qualimap/qualimap_rnaseq/test_data/script.sh new file mode 100755 index 00000000..801fe405 --- /dev/null +++ b/src/qualimap/qualimap_rnaseq/test_data/script.sh @@ -0,0 +1,10 @@ +# qualimap test data + +# Test data was obtained from https://github.com/snakemake/snakemake-wrappers/raw/master/bio/qualimap/rnaseq/test + +if [ ! 
-d /tmp/snakemake-wrappers ]; then + git clone --depth 1 --single-branch --branch master https://github.com/snakemake/snakemake-wrappers /tmp/snakemake-wrappers +fi + +cp -r /tmp/snakemake-wrappers/bio/qualimap/rnaseq/test/mapped/a.bam src/qualimap/qualimap_rnaseq/test_data +cp -r /tmp/snakemake-wrappers/bio/qualimap/rnaseq/test/annotation.gtf src/qualimap/qualimap_rnaseq/test_data From c4ea23a0f508b93b31bb1a36418ad4868fdb5bc3 Mon Sep 17 00:00:00 2001 From: Sai Nirmayi Yasa <92786623+sainirmayi@users.noreply.github.com> Date: Wed, 21 Aug 2024 17:31:32 +0200 Subject: [PATCH 03/28] Add RSEM prepare reference component (#89) * initial commit * incorporaate some requested changes * update test * change argument reference_fasta_files to multiple true and update docker setup * Update src/rsem/rsem_prepare_reference/config.vsh.yaml Co-authored-by: Robrecht Cannoodt * Update src/rsem/rsem_prepare_reference/config.vsh.yaml Co-authored-by: Robrecht Cannoodt * Update src/rsem/rsem_prepare_reference/script.sh Co-authored-by: Robrecht Cannoodt * set multiple true * update changelog * Apply suggestions from code review * fix script --------- Co-authored-by: Robrecht Cannoodt --- CHANGELOG.md | 2 + .../rsem_prepare_reference/config.vsh.yaml | 196 +++++++++++++++++ src/rsem/rsem_prepare_reference/help.txt | 207 ++++++++++++++++++ src/rsem/rsem_prepare_reference/script.sh | 42 ++++ src/rsem/rsem_prepare_reference/test.sh | 37 ++++ 5 files changed, 484 insertions(+) create mode 100644 src/rsem/rsem_prepare_reference/config.vsh.yaml create mode 100644 src/rsem/rsem_prepare_reference/help.txt create mode 100644 src/rsem/rsem_prepare_reference/script.sh create mode 100644 src/rsem/rsem_prepare_reference/test.sh diff --git a/CHANGELOG.md b/CHANGELOG.md index 2f4c0c71..3e9f40fc 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -34,6 +34,8 @@ * `qualimap/qualimap_rnaseq`: RNA-seq QC analysis using qualimap (PR #74). 
+* `rsem/rsem_prepare_reference`: Prepare transcript references for RSEM (PR #89). + ## MINOR CHANGES * `busco` components: update BUSCO to `5.7.1` (PR #72). diff --git a/src/rsem/rsem_prepare_reference/config.vsh.yaml b/src/rsem/rsem_prepare_reference/config.vsh.yaml new file mode 100644 index 00000000..44915a2f --- /dev/null +++ b/src/rsem/rsem_prepare_reference/config.vsh.yaml @@ -0,0 +1,196 @@ +name: rsem_prepare_reference +namespace: rsem +description: | + RSEM is a software package for estimating gene and isoform expression levels from RNA-Seq data. This component prepares transcript references for RSEM. +keywords: ["Transcriptome", "Index"] +links: + homepage: http://deweylab.github.io/RSEM + documentation: https://deweylab.github.io/RSEM/rsem-prepare-reference.html + repository: https://github.com/deweylab/RSEM +references: + doi: 10.1186/1471-2105-12-323 +license: GPL-3.0 +requirements: + commands: [ rsem-prepare-reference ] +authors: + - __merge__: /src/_authors/sai_nirmayi_yasa.yaml + roles: [ author, maintainer ] + +argument_groups: + - name: Inputs + arguments: + - name: --reference_fasta_files + type: file + description: | + Semi-colon separated list of Multi-FASTA formatted files OR a directory name. If a directory name is specified, RSEM will read all files with suffix ".fa" or ".fasta" in this directory. The files should contain either the sequences of transcripts or an entire genome, depending on whether the '--gtf' option is used. + required: true + multiple: true + example: read1.fasta + - name: --reference_name + type: string + description: | + The name of the reference used. RSEM will generate several reference-related files that are prefixed by this name. This name can contain path information (e.g. '/ref/mm9'). + required: true + example: /ref/mm9 + + - name: Outputs + arguments: + - name: --output + type: file + description: Directory containing reference files generated by RSEM. 
+ required: true + direction: output + + - name: Other options + arguments: + - name: --gtf + type: file + description: Assume that 'reference_fasta_files' contains the sequence of a genome, and extract transcript reference sequences using the gene annotations specified in the GTF file. If this and '--gff3' options are not provided, RSEM will assume 'reference_fasta_files' contains the reference transcripts. In this case, RSEM assumes that name of each sequence in the Multi-FASTA files is its transcript_id. + example: annotations.gtf + - name: --gff3 + type: file + description: GFF3 annotation file. Converted to GTF format with the file name 'reference_name.gtf'. Please make sure that 'reference_name.gtf' does not exist. + example: annotations.gff + - name: --gff3_rna_patterns + type: string + description: List of transcript categories (separated by semi-colon). Only transcripts that match the string will be extracted. + multiple: true + example: mRNA;rRNA + - name: --gff3_genes_as_transcripts + type: boolean_true + description: This option is designed for untypical organisms, such as viruses, whose GFF3 files only contain genes. RSEM will assume each gene as a unique transcript when it converts the GFF3 file into GTF format. + - name: --trusted_sources + type: string + description: List of trusted sources (separated by semi-colon). Only transcripts coming from these sources will be extracted. If this option is off, all sources are accepted. + multiple: true + example: ENSEMBL;HAVANA + - name: --transcript_to_gene_map + type: file + description: | + Use information from this file to map from transcript (isoform) ids to gene ids. Each line of this file should be of the form: + gene_id transcript_id + with the two fields separated by a tab character. + If you are using a GTF file for the "UCSC Genes" gene set from the UCSC Genome Browser, then the "knownIsoforms.txt" file (obtained from the "Downloads" section of the UCSC Genome Browser site) is of this format. 
+ If this option is off, then the mapping of isoforms to genes depends on whether the '--gtf' option is specified. If '--gtf' is specified, then RSEM uses the "gene_id" and "transcript_id" attributes in the GTF file. Otherwise, RSEM assumes that each sequence in the reference sequence files is a separate gene. + example: isoforms.txt + - name: --allele_to_gene_map + type: file + description: | + Use information from to provide gene_id and transcript_id information for each allele-specific transcript. Each line of should be of the form: + gene_id transcript_id allele_id + with the fields separated by a tab character. + This option is designed for quantifying allele-specific expression. It is only valid if '--gtf' option is not specified. allele_id should be the sequence names presented in the Multi-FASTA-formatted files. + - name: --polyA + type: boolean_true + description: Add poly(A) tails to the end of all reference isoforms. The length of poly(A) tail added is specified by '--polyA-length' option. STAR aligner users may not want to use this option. + - name: --polyA_length + type: integer + description: The length of the poly(A) tails to be added. + example: 125 + - name: --no_polyA_subset + type: file + description: Only meaningful if '--polyA' is specified. Do not add poly(A) tails to those transcripts listed in this file containing a list of transcript_ids. + example: transcript_ids.txt + - name: --bowtie + type: boolean_true + description: Build Bowtie indices. + - name: --bowtie2 + type: boolean_true + description: Build Bowtie 2 indices. + - name: --star + type: boolean_true + description: Build STAR indices. + - name: --star_sjdboverhang + type: integer + description: Length of the genomic sequence around annotated junction. It is only used for STAR to build splice junctions database and not needed for Bowtie or Bowtie2. It will be passed as the --sjdbOverhang option to STAR. According to STAR's manual, its ideal value is max(ReadLength)-1, e.g. 
for 2x101 paired-end reads, the ideal value is 101-1=100. In most cases, the default value of 100 will work as well as the ideal value. (Default is 100) + example: 100 + - name: --hisat2_hca + type: boolean_true + description: Build HISAT2 indices on the transcriptome according to Human Cell Atlas (HCA) SMART-Seq2 pipeline. + - name: --quiet + alternatives: -q + type: boolean_true + description: Suppress the output of logging information. + + - name: Prior-enhanced RSEM options + arguments: + - name: --prep_pRSEM + type: boolean_true + description: A Boolean indicating whether to prepare reference files for pRSEM, including building Bowtie indices for a genome and selecting training set isoforms. The index files will be used for aligning ChIP-seq reads in prior-enhanced RSEM and the training set isoforms will be used for learning prior. A path to Bowtie executables and a mappability file in bigWig format are required when this option is on. Currently, Bowtie2 is not supported for prior-enhanced RSEM. + - name: --mappability_bigwig_file + type: file + description: Full path to a whole-genome mappability file in bigWig format. This file is required for running prior-enhanced RSEM. It is used for selecting a training set of isoforms for prior-learning. This file can be either downloaded from UCSC Genome Browser or generated by GEM (Derrien et al., 2012, PLoS One). 
+ +resources: + - type: bash_script + path: script.sh + +test_resources: + - type: bash_script + path: test.sh + +engines: +- type: docker + image: ubuntu:22.04 + setup: + - type: apt + packages: + - build-essential + - gcc + - g++ + - make + - wget + - zlib1g-dev + - unzip xxd + - perl + - r-base + - bowtie2 + - pip + - git + - type: python + packages: bowtie + - type: docker + env: + - STAR_VERSION=2.7.11b + - RSEM_VERSION=1.3.3 + - BOWTIE_VERSION=1.3.1 + - TZ=Europe/Brussels + run: | + ln -snf /usr/share/zoneinfo/$TZ /etc/localtime && echo $TZ > /etc/timezone && \ + cd /tmp && \ + wget --no-check-certificate https://github.com/alexdobin/STAR/archive/refs/tags/${STAR_VERSION}.zip && \ + unzip ${STAR_VERSION}.zip && \ + cd STAR-${STAR_VERSION}/source && \ + make STARstatic CXXFLAGS_SIMD=-std=c++11 && \ + cp STAR /usr/local/bin && \ + cd /tmp && \ + wget --no-check-certificate https://github.com/deweylab/RSEM/archive/refs/tags/v${RSEM_VERSION}.zip && \ + unzip v${RSEM_VERSION}.zip && \ + cd RSEM-${RSEM_VERSION} && \ + make && \ + make install && \ + cd /tmp && \ + wget --no-check-certificate -O bowtie-${BOWTIE_VERSION}-linux-x86_64.zip https://sourceforge.net/projects/bowtie-bio/files/bowtie/${BOWTIE_VERSION}/bowtie-${BOWTIE_VERSION}-linux-x86_64.zip/download && \ + unzip bowtie-${BOWTIE_VERSION}-linux-x86_64.zip && \ + cp bowtie-${BOWTIE_VERSION}-linux-x86_64/bowtie* /usr/local/bin && \ + cd /tmp && \ + git clone https://github.com/DaehwanKimLab/hisat2.git /tmp/hisat2 && \ + cd /tmp/hisat2 && \ + make && \ + cp -r hisat2* /usr/local/bin && \ + cd && \ + rm -rf /tmp/STAR-${STAR_VERSION} /tmp/${STAR_VERSION}.zip /tmp/bowtie-${BOWTIE_VERSION}-linux-x86_64 /tmp/hisat2 && \ + apt-get --purge autoremove -y ${PACKAGES} && \ + apt-get clean + + - type: docker + run: | + echo "RSEM: `rsem-calculate-expression --version | sed -e 's/Current version: RSEM v//g'`" > /var/software_versions.txt && \ + echo "STAR: `STAR --version`" >> /var/software_versions.txt && \ + echo 
"bowtie2: `bowtie2 --version | grep -oP '\d+\.\d+\.\d+'`" >> /var/software_versions.txt && \ + echo "bowtie: `bowtie --version | grep -oP 'bowtie-align-s version \K\d+\.\d+\.\d+'`" >> /var/software_versions.txt && \ + echo "HISAT2: `hisat2 --version | grep -oP 'hisat2-align-s version \K\d+\.\d+\.\d+'`" >> /var/software_versions.txt + +runners: + - type: executable + - type: nextflow \ No newline at end of file diff --git a/src/rsem/rsem_prepare_reference/help.txt b/src/rsem/rsem_prepare_reference/help.txt new file mode 100644 index 00000000..c69899ec --- /dev/null +++ b/src/rsem/rsem_prepare_reference/help.txt @@ -0,0 +1,207 @@ +```bash +rsem-prepare-reference --help +``` + +NAME +rsem-prepare-reference - Prepare transcript references for RSEM and optionally build BOWTIE/BOWTIE2/STAR/HISAT2(transcriptome) indices. + +SYNOPSIS + rsem-prepare-reference [options] reference_fasta_file(s) reference_name +ARGUMENTS +reference_fasta_file(s) +Either a comma-separated list of Multi-FASTA formatted files OR a directory name. If a directory name is specified, RSEM will read all files with suffix ".fa" or ".fasta" in this directory. The files should contain either the sequences of transcripts or an entire genome, depending on whether the '--gtf' option is used. + +reference name +The name of the reference used. RSEM will generate several reference-related files that are prefixed by this name. This name can contain path information (e.g. '/ref/mm9'). + +OPTIONS +--gtf +If this option is on, RSEM assumes that 'reference_fasta_file(s)' contains the sequence of a genome, and will extract transcript reference sequences using the gene annotations specified in , which should be in GTF format. + +If this and '--gff3' options are off, RSEM will assume 'reference_fasta_file(s)' contains the reference transcripts. In this case, RSEM assumes that name of each sequence in the Multi-FASTA files is its transcript_id. 
+ +(Default: off) + +--gff3 +The annotation file is in GFF3 format instead of GTF format. RSEM will first convert it to GTF format with the file name 'reference_name.gtf'. Please make sure that 'reference_name.gtf' does not exist. (Default: off) + +--gff3-RNA-patterns + is a comma-separated list of transcript categories, e.g. "mRNA,rRNA". Only transcripts that match the will be extracted. (Default: "mRNA") + +--gff3-genes-as-transcripts +This option is designed for untypical organisms, such as viruses, whose GFF3 files only contain genes. RSEM will assume each gene as a unique transcript when it converts the GFF3 file into GTF format. + +--trusted-sources + is a comma-separated list of trusted sources, e.g. "ENSEMBL,HAVANA". Only transcripts coming from these sources will be extracted. If this option is off, all sources are accepted. (Default: off) + +--transcript-to-gene-map +Use information from to map from transcript (isoform) ids to gene ids. Each line of should be of the form: + +gene_id transcript_id + +with the two fields separated by a tab character. + +If you are using a GTF file for the "UCSC Genes" gene set from the UCSC Genome Browser, then the "knownIsoforms.txt" file (obtained from the "Downloads" section of the UCSC Genome Browser site) is of this format. + +If this option is off, then the mapping of isoforms to genes depends on whether the '--gtf' option is specified. If '--gtf' is specified, then RSEM uses the "gene_id" and "transcript_id" attributes in the GTF file. Otherwise, RSEM assumes that each sequence in the reference sequence files is a separate gene. + +(Default: off) + +--allele-to-gene-map +Use information from to provide gene_id and transcript_id information for each allele-specific transcript. Each line of should be of the form: + +gene_id transcript_id allele_id + +with the fields separated by a tab character. + +This option is designed for quantifying allele-specific expression. It is only valid if '--gtf' option is not specified. 
allele_id should be the sequence names presented in the Multi-FASTA-formatted files. + +(Default: off) + +--polyA +Add poly(A) tails to the end of all reference isoforms. The length of poly(A) tail added is specified by '--polyA-length' option. STAR aligner users may not want to use this option. (Default: do not add poly(A) tail to any of the isoforms) + +--polyA-length +The length of the poly(A) tails to be added. (Default: 125) + +--no-polyA-subset +Only meaningful if '--polyA' is specified. Do not add poly(A) tails to those transcripts listed in . is a file containing a list of transcript_ids. (Default: off) + +--bowtie +Build Bowtie indices. (Default: off) + +--bowtie-path +The path to the Bowtie executables. (Default: the path to Bowtie executables is assumed to be in the user's PATH environment variable) + +--bowtie2 +Build Bowtie 2 indices. (Default: off) + +--bowtie2-path +The path to the Bowtie 2 executables. (Default: the path to Bowtie 2 executables is assumed to be in the user's PATH environment variable) + +--star +Build STAR indices. (Default: off) + +--star-path +The path to STAR's executable. (Default: the path to STAR executable is assumed to be in user's PATH environment variable) + +--star-sjdboverhang +Length of the genomic sequence around annotated junction. It is only used for STAR to build splice junctions database and not needed for Bowtie or Bowtie2. It will be passed as the --sjdbOverhang option to STAR. According to STAR's manual, its ideal value is max(ReadLength)-1, e.g. for 2x101 paired-end reads, the ideal value is 101-1=100. In most cases, the default value of 100 will work as well as the ideal value. (Default: 100) + +--hisat2-hca +Build HISAT2 indices on the transcriptome according to Human Cell Atlas (HCA) SMART-Seq2 pipeline. (Default: off) + +--hisat2-path +The path to the HISAT2 executables. 
(Default: the path to HISAT2 executables is assumed to be in the user's PATH environment variable) + +-p/--num-threads +Number of threads to use for building STAR's genome indices. (Default: 1) + +-q/--quiet +Suppress the output of logging information. (Default: off) + +-h/--help +Show help information. + +PRIOR-ENHANCED RSEM OPTIONS +--prep-pRSEM +A Boolean indicating whether to prepare reference files for pRSEM, including building Bowtie indices for a genome and selecting training set isoforms. The index files will be used for aligning ChIP-seq reads in prior-enhanced RSEM and the training set isoforms will be used for learning prior. A path to Bowtie executables and a mappability file in bigWig format are required when this option is on. Currently, Bowtie2 is not supported for prior-enhanced RSEM. (Default: off) + +--mappability-bigwig-file +Full path to a whole-genome mappability file in bigWig format. This file is required for running prior-enhanced RSEM. It is used for selecting a training set of isoforms for prior-learning. This file can be either downloaded from UCSC Genome Browser or generated by GEM (Derrien et al., 2012, PLoS One). (Default: "") + +DESCRIPTION +This program extracts/preprocesses the reference sequences for RSEM and prior-enhanced RSEM. It can optionally build Bowtie indices (with '--bowtie' option) and/or Bowtie 2 indices (with '--bowtie2' option) using their default parameters. It can also optionally build STAR indices (with '--star' option) using parameters from ENCODE3's STAR-RSEM pipeline. For prior-enhanced RSEM, it can build Bowtie genomic indices and select training set isoforms (with options '--prep-pRSEM' and '--mappability-bigwig-file '). If an alternative aligner is to be used, indices for that particular aligner can be built from either 'reference_name.idx.fa' or 'reference_name.n2g.idx.fa' (see OUTPUT for details). This program is used in conjunction with the 'rsem-calculate-expression' program. 
+ +OUTPUT +This program will generate 'reference_name.grp', 'reference_name.ti', 'reference_name.transcripts.fa', 'reference_name.seq', 'reference_name.chrlist' (if '--gtf' is on), 'reference_name.idx.fa', 'reference_name.n2g.idx.fa', optional Bowtie/Bowtie 2 index files, and optional STAR index files. + +'reference_name.grp', 'reference_name.ti', 'reference_name.seq', and 'reference_name.chrlist' are used by RSEM internally. + +'reference_name.transcripts.fa' contains the extracted reference transcripts in Multi-FASTA format. Poly(A) tails are not added and it may contain lower case bases in its sequences if the corresponding genomic regions are soft-masked. + +'reference_name.idx.fa' and 'reference_name.n2g.idx.fa' are used by aligners to build their own indices. In these two files, all sequence bases are converted into upper case. In addition, poly(A) tails are added if '--polyA' option is set. The only difference between 'reference_name.idx.fa' and 'reference_name.n2g.idx.fa' is that 'reference_name.n2g.idx.fa' in addition converts all 'N' characters to 'G' characters. This conversion is in particular desired for aligners (e.g. Bowtie) that do not allow reads to overlap with 'N' characters in the reference sequences. Otherwise, 'reference_name.idx.fa' should be used to build the aligner's index files. RSEM uses 'reference_name.idx.fa' to build Bowtie 2 indices and 'reference_name.n2g.idx.fa' to build Bowtie indices. For visualizing the transcript-coordinate-based BAM files generated by RSEM in IGV, 'reference_name.idx.fa' should be imported as a "genome" (see Visualization section in README.md for details). + +If the whole genome is indexed for prior-enhanced RSEM, all the index files will be generated with prefix as 'reference_name_prsem'. Selected isoforms for training set are listed in the file 'reference_name_prsem.training_tr_crd' + +EXAMPLES +1) Suppose we have mouse RNA-Seq data and want to use the UCSC mm9 version of the mouse genome. 
We have downloaded the UCSC Genes transcript annotations in GTF format (as mm9.gtf) using the Table Browser and the knownIsoforms.txt file for mm9 from the UCSC Downloads. We also have all chromosome files for mm9 in the directory '/data/mm9'. We want to put the generated reference files under '/ref' with name 'mouse_0'. We do not add any poly(A) tails. Please note that GTF files generated from UCSC's Table Browser do not contain isoform-gene relationship information. For the UCSC Genes annotation, this information can be obtained from the knownIsoforms.txt file. Suppose we want to build Bowtie indices and Bowtie executables are found in '/sw/bowtie'. + +There are two ways to write the command: + + rsem-prepare-reference --gtf mm9.gtf \ + --transcript-to-gene-map knownIsoforms.txt \ + --bowtie \ + --bowtie-path /sw/bowtie \ + /data/mm9/chr1.fa,/data/mm9/chr2.fa,...,/data/mm9/chrM.fa \ + /ref/mouse_0 +OR + + rsem-prepare-reference --gtf mm9.gtf \ + --transcript-to-gene-map knownIsoforms.txt \ + --bowtie \ + --bowtie-path /sw/bowtie \ + /data/mm9 \ + /ref/mouse_0 +2) Suppose we also want to build Bowtie 2 indices in the above example and Bowtie 2 executables are found in '/sw/bowtie2', the command will be: + + rsem-prepare-reference --gtf mm9.gtf \ + --transcript-to-gene-map knownIsoforms.txt \ + --bowtie \ + --bowtie-path /sw/bowtie \ + --bowtie2 \ + --bowtie2-path /sw/bowtie2 \ + /data/mm9 \ + /ref/mouse_0 +3) Suppose we want to build STAR indices in the above example and save index files under '/ref' with name 'mouse_0'. 
Assuming STAR executable is '/sw/STAR', the command will be: + + rsem-prepare-reference --gtf mm9.gtf \ + --transcript-to-gene-map knownIsoforms.txt \ + --star \ + --star-path /sw/STAR \ + -p 8 \ + /data/mm9/chr1.fa,/data/mm9/chr2.fa,...,/data/mm9/chrM.fa \ + /ref/mouse_0 +OR + + rsem-prepare-reference --gtf mm9.gtf \ + --transcript-to-gene-map knownIsoforms.txt \ + --star \ + --star-path /sw/STAR \ + -p 8 \ + /data/mm9 + /ref/mouse_0 +STAR genome index files will be saved under '/ref/'. + +4) Suppose we want to prepare references for prior-enhanced RSEM in the above example. In this scenario, both STAR and Bowtie are required to build genomic indices - STAR for RNA-seq reads and Bowtie for ChIP-seq reads. Assuming their executables are under '/sw/STAR' and '/sw/Bowtie', respectively. Also, assuming the mappability file for mouse genome is '/data/mm9.bigWig'. The command will be: + + rsem-prepare-reference --gtf mm9.gtf \ + --transcript-to-gene-map knownIsoforms.txt \ + --star \ + --star-path /sw/STAR \ + -p 8 \ + --prep-pRSEM \ + --bowtie-path /sw/Bowtie \ + --mappability-bigwig-file /data/mm9.bigWig \ + /data/mm9/chr1.fa,/data/mm9/chr2.fa,...,/data/mm9/chrM.fa \ + /ref/mouse_0 +OR + + rsem-prepare-reference --gtf mm9.gtf \ + --transcript-to-gene-map knownIsoforms.txt \ + --star \ + --star-path /sw/STAR \ + -p 8 \ + --prep-pRSEM \ + --bowtie-path /sw/Bowtie \ + --mappability-bigwig-file /data/mm9.bigWig \ + /data/mm9 + /ref/mouse_0 +Both STAR and Bowtie's index files will be saved under '/ref/'. Bowtie files will have name prefix 'mouse_0_prsem' + +5) Suppose we only have transcripts from EST tags stored in 'mm9.fasta' and isoform-gene information stored in 'mapping.txt'. We want to add 125bp long poly(A) tails to all transcripts. The reference_name is set as 'mouse_125'. 
In addition, we do not want to build Bowtie/Bowtie 2 indices, and will use an alternative aligner to align reads against either 'mouse_125.idx.fa' or 'mouse_125.idx.n2g.fa':
+
+ rsem-prepare-reference --transcript-to-gene-map mapping.txt \
+                        --polyA
+                        mm9.fasta \
+                        mouse_125
\ No newline at end of file
diff --git a/src/rsem/rsem_prepare_reference/script.sh b/src/rsem/rsem_prepare_reference/script.sh
new file mode 100644
index 00000000..806804d8
--- /dev/null
+++ b/src/rsem/rsem_prepare_reference/script.sh
@@ -0,0 +1,41 @@
+#!/bin/bash
+
+set -eo pipefail
+
+# Viash passes boolean_true flags as the literal strings "true"/"false";
+# unset the "false" ones so the ${var:+...} expansions below emit nothing.
+unset_if_false=( par_gff3_genes_as_transcripts par_polyA par_bowtie par_bowtie2 par_star par_hisat2_hca par_quiet par_prep_pRSEM )
+
+for par in "${unset_if_false[@]}"; do
+  test_val="${!par}"
+  [[ "$test_val" == "false" ]] && unset "$par"
+done
+
+# Viash 'multiple' values are ';'-separated; RSEM expects comma-separated lists.
+par_reference_fasta_files=$(echo "$par_reference_fasta_files" | tr ';' ',')
+par_gff3_rna_patterns=$(echo "$par_gff3_rna_patterns" | tr ';' ',')
+par_trusted_sources=$(echo "$par_trusted_sources" | tr ';' ',')
+
+rsem-prepare-reference \
+    ${par_gtf:+--gtf "${par_gtf}"} \
+    ${par_gff3:+--gff3 "${par_gff3}"} \
+    ${par_gff3_rna_patterns:+--gff3-RNA-patterns "${par_gff3_rna_patterns}"} \
+    ${par_gff3_genes_as_transcripts:+--gff3-genes-as-transcripts} \
+    ${par_trusted_sources:+--trusted-sources "${par_trusted_sources}"} \
+    ${par_transcript_to_gene_map:+--transcript-to-gene-map "${par_transcript_to_gene_map}"} \
+    ${par_allele_to_gene_map:+--allele-to-gene-map "${par_allele_to_gene_map}"} \
+    ${par_polyA:+--polyA} \
+    ${par_polyA_length:+--polyA-length "${par_polyA_length}"} \
+    ${par_no_polyA_subset:+--no-polyA-subset "${par_no_polyA_subset}"} \
+    ${par_bowtie:+--bowtie} \
+    ${par_bowtie2:+--bowtie2} \
+    ${par_star:+--star} \
+    ${par_star_sjdboverhang:+--star-sjdboverhang "${par_star_sjdboverhang}"} \
+    ${par_hisat2_hca:+--hisat2-hca} \
+    ${par_quiet:+--quiet} \
+    
${par_prep_pRSEM:+--prep-pRSEM} \
+    ${par_mappability_bigwig_file:+--mappability-bigwig-file "${par_mappability_bigwig_file}"} \
+    ${meta_cpus:+--num-threads "${meta_cpus}"} \
+    "${par_reference_fasta_files}" \
+    "${par_reference_name}"
+
+# RSEM writes all reference files next to the reference_name prefix;
+# collect them into the requested output directory.
+mkdir -p "${par_output}"
+mv "${par_reference_name}".* "${par_output}/"
diff --git a/src/rsem/rsem_prepare_reference/test.sh b/src/rsem/rsem_prepare_reference/test.sh
new file mode 100644
index 00000000..b38dd0a9
--- /dev/null
+++ b/src/rsem/rsem_prepare_reference/test.sh
@@ -0,0 +1,36 @@
+#!/bin/bash
+
+set -eo pipefail
+
+echo ">>> Testing $meta_functionality_name"
+
+cat > genome.fasta <<'EOF'
+>Sheila
+GCTAGCTCAGAAAAaaaNNN
+EOF
+
+echo ">>> Prepare RSEM reference without gene annotations"
+"$meta_executable" \
+  --reference_fasta_files genome.fasta \
+  --reference_name test \
+  --output RSEM_index
+
+echo ">>> Checking whether output files exist"
+[ ! -d "RSEM_index" ] && echo "RSEM index does not exist!" && exit 1
+[ ! -f "RSEM_index/test.grp" ] && echo "test.grp does not exist!" && exit 1
+[ ! -f "RSEM_index/test.n2g.idx.fa" ] && echo "test.n2g.idx.fa does not exist!" && exit 1
+[ ! -f "RSEM_index/test.ti" ] && echo "test.ti does not exist!" && exit 1
+[ ! -f "RSEM_index/test.idx.fa" ] && echo "test.idx.fa does not exist!" && exit 1
+[ ! -f "RSEM_index/test.seq" ] && echo "test.seq does not exist!" && exit 1
+[ ! -f "RSEM_index/test.transcripts.fa" ] && echo "test.transcripts.fa does not exist!" && exit 1
+
+echo ">>> Checking whether output is correct"
+[ ! -s "RSEM_index/test.grp" ] && echo "test.grp is empty!" && exit 1
+[ ! -s "RSEM_index/test.ti" ] && echo "test.ti is empty!" && exit 1
+[ ! -s "RSEM_index/test.seq" ] && echo "test.seq is empty!" && exit 1
+grep -q "GCTAGCTCAGAAAAaaaNNN" "RSEM_index/test.transcripts.fa" || { echo "The content of file 'test.transcripts.fa' seems to be incorrect."
&& exit 1; } +grep -q "GCTAGCTCAGAAAAAAANNN" "RSEM_index/test.idx.fa" || { echo "The content of file 'test.idx.fa' seems to be incorrect." && exit 1; } +grep -q "GCTAGCTCAGAAAAAAAGGG" "RSEM_index/test.n2g.idx.fa" || { echo "The content of file 'test.n2g.idx.fa' seems to be incorrect." && exit 1; } + +echo "All tests succeeded!" +exit 0 From 2d0a990cac4bf2d194ba9c610e00cc99b1c2c4c5 Mon Sep 17 00:00:00 2001 From: Theodoro Gasperin Terra Camargo <98555209+tgaspe@users.noreply.github.com> Date: Mon, 2 Sep 2024 14:41:55 +0200 Subject: [PATCH 04/28] Bedtools merge (#118) * Initial Commit * Script file * strand option tests * -bed option test * distance option test * all test implemented * Update CHANGELOG.md * Update config.vsh.yaml * adding more links * exit on error * suggested changes * working on suggested changes --------- Co-authored-by: Jakub Majercik <57993790+jakubmajercik@users.noreply.github.com> --- CHANGELOG.md | 1 + src/bedtools/bedtools_merge/config.vsh.yaml | 160 +++++++++++++ src/bedtools/bedtools_merge/help.txt | 85 +++++++ src/bedtools/bedtools_merge/script.sh | 35 +++ src/bedtools/bedtools_merge/test.sh | 222 ++++++++++++++++++ .../bedtools_merge/test_data/feature.bam | Bin 0 -> 287 bytes 6 files changed, 503 insertions(+) create mode 100644 src/bedtools/bedtools_merge/config.vsh.yaml create mode 100644 src/bedtools/bedtools_merge/help.txt create mode 100644 src/bedtools/bedtools_merge/script.sh create mode 100644 src/bedtools/bedtools_merge/test.sh create mode 100644 src/bedtools/bedtools_merge/test_data/feature.bam diff --git a/CHANGELOG.md b/CHANGELOG.md index 3e9f40fc..8c1af805 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -29,6 +29,7 @@ * `bedtools`: - `bedtools/bedtools_intersect`: Allows one to screen for overlaps between two sets of genomic features (PR #94). - `bedtools/bedtools_sort`: Sorts a feature file (bed/gff/vcf) by chromosome and other criteria (PR #98). 
+ - `bedtools/bedtools_merge`: Merges overlapping BED/GFF/VCF entries into a single interval (PR #118). - `bedtools/bedtools_bamtofastq`: Convert BAM alignments to FASTQ files (PR #101). - `bedtools/bedtools_bedtobam`: Converts genomic feature records (bed/gff/vcf) to BAM format (PR #111). diff --git a/src/bedtools/bedtools_merge/config.vsh.yaml b/src/bedtools/bedtools_merge/config.vsh.yaml new file mode 100644 index 00000000..45e4a01d --- /dev/null +++ b/src/bedtools/bedtools_merge/config.vsh.yaml @@ -0,0 +1,160 @@ +name: bedtools_merge +namespace: bedtools +description: | + Merges overlapping BED/GFF/VCF entries into a single interval. +links: + documentation: https://bedtools.readthedocs.io/en/latest/content/tools/merge.html + repository: https://github.com/arq5x/bedtools2 + homepage: https://bedtools.readthedocs.io/en/latest/# + issue_tracker: https://github.com/arq5x/bedtools2/issues +references: + doi: 10.1093/bioinformatics/btq033 +license: MIT +requirements: + commands: [bedtools] +authors: + - __merge__: /src/_authors/theodoro_gasperin.yaml + roles: [ author, maintainer ] + +argument_groups: + - name: Inputs + arguments: + - name: --input + alternatives: -i + type: file + description: Input file (BED/GFF/VCF) to be merged. + required: true + + - name: Outputs + arguments: + - name: --output + type: file + direction: output + description: Output merged file BED to be written. + required: true + + - name: Options + arguments: + - name: --strand + alternatives: -s + type: boolean_true + description: | + Force strandedness. That is, only merge features + that are on the same strand. + - By default, merging is done without respect to strand. + + - name: --specific_strand + alternatives: -S + type: string + choices: ["+", "-"] + description: | + Force merge for one specific strand only. + Follow with + or - to force merge from only + the forward or reverse strand, respectively. + - By default, merging is done without respect to strand. 
+ + - name: --distance + alternatives: -d + type: integer + description: | + Maximum distance between features allowed for features + to be merged. + - Def. 0. That is, overlapping & book-ended features are merged. + - (INTEGER) + - Note: negative values enforce the number of b.p. required for overlap. + + - name: --columns + alternatives: -c + type: integer + description: | + Specify columns from the B file to map onto intervals in A. + Default: 5. + Multiple columns can be specified in a comma-delimited list. + + - name: --operation + alternatives: -o + type: string + description: | + Specify the operation that should be applied to -c. + Valid operations: + sum, min, max, absmin, absmax, + mean, median, mode, antimode + stdev, sstdev + collapse (i.e., print a delimited list (duplicates allowed)), + distinct (i.e., print a delimited list (NO duplicates allowed)), + distinct_sort_num (as distinct, sorted numerically, ascending), + distinct_sort_num_desc (as distinct, sorted numerically, desscending), + distinct_only (delimited list of only unique values), + count + count_distinct (i.e., a count of the unique values in the column), + first (i.e., just the first value in the column), + last (i.e., just the last value in the column), + Default: sum + Multiple operations can be specified in a comma-delimited list. + + If there is only column, but multiple operations, all operations will be + applied on that column. Likewise, if there is only one operation, but + multiple columns, that operation will be applied to all columns. + Otherwise, the number of columns must match the the number of operations, + and will be applied in respective order. + E.g., "-c 5,4,6 -o sum,mean,count" will give the sum of column 5, + the mean of column 4, and the count of column 6. + The order of output columns will match the ordering given in the command. + + - name: --delimiter + alternatives: -delim + type: string + description: | + Specify a custom delimiter for the collapse operations. 
+ example: "|" + default: "," + + - name: --precision + alternatives: -prec + type: integer + description: | + Sets the decimal precision for output (Default: 5). + + - name: --bed + type: boolean_true + description: | + If using BAM input, write output as BED. + + - name: --header + type: boolean_true + description: | + Print the header from the A file prior to results. + + - name: --no_buffer + alternatives: -nobuf + type: boolean_true + description: | + Disable buffered output. Using this option will cause each line + of output to be printed as it is generated, rather than saved + in a buffer. This will make printing large output files + noticeably slower, but can be useful in conjunction with + other software tools and scripts that need to process one + line of bedtools output at a time. + +resources: + - type: bash_script + path: script.sh + +test_resources: + - type: bash_script + path: test.sh + - path: test_data + +engines: + - type: docker + image: debian:stable-slim + setup: + - type: apt + packages: [bedtools, procps] + - type: docker + run: | + echo "bedtools: \"$(bedtools --version | sed -n 's/^bedtools //p')\"" > /var/software_versions.txt + +runners: + - type: executable + - type: nextflow \ No newline at end of file diff --git a/src/bedtools/bedtools_merge/help.txt b/src/bedtools/bedtools_merge/help.txt new file mode 100644 index 00000000..bc78fc67 --- /dev/null +++ b/src/bedtools/bedtools_merge/help.txt @@ -0,0 +1,85 @@ +```bash +bedtools merge +``` + +Tool: bedtools merge (aka mergeBed) +Version: v2.30.0 +Summary: Merges overlapping BED/GFF/VCF entries into a single interval. + +Usage: bedtools merge [OPTIONS] -i + +Options: + -s Force strandedness. That is, only merge features + that are on the same strand. + - By default, merging is done without respect to strand. + + -S Force merge for one specific strand only. + Follow with + or - to force merge from only + the forward or reverse strand, respectively. 
+ - By default, merging is done without respect to strand. + + -d Maximum distance between features allowed for features + to be merged. + - Def. 0. That is, overlapping & book-ended features are merged. + - (INTEGER) + - Note: negative values enforce the number of b.p. required for overlap. + + -c Specify columns from the B file to map onto intervals in A. + Default: 5. + Multiple columns can be specified in a comma-delimited list. + + -o Specify the operation that should be applied to -c. + Valid operations: + sum, min, max, absmin, absmax, + mean, median, mode, antimode + stdev, sstdev + collapse (i.e., print a delimited list (duplicates allowed)), + distinct (i.e., print a delimited list (NO duplicates allowed)), + distinct_sort_num (as distinct, sorted numerically, ascending), + distinct_sort_num_desc (as distinct, sorted numerically, desscending), + distinct_only (delimited list of only unique values), + count + count_distinct (i.e., a count of the unique values in the column), + first (i.e., just the first value in the column), + last (i.e., just the last value in the column), + Default: sum + Multiple operations can be specified in a comma-delimited list. + + If there is only column, but multiple operations, all operations will be + applied on that column. Likewise, if there is only one operation, but + multiple columns, that operation will be applied to all columns. + Otherwise, the number of columns must match the the number of operations, + and will be applied in respective order. + E.g., "-c 5,4,6 -o sum,mean,count" will give the sum of column 5, + the mean of column 4, and the count of column 6. + The order of output columns will match the ordering given in the command. + + + -delim Specify a custom delimiter for the collapse operations. + - Example: -delim "|" + - Default: ",". + + -prec Sets the decimal precision for output (Default: 5) + + -bed If using BAM input, write output as BED. + + -header Print the header from the A file prior to results. 
+ + -nobuf Disable buffered output. Using this option will cause each line + of output to be printed as it is generated, rather than saved + in a buffer. This will make printing large output files + noticeably slower, but can be useful in conjunction with + other software tools and scripts that need to process one + line of bedtools output at a time. + + -iobuf Specify amount of memory to use for input buffer. + Takes an integer argument. Optional suffixes K/M/G supported. + Note: currently has no effect with compressed files. + +Notes: + (1) The input file (-i) file must be sorted by chrom, then start. + + + + +***** ERROR: No input file given. Exiting. ***** diff --git a/src/bedtools/bedtools_merge/script.sh b/src/bedtools/bedtools_merge/script.sh new file mode 100644 index 00000000..db50dd83 --- /dev/null +++ b/src/bedtools/bedtools_merge/script.sh @@ -0,0 +1,35 @@ +#!/bin/bash + +## VIASH START +## VIASH END + +# Exit on error +set -eo pipefail + +# Unset parameters +unset_if_false=( + par_strand + par_bed + par_header + par_no_buffer +) + +for par in ${unset_if_false[@]}; do + test_val="${!par}" + [[ "$test_val" == "false" ]] && unset $par +done + +# Execute bedtools merge with the provided arguments +bedtools merge \ + ${par_strand:+-s} \ + ${par_specific_strand:+-S "$par_specific_strand"} \ + ${par_bed:+-bed} \ + ${par_header:+-header} \ + ${par_no_buffer:+-nobuf} \ + ${par_distance:+-d "$par_distance"} \ + ${par_columns:+-c "$par_columns"} \ + ${par_operation:+-o "$par_operation"} \ + ${par_delimiter:+-delim "$par_delimiter"} \ + ${par_precision:+-prec "$par_precision"} \ + -i "$par_input" \ + > "$par_output" diff --git a/src/bedtools/bedtools_merge/test.sh b/src/bedtools/bedtools_merge/test.sh new file mode 100644 index 00000000..e2b46c15 --- /dev/null +++ b/src/bedtools/bedtools_merge/test.sh @@ -0,0 +1,222 @@ +#!/bin/bash + +# exit on error +set -eo pipefail + +## VIASH START +meta_executable="target/executable/bedtools/bedtools_sort/bedtools_merge" 
+meta_resources_dir="src/bedtools/bedtools_merge" +## VIASH END + +# directory of the bam file +test_data="$meta_resources_dir/test_data" + +############################################# +# helper functions +assert_file_exists() { + [ -f "$1" ] || { echo "File '$1' does not exist" && exit 1; } +} +assert_file_not_empty() { + [ -s "$1" ] || { echo "File '$1' is empty but shouldn't be" && exit 1; } +} +assert_file_contains() { + grep -q "$2" "$1" || { echo "File '$1' does not contain '$2'" && exit 1; } +} +assert_identical_content() { + diff -a "$2" "$1" \ + || (echo "Files are not identical!" && exit 1) +} +############################################# + +# Create directories for tests +echo "Creating Test Data..." +TMPDIR=$(mktemp -d "$meta_temp_dir/XXXXXX") +function clean_up { + [[ -d "$TMPDIR" ]] && rm -r "$TMPDIR" +} +trap clean_up EXIT + +# Create and populate example files +printf "chr1\t100\t200\nchr1\t150\t250\nchr1\t300\t400\n" > "$TMPDIR/featureA.bed" +printf "chr1\t100\t200\ta1\t1\t+\nchr1\t180\t250\ta2\t2\t+\nchr1\t250\t500\ta3\t3\t-\nchr1\t501\t1000\ta4\t4\t+\n" > "$TMPDIR/featureB.bed" +printf "chr1\t100\t200\ta1\t1.9\t+\nchr1\t180\t250\ta2\t2.5\t+\nchr1\t250\t500\ta3\t3.3\t-\nchr1\t501\t1000\ta4\t4\t+\n" > "$TMPDIR/feature_precision.bed" + +# Create and populate feature.gff file +printf "##gff-version 3\n" > "$TMPDIR/feature.gff" +printf "chr1\t.\tgene\t1000\t2000\t.\t+\t.\tID=gene1;Name=Gene1\n" >> "$TMPDIR/feature.gff" +printf "chr1\t.\texon\t1000\t1200\t.\t+\t.\tID=exon1;Parent=transcript1\n" >> "$TMPDIR/feature.gff" +printf "chr1\t.\tCDS\t1000\t1200\t.\t+\t0\tID=cds1;Parent=transcript1\n" >> "$TMPDIR/feature.gff" +printf "chr1\t.\tCDS\t1500\t1700\t.\t+\t2\tID=cds2;Parent=transcript1\n" >> "$TMPDIR/feature.gff" +printf "chr2\t.\texon\t1500\t1700\t.\t+\t.\tID=exon2;Parent=transcript1\n" >> "$TMPDIR/feature.gff" +printf "chr3\t.\tmRNA\t1000\t2000\t.\t+\t.\tID=transcript1;Parent=gene1\n" >> "$TMPDIR/feature.gff" + +# Create expected output files 
+printf "chr1\t100\t250\nchr1\t300\t400\n" > "$TMPDIR/expected.bed" +printf "chr1\t100\t250\nchr1\t250\t500\nchr1\t501\t1000\n" > "$TMPDIR/expected_strand.bed" +printf "chr1\t100\t250\nchr1\t501\t1000\n" > "$TMPDIR/expected_specific_strand.bed" +printf "chr1\t128\t228\nchr1\t428\t528\n" > "$TMPDIR/expected_bam.bed" +printf "chr1\t100\t400\n" > "$TMPDIR/expected_distance.bed" +printf "chr1\t100\t500\t2\t1\t3\nchr1\t501\t1000\t4\t4\t4\n" > "$TMPDIR/expected_operation.bed" +printf "chr1\t100\t500\ta1|a2|a3\nchr1\t501\t1000\ta4\n" > "$TMPDIR/expected_delim.bed" +printf "chr1\t100\t500\t2.567\nchr1\t501\t1000\t4\n" > "$TMPDIR/expected_precision.bed" +printf "##gff-version 3\nchr1\t999\t2000\nchr2\t1499\t1700\nchr3\t999\t2000\n" > "$TMPDIR/expected_header.bed" + +# Test 1: Default sort on BED file +mkdir "$TMPDIR/test1" && pushd "$TMPDIR/test1" > /dev/null + +echo "> Run bedtools_merge on BED file" +"$meta_executable" \ + --input "../featureA.bed" \ + --output "output.bed" + +# # checks +assert_file_exists "output.bed" +assert_file_not_empty "output.bed" +assert_identical_content "output.bed" "../expected.bed" +echo "- test1 succeeded -" + +popd > /dev/null + +# Test 2: strand option +mkdir "$TMPDIR/test2" && pushd "$TMPDIR/test2" > /dev/null + +echo "> Run bedtools_merge on BED file with strand option" +"$meta_executable" \ + --input "../featureB.bed" \ + --output "output.bed" \ + --strand + +# checks +assert_file_exists "output.bed" +assert_file_not_empty "output.bed" +assert_identical_content "output.bed" "../expected_strand.bed" +echo "- test2 succeeded -" + +popd > /dev/null + +# Test 3: specific strand option +mkdir "$TMPDIR/test3" && pushd "$TMPDIR/test3" > /dev/null + +echo "> Run bedtools_merge on BED file with specific strand option" +"$meta_executable" \ + --input "../featureB.bed" \ + --output "output.bed" \ + --specific_strand "+" + +# checks +assert_file_exists "output.bed" +assert_file_not_empty "output.bed" +assert_identical_content "output.bed" 
"../expected_specific_strand.bed" +echo "- test3 succeeded -" + +popd > /dev/null + +# Test 4: BED option +mkdir "$TMPDIR/test4" && pushd "$TMPDIR/test4" > /dev/null + +echo "> Run bedtools_merge on BAM file with BED option" +"$meta_executable" \ + --input "$test_data/feature.bam" \ + --output "output.bed" \ + --bed + +# checks +assert_file_exists "output.bed" +assert_file_not_empty "output.bed" +assert_identical_content "output.bed" "../expected_bam.bed" +echo "- test4 succeeded -" + +popd > /dev/null + +# Test 5: distance option +mkdir "$TMPDIR/test5" && pushd "$TMPDIR/test5" > /dev/null + +echo "> Run bedtools_merge on BED file with distance option" +"$meta_executable" \ + --input "../featureA.bed" \ + --output "output.bed" \ + --distance -5 + +# checks +assert_file_exists "output.bed" +assert_file_not_empty "output.bed" +assert_identical_content "output.bed" "../expected.bed" +echo "- test5 succeeded -" + +popd > /dev/null + +# Test 6: columns option & operation option +mkdir "$TMPDIR/test6" && pushd "$TMPDIR/test6" > /dev/null + +echo "> Run bedtools_merge on BED file with columns & operation options" +"$meta_executable" \ + --input "../featureB.bed" \ + --output "output.bed" \ + --columns 5 \ + --operation "mean,min,max" + +# checks +assert_file_exists "output.bed" +assert_file_not_empty "output.bed" +assert_identical_content "output.bed" "../expected_operation.bed" +echo "- test6 succeeded -" + +popd > /dev/null + +# Test 7: delimeter option +mkdir "$TMPDIR/test7" && pushd "$TMPDIR/test7" > /dev/null + +echo "> Run bedtools_merge on BED file with delimeter option" +"$meta_executable" \ + --input "../featureB.bed" \ + --output "output.bed" \ + --columns 4 \ + --operation "collapse" \ + --delimiter "|" + +# checks +assert_file_exists "output.bed" +assert_file_not_empty "output.bed" +assert_identical_content "output.bed" "../expected_delim.bed" +echo "- test7 succeeded -" + +popd > /dev/null + +# Test 8: precision option +mkdir "$TMPDIR/test8" && pushd 
"$TMPDIR/test8" > /dev/null + +echo "> Run bedtools_merge on BED file with precision option" +"$meta_executable" \ + --input "../feature_precision.bed" \ + --output "output.bed" \ + --columns 5 \ + --operation "mean" \ + --precision 4 + +# checks +assert_file_exists "output.bed" +assert_file_not_empty "output.bed" +assert_identical_content "output.bed" "../expected_precision.bed" +echo "- test8 succeeded -" + +popd > /dev/null + +# Test 9: header option +mkdir "$TMPDIR/test9" && pushd "$TMPDIR/test9" > /dev/null + +echo "> Run bedtools_merge on GFF file with header option" +"$meta_executable" \ + --input "../feature.gff" \ + --output "output.gff" \ + --header + +# checks +assert_file_exists "output.gff" +assert_file_not_empty "output.gff" +assert_identical_content "output.gff" "../expected_header.bed" +echo "- test9 succeeded -" + +popd > /dev/null + +echo "---- All tests succeeded! ----" +exit 0 diff --git a/src/bedtools/bedtools_merge/test_data/feature.bam b/src/bedtools/bedtools_merge/test_data/feature.bam new file mode 100644 index 0000000000000000000000000000000000000000..3d56a6317ba2f31f1df17f2f4247a9ad8a0585ae GIT binary patch literal 287 zcmb2|=3rp}f&Xj_PR>jWyBLa#zNFqcap1s%2M-TPK1)wsk$yn(O@8RC@Hz1zlV%-y zD)8Xk%a=({pS%*9G=F}u%={^{geJY8GJ`uvIxKBTdd`CM15X8HPDs89>O`kt|Rd`U~p_h--Q&W->Gqci?Qd*Xrk(h0y?ChN!tn96l>NevPkFIWyj<2t; z?#>yFeQB&o6V~$N9AEd*(efD2)ZkBvGRkb Date: Mon, 2 Sep 2024 14:42:44 +0200 Subject: [PATCH 05/28] Bedtools links (#137) * Initial Commit * Tests * Adding help file * Adding more description * Update test.sh * Update help.txt * Update CHANGELOG.md --- CHANGELOG.md | 1 + src/bedtools/bedtools_links/config.vsh.yaml | 91 +++++++++++++++++++ src/bedtools/bedtools_links/help.txt | 25 ++++++ src/bedtools/bedtools_links/script.sh | 14 +++ src/bedtools/bedtools_links/test.sh | 98 +++++++++++++++++++++ 5 files changed, 229 insertions(+) create mode 100644 src/bedtools/bedtools_links/config.vsh.yaml create mode 100644 src/bedtools/bedtools_links/help.txt 
create mode 100644 src/bedtools/bedtools_links/script.sh create mode 100644 src/bedtools/bedtools_links/test.sh diff --git a/CHANGELOG.md b/CHANGELOG.md index 8c1af805..6dda7ab4 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -32,6 +32,7 @@ - `bedtools/bedtools_merge`: Merges overlapping BED/GFF/VCF entries into a single interval (PR #118). - `bedtools/bedtools_bamtofastq`: Convert BAM alignments to FASTQ files (PR #101). - `bedtools/bedtools_bedtobam`: Converts genomic feature records (bed/gff/vcf) to BAM format (PR #111). + - `bedtools/bedtools_links`: Creates an HTML file with links to an instance of the UCSC Genome Browser for all features / intervals in a (bed/gff/vcf) file (PR #137). * `qualimap/qualimap_rnaseq`: RNA-seq QC analysis using qualimap (PR #74). diff --git a/src/bedtools/bedtools_links/config.vsh.yaml b/src/bedtools/bedtools_links/config.vsh.yaml new file mode 100644 index 00000000..b4e43cd3 --- /dev/null +++ b/src/bedtools/bedtools_links/config.vsh.yaml @@ -0,0 +1,91 @@ +name: bedtools_links +namespace: bedtools +description: | + Creates an HTML file with links to an instance of the UCSC Genome Browser for all features / intervals in a file. + This is useful for cases when one wants to manually inspect through a large set of annotations or features. +keywords: [Links, BED, GFF, VCF] +links: + documentation: https://bedtools.readthedocs.io/en/latest/content/tools/links.html + repository: https://github.com/arq5x/bedtools2 + homepage: https://bedtools.readthedocs.io/en/latest/# + issue_tracker: https://github.com/arq5x/bedtools2/issues +references: + doi: 10.1093/bioinformatics/btq033 +license: MIT +requirements: + commands: [bedtools] +authors: + - __merge__: /src/_authors/theodoro_gasperin.yaml + roles: [ author, maintainer ] + +argument_groups: + - name: Inputs + arguments: + - name: --input + alternatives: -i + type: file + description: Input file (bed/gff/vcf). 
+ required: true + + - name: Outputs + arguments: + - name: --output + alternatives: -o + type: file + direction: output + description: Output HTML file to be written. + + - name: Options + description: | + By default, the links created will point to human (hg18) UCSC browser. + If you have a local mirror, you can override this behavior by supplying + the -base, -org, and -db options. + + For example, if the URL of your local mirror for mouse MM9 is called: + http://mymirror.myuniversity.edu, then you would use the following: + --base_url http://mymirror.myuniversity.edu + --organism mouse + --database mm9 + arguments: + - name: --base_url + alternatives: -base + type: string + description: | + The “basename” for the UCSC browser. + default: http://genome.ucsc.edu + + - name: --organism + alternatives: -org + type: string + description: | + The organism (e.g. mouse, human). + default: human + + - name: --database + alternatives: -db + type: string + description: | + The genome build. + default: hg18 + +resources: + - type: bash_script + path: script.sh + +test_resources: + - type: bash_script + path: test.sh + +engines: + - type: docker + image: debian:stable-slim + setup: + - type: apt + packages: [bedtools, procps] + - type: docker + run: | + echo "bedtools: \"$(bedtools --version | sed -n 's/^bedtools //p')\"" > /var/software_versions.txt + +runners: + - type: executable + - type: nextflow diff --git a/src/bedtools/bedtools_links/help.txt b/src/bedtools/bedtools_links/help.txt new file mode 100644 index 00000000..d848d989 --- /dev/null +++ b/src/bedtools/bedtools_links/help.txt @@ -0,0 +1,25 @@ +``` +bedtools links -h +``` + +Tool: bedtools links (aka linksBed) +Version: v2.30.0 +Summary: Creates HTML links to an UCSC Genome Browser from a feature file. + +Usage: bedtools links [OPTIONS] -i > out.html + +Options: + -base The browser basename. Default: http://genome.ucsc.edu + -org The organism. Default: human + -db The build. 
Default: hg18 + +Example: + By default, the links created will point to human (hg18) UCSC browser. + If you have a local mirror, you can override this behavior by supplying + the -base, -org, and -db options. + + For example, if the URL of your local mirror for mouse MM9 is called: + http://mymirror.myuniversity.edu, then you would use the following: + -base http://mymirror.myuniversity.edu + -org mouse + -db mm9 diff --git a/src/bedtools/bedtools_links/script.sh b/src/bedtools/bedtools_links/script.sh new file mode 100644 index 00000000..b8ee9a56 --- /dev/null +++ b/src/bedtools/bedtools_links/script.sh @@ -0,0 +1,14 @@ +#!/bin/bash + +## VIASH START +## VIASH END + +set -eo pipefail + +# Execute bedtools links +bedtools links \ + ${par_base_url:+-base "$par_base_url"} \ + ${par_organism:+-org "$par_organism"} \ + ${par_database:+-db "$par_database"} \ + -i "$par_input" \ + > "$par_output" diff --git a/src/bedtools/bedtools_links/test.sh b/src/bedtools/bedtools_links/test.sh new file mode 100644 index 00000000..d79cbd6c --- /dev/null +++ b/src/bedtools/bedtools_links/test.sh @@ -0,0 +1,98 @@ +#!/bin/bash + +# exit on error +set -eo pipefail + +############################################# +# helper functions +assert_file_exists() { + [ -f "$1" ] || { echo "File '$1' does not exist" && exit 1; } +} +assert_file_not_empty() { + [ -s "$1" ] || { echo "File '$1' is empty but shouldn't be" && exit 1; } +} +assert_file_contains() { + grep -q "$2" "$1" || { echo "File '$1' does not contain '$2'" && exit 1; } +} +assert_identical_content() { + diff -a "$2" "$1" \ + || (echo "Files are not identical!" && exit 1) +} +############################################# + +# Create directories for tests +echo "Creating Test Data..." 
+TMPDIR=$(mktemp -d "$meta_temp_dir/XXXXXX") +function clean_up { + [[ -d "$TMPDIR" ]] && rm -r "$TMPDIR" +} +trap clean_up EXIT + +# Create test data +cat < "$TMPDIR/genes.bed" +chr21 9928613 10012791 uc002yip.1 0 - +chr21 9928613 10012791 uc002yiq.1 0 - +chr21 9928613 10012791 uc002yir.1 0 - +chr21 9928613 10012791 uc010gkv.1 0 - +chr21 9928613 10061300 uc002yis.1 0 - +chr21 10042683 10120796 uc002yit.1 0 - +chr21 10042683 10120808 uc002yiu.1 0 - +chr21 10079666 10120808 uc002yiv.1 0 - +chr21 10080031 10081687 uc002yiw.1 0 - +chr21 10081660 10120796 uc002yix.2 0 - +EOF + +# Test 1: Default Use +mkdir "$TMPDIR/test1" && pushd "$TMPDIR/test1" > /dev/null + +echo "> Run bedtools_links on BED file" +"$meta_executable" \ + --input "../genes.bed" \ + --output "genes.html" + +# checks +assert_file_exists "genes.html" +assert_file_not_empty "genes.html" +assert_file_contains "genes.html" "uc002yip.1" +echo "- test1 succeeded -" + +popd > /dev/null + +# Test 2: Base URL +mkdir "$TMPDIR/test2" && pushd "$TMPDIR/test2" > /dev/null + +echo "> Run bedtools_links with base option" +"$meta_executable" \ + --input "../genes.bed" \ + --output "genes.html" \ + --base_url "http://genome.ucsc.edu" + +# checks +assert_file_exists "genes.html" +assert_file_not_empty "genes.html" +assert_file_contains "genes.html" "uc002yip.1" +echo "- test2 succeeded -" + +popd > /dev/null + +# Test 3: Organism and Genome Database Build +mkdir "$TMPDIR/test3" && pushd "$TMPDIR/test3" > /dev/null + +echo "> Run bedtools_links with organism option and genome database build" +"$meta_executable" \ + --input "../genes.bed" \ + --output "genes.html" \ + --base_url "http://genome.ucsc.edu" \ + --organism "mouse" \ + --database "mm9" + +# checks +assert_file_exists "genes.html" +assert_file_not_empty "genes.html" +assert_file_contains "genes.html" "uc002yip.1" +echo "- test3 succeeded -" + +popd > /dev/null + +echo "---- All tests succeeded! 
----" +exit 0 From 2b29a47575db9dbdff8448b287925c25d9a8b01d Mon Sep 17 00:00:00 2001 From: Theodoro Gasperin Terra Camargo <98555209+tgaspe@users.noreply.github.com> Date: Mon, 2 Sep 2024 15:00:09 +0200 Subject: [PATCH 06/28] Bedtools GroupBY (#123) * Initial Commit * Update config.vsh.yaml * config file * script.sh * adding some tests * more test * Update CHANGELOG.md * deleted test_data * bug fix * Update config.vsh.yaml * adding more links * exit on error * $TMPDIR * Update script.sh * Update config.vsh.yaml * Suggested change on column option --------- Co-authored-by: Jakub Majercik <57993790+jakubmajercik@users.noreply.github.com> --- CHANGELOG.md | 2 + src/bedtools/bedtools_groupby/config.vsh.yaml | 155 ++++++++++++++ src/bedtools/bedtools_groupby/help.txt | 93 ++++++++ src/bedtools/bedtools_groupby/script.sh | 36 ++++ src/bedtools/bedtools_groupby/test.sh | 198 ++++++++++++++++++ 5 files changed, 484 insertions(+) create mode 100644 src/bedtools/bedtools_groupby/config.vsh.yaml create mode 100644 src/bedtools/bedtools_groupby/help.txt create mode 100644 src/bedtools/bedtools_groupby/script.sh create mode 100644 src/bedtools/bedtools_groupby/test.sh diff --git a/CHANGELOG.md b/CHANGELOG.md index 6dda7ab4..29fb8cfa 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -29,6 +29,7 @@ * `bedtools`: - `bedtools/bedtools_intersect`: Allows one to screen for overlaps between two sets of genomic features (PR #94). - `bedtools/bedtools_sort`: Sorts a feature file (bed/gff/vcf) by chromosome and other criteria (PR #98). + - `bedtools/bedtools_groupby`: Summarizes a dataset column based upon common column groupings. Akin to the SQL "group by" command (PR #123). - `bedtools/bedtools_merge`: Merges overlapping BED/GFF/VCF entries into a single interval (PR #118). - `bedtools/bedtools_bamtofastq`: Convert BAM alignments to FASTQ files (PR #101). - `bedtools/bedtools_bedtobam`: Converts genomic feature records (bed/gff/vcf) to BAM format (PR #111). 
@@ -38,6 +39,7 @@ * `rsem/rsem_prepare_reference`: Prepare transcript references for RSEM (PR #89). + ## MINOR CHANGES * `busco` components: update BUSCO to `5.7.1` (PR #72). diff --git a/src/bedtools/bedtools_groupby/config.vsh.yaml b/src/bedtools/bedtools_groupby/config.vsh.yaml new file mode 100644 index 00000000..89c4845b --- /dev/null +++ b/src/bedtools/bedtools_groupby/config.vsh.yaml @@ -0,0 +1,155 @@ +name: bedtools_groupby +namespace: bedtools +description: | + Summarizes a dataset column based upon common column groupings. + Akin to the SQL "group by" command. +keywords: [groupby, BED] +links: + documentation: https://bedtools.readthedocs.io/en/latest/content/tools/groupby.html + repository: https://github.com/arq5x/bedtools2 + homepage: https://bedtools.readthedocs.io/en/latest/# + issue_tracker: https://github.com/arq5x/bedtools2/issues +references: + doi: 10.1093/bioinformatics/btq033 +license: MIT +requirements: + commands: [bedtools] +authors: + - __merge__: /src/_authors/theodoro_gasperin.yaml + roles: [ author, maintainer ] + +argument_groups: + - name: Inputs + arguments: + - name: --input + alternatives: -i + type: file + direction: input + description: | + The input BED file to be used. + required: true + example: input_a.bed + + - name: Outputs + arguments: + - name: --output + type: file + direction: output + description: | + The output groupby BED file. + required: true + example: output.bed + + - name: Options + arguments: + - name: --groupby + alternatives: [-g, -grp] + type: string + description: | + Specify the columns (1-based) for the grouping. + The columns must be comma separated. + - Default: 1,2,3 + required: true + + - name: --column + alternatives: [-c, -opCols] + type: integer + description: | + Specify the column (1-based) that should be summarized. + required: true + + - name: --operation + alternatives: [-o, -ops] + type: string + description: | + Specify the operation that should be applied to opCol. 
+ Valid operations: + sum, count, count_distinct, min, max, + mean, median, mode, antimode, + stdev, sstdev (sample standard dev.), + collapse (i.e., print a comma separated list (duplicates allowed)), + distinct (i.e., print a comma separated list (NO duplicates allowed)), + distinct_sort_num (as distinct, but sorted numerically, ascending), + distinct_sort_num_desc (as distinct, but sorted numerically, descending), + concat (i.e., merge values into a single, non-delimited string), + freqdesc (i.e., print desc. list of values:freq) + freqasc (i.e., print asc. list of values:freq) + first (i.e., print first value) + last (i.e., print last value) + + Default value: sum + + If there is only column, but multiple operations, all operations will be + applied on that column. Likewise, if there is only one operation, but + multiple columns, that operation will be applied to all columns. + Otherwise, the number of columns must match the the number of operations, + and will be applied in respective order. + E.g., "-c 5,4,6 -o sum,mean,count" will give the sum of column 5, + the mean of column 4, and the count of column 6. + The order of output columns will match the ordering given in the command. + + - name: --full + type: boolean_true + description: | + Print all columns from input file. The first line in the group is used. + Default: print only grouped columns. + + - name: --inheader + type: boolean_true + description: | + Input file has a header line - the first line will be ignored. + + - name: --outheader + type: boolean_true + description: | + Print header line in the output, detailing the column names. + If the input file has headers (-inheader), the output file + will use the input's column names. + If the input file has no headers, the output file + will use "col_1", "col_2", etc. as the column names. + + - name: --header + type: boolean_true + description: same as '-inheader -outheader'. 
+ + - name: --ignorecase + type: boolean_true + description: | + Group values regardless of upper/lower case. + + - name: --precision + alternatives: -prec + type: integer + description: | + Sets the decimal precision for output. + default: 5 + + - name: --delimiter + alternatives: -delim + type: string + description: | + Specify a custom delimiter for the collapse operations. + example: "|" + default: "," + +resources: + - type: bash_script + path: script.sh + +test_resources: + - type: bash_script + path: test.sh + +engines: + - type: docker + image: debian:stable-slim + setup: + - type: apt + packages: [bedtools, procps] + - type: docker + run: | + echo "bedtools: \"$(bedtools --version | sed -n 's/^bedtools //p')\"" > /var/software_versions.txt + +runners: + - type: executable + - type: nextflow diff --git a/src/bedtools/bedtools_groupby/help.txt b/src/bedtools/bedtools_groupby/help.txt new file mode 100644 index 00000000..a631b4b1 --- /dev/null +++ b/src/bedtools/bedtools_groupby/help.txt @@ -0,0 +1,93 @@ +```bash +bedtools groupby +``` + +Tool: bedtools groupby +Version: v2.30.0 +Summary: Summarizes a dataset column based upon + common column groupings. Akin to the SQL "group by" command. + +Usage: bedtools groupby -g [group_column(s)] -c [op_column(s)] -o [ops] + cat [FILE] | bedtools groupby -g [group_column(s)] -c [op_column(s)] -o [ops] + +Options: + -i Input file. Assumes "stdin" if omitted. + + -g -grp Specify the columns (1-based) for the grouping. + The columns must be comma separated. + - Default: 1,2,3 + + -c -opCols Specify the column (1-based) that should be summarized. + - Required. + + -o -ops Specify the operation that should be applied to opCol. 
+ Valid operations: + sum, count, count_distinct, min, max, + mean, median, mode, antimode, + stdev, sstdev (sample standard dev.), + collapse (i.e., print a comma separated list (duplicates allowed)), + distinct (i.e., print a comma separated list (NO duplicates allowed)), + distinct_sort_num (as distinct, but sorted numerically, ascending), + distinct_sort_num_desc (as distinct, but sorted numerically, descending), + concat (i.e., merge values into a single, non-delimited string), + freqdesc (i.e., print desc. list of values:freq) + freqasc (i.e., print asc. list of values:freq) + first (i.e., print first value) + last (i.e., print last value) + - Default: sum + + If there is only column, but multiple operations, all operations will be + applied on that column. Likewise, if there is only one operation, but + multiple columns, that operation will be applied to all columns. + Otherwise, the number of columns must match the the number of operations, + and will be applied in respective order. + E.g., "-c 5,4,6 -o sum,mean,count" will give the sum of column 5, + the mean of column 4, and the count of column 6. + The order of output columns will match the ordering given in the command. + + + -full Print all columns from input file. The first line in the group is used. + Default: print only grouped columns. + + -inheader Input file has a header line - the first line will be ignored. + + -outheader Print header line in the output, detailing the column names. + If the input file has headers (-inheader), the output file + will use the input's column names. + If the input file has no headers, the output file + will use "col_1", "col_2", etc. as the column names. + + -header same as '-inheader -outheader' + + -ignorecase Group values regardless of upper/lower case. + + -prec Sets the decimal precision for output (Default: 5) + + -delim Specify a custom delimiter for the collapse operations. + - Example: -delim "|" + - Default: ",". 
+ +Examples: + $ cat ex1.out + chr1 10 20 A chr1 15 25 B.1 1000 ATAT + chr1 10 20 A chr1 25 35 B.2 10000 CGCG + + $ groupBy -i ex1.out -g 1,2,3,4 -c 9 -o sum + chr1 10 20 A 11000 + + $ groupBy -i ex1.out -grp 1,2,3,4 -opCols 9,9 -ops sum,max + chr1 10 20 A 11000 10000 + + $ groupBy -i ex1.out -g 1,2,3,4 -c 8,9 -o collapse,mean + chr1 10 20 A B.1,B.2, 5500 + + $ cat ex1.out | groupBy -g 1,2,3,4 -c 8,9 -o collapse,mean + chr1 10 20 A B.1,B.2, 5500 + + $ cat ex1.out | groupBy -g 1,2,3,4 -c 10 -o concat + chr1 10 20 A ATATCGCG + +Notes: + (1) The input file/stream should be sorted/grouped by the -grp. columns + (2) If -i is unspecified, input is assumed to come from stdin. + diff --git a/src/bedtools/bedtools_groupby/script.sh b/src/bedtools/bedtools_groupby/script.sh new file mode 100644 index 00000000..b8a40cdc --- /dev/null +++ b/src/bedtools/bedtools_groupby/script.sh @@ -0,0 +1,36 @@ +#!/bin/bash + +## VIASH START +## VIASH END + +# Exit on error +set -eo pipefail + +# Unset parameters +unset_if_false=( + par_full + par_inheader + par_outheader + par_header + par_ignorecase +) + +for par in ${unset_if_false[@]}; do + test_val="${!par}" + [[ "$test_val" == "false" ]] && unset $par +done + +bedtools groupby \ + ${par_full:+-full} \ + ${par_inheader:+-inheader} \ + ${par_outheader:+-outheader} \ + ${par_header:+-header} \ + ${par_ignorecase:+-ignorecase} \ + ${par_precision:+-prec "$par_precision"} \ + ${par_delimiter:+-delim "$par_delimiter"} \ + -i "$par_input" \ + -g "$par_groupby" \ + -c "$par_column" \ + ${par_operation:+-o "$par_operation"} \ + > "$par_output" + \ No newline at end of file diff --git a/src/bedtools/bedtools_groupby/test.sh b/src/bedtools/bedtools_groupby/test.sh new file mode 100644 index 00000000..ce99a1ec --- /dev/null +++ b/src/bedtools/bedtools_groupby/test.sh @@ -0,0 +1,198 @@ +#!/bin/bash + +# exit on error +set -eo pipefail + +## VIASH START +meta_executable="target/executable/bedtools/bedtools_groupby/bedtools_groupby" 
+meta_resources_dir="src/bedtools/bedtools_groupby" +## VIASH END + +############################################# +# helper functions +assert_file_exists() { + [ -f "$1" ] || { echo "File '$1' does not exist" && exit 1; } +} +assert_file_not_empty() { + [ -s "$1" ] || { echo "File '$1' is empty but shouldn't be" && exit 1; } +} +assert_file_contains() { + grep -q "$2" "$1" || { echo "File '$1' does not contain '$2'" && exit 1; } +} +assert_identical_content() { + diff -a "$2" "$1" \ + || (echo "Files are not identical!" && exit 1) +} +############################################# + +# Create directories for tests +echo "Creating Test Data..." +TMPDIR=$(mktemp -d "$meta_temp_dir/XXXXXX") +function clean_up { + [[ -d "$TMPDIR" ]] && rm -r "$TMPDIR" +} +trap clean_up EXIT + +# Create and populate example.bed +cat << EOF > $TMPDIR/example.bed +# Header +chr21 9719758 9729320 variant1 chr21 9719768 9721892 ALR/Alpha 1004 + +chr21 9719758 9729320 variant1 chr21 9721905 9725582 ALR/Alpha 1010 + +chr21 9719758 9729320 variant1 chr21 9725582 9725977 L1PA3 3288 + +chr21 9719758 9729320 variant1 chr21 9726021 9729309 ALR/Alpha 1051 + +chr21 9729310 9757478 variant2 chr21 9729320 9729809 L1PA3 3897 - +chr21 9729310 9757478 variant2 chr21 9729809 9730866 L1P1 8367 + +chr21 9729310 9757478 variant2 chr21 9730866 9734026 ALR/Alpha 1036 - +chr21 9729310 9757478 variant2 chr21 9734037 9757471 ALR/Alpha 1182 - +chr21 9795588 9796685 variant3 chr21 9795589 9795713 (GAATG)n 308 + +chr21 9795588 9796685 variant3 chr21 9795736 9795894 (GAATG)n 683 + +chr21 9795588 9796685 variant3 chr21 9795911 9796007 (GAATG)n 345 + +chr21 9795588 9796685 variant3 chr21 9796028 9796187 (GAATG)n 756 + +chr21 9795588 9796685 variant3 chr21 9796202 9796615 (GAATG)n 891 + +chr21 9795588 9796685 variant3 chr21 9796637 9796824 (GAATG)n 621 + +EOF + +# Create and populate expected output files for different tests +cat << EOF > $TMPDIR/expected.bed +chr21 9719758 9729320 6353 +chr21 9729310 9757478 14482 
+chr21 9795588 9796685 3604 +EOF +cat << EOF > $TMPDIR/expected_max.bed +chr21 9719758 9729320 variant1 3288 +chr21 9729310 9757478 variant2 8367 +chr21 9795588 9796685 variant3 891 +EOF +cat << EOF > $TMPDIR/expected_full.bed +chr21 9719758 9729320 variant1 chr21 9719768 9721892 ALR/Alpha 1004 + 6353 +chr21 9729310 9757478 variant2 chr21 9729320 9729809 L1PA3 3897 - 14482 +chr21 9795588 9796685 variant3 chr21 9795589 9795713 (GAATG)n 308 + 3604 +EOF +cat << EOF > $TMPDIR/expected_delimited.bed +chr21 9719758 9729320 variant1 1004;1010;3288;1051 +chr21 9729310 9757478 variant2 3897;8367;1036;1182 +chr21 9795588 9796685 variant3 308;683;345;756;891;621 +EOF +cat << EOF > $TMPDIR/expected_precision.bed +chr21 9719758 9729320 variant1 1.6e+03 +chr21 9729310 9757478 variant2 3.6e+03 +chr21 9795588 9796685 variant3 6e+02 +EOF + +# Test 1: without operation option, default operation is sum +mkdir "$TMPDIR/test1" && pushd "$TMPDIR/test1" > /dev/null + +echo "> Run bedtools groupby on BED file" +"$meta_executable" \ + --input "../example.bed" \ + --groupby "1,2,3" \ + --column "9" \ + --output "output.bed" + +# checks +assert_file_exists "output.bed" +assert_file_not_empty "output.bed" +assert_identical_content "output.bed" "../expected.bed" +echo "- test1 succeeded -" + +popd > /dev/null + +# Test 2: with operation max option +mkdir "$TMPDIR/test2" && pushd "$TMPDIR/test2" > /dev/null + +echo "> Run bedtools groupby on BED file with max operation" +"$meta_executable" \ + --input "../example.bed" \ + --groupby "1-4" \ + --column "9" \ + --operation "max" \ + --output "output.bed" + +# checks +assert_file_exists "output.bed" +assert_file_not_empty "output.bed" +assert_identical_content "output.bed" "../expected_max.bed" +echo "- test2 succeeded -" + +popd > /dev/null + +# Test 3: full option +mkdir "$TMPDIR/test3" && pushd "$TMPDIR/test3" > /dev/null + +echo "> Run bedtools groupby on BED file with full option" +"$meta_executable" \ + --input "../example.bed" \ + --groupby 
"1-4" \ + --column "9" \ + --full \ + --output "output.bed" + +# checks +assert_file_exists "output.bed" +assert_file_not_empty "output.bed" +assert_identical_content "output.bed" "../expected_full.bed" +echo "- test3 succeeded -" + +popd > /dev/null + +# Test 4: header option +mkdir "$TMPDIR/test4" && pushd "$TMPDIR/test4" > /dev/null + +echo "> Run bedtools groupby on BED file with header option" +"$meta_executable" \ + --input "../example.bed" \ + --groupby "1-4" \ + --column "9" \ + --header \ + --output "output.bed" + +# checks +assert_file_exists "output.bed" +assert_file_not_empty "output.bed" +assert_file_contains "output.bed" "# Header" +echo "- test4 succeeded -" + +popd > /dev/null + +# Test 5: Delimiter and collapse +mkdir "$TMPDIR/test5" && pushd "$TMPDIR/test5" > /dev/null + +echo "> Run bedtools groupby on BED file with delimiter and collapse options" +"$meta_executable" \ + --input "../example.bed" \ + --groupby "1-4" \ + --column "9" \ + --operation "collapse" \ + --delimiter ";" \ + --output "output.bed" + +# checks +assert_file_exists "output.bed" +assert_file_not_empty "output.bed" +assert_identical_content "output.bed" "../expected_delimited.bed" +echo "- test5 succeeded -" + +popd > /dev/null + +# Test 6: precision option +mkdir "$TMPDIR/test6" && pushd "$TMPDIR/test6" > /dev/null + +echo "> Run bedtools groupby on BED file with precision option" +"$meta_executable" \ + --input "../example.bed" \ + --groupby "1-4" \ + --column "9" \ + --operation "mean" \ + --precision 2 \ + --output "output.bed" + +# checks +assert_file_exists "output.bed" +assert_file_not_empty "output.bed" +assert_identical_content "output.bed" "../expected_precision.bed" +echo "- test6 succeeded -" + +popd > /dev/null + +echo "---- All tests succeeded! 
----" +exit 0 From f3e87e58c921a4ef59fe8946edcd066cdfc8de9c Mon Sep 17 00:00:00 2001 From: Theodoro Gasperin Terra Camargo <98555209+tgaspe@users.noreply.github.com> Date: Mon, 2 Sep 2024 15:06:37 +0200 Subject: [PATCH 07/28] Bedtools bed12tobed6 (#140) * Initial commit * Update test.sh * help file + n option * adding n_score option * small changes * Update CHANGELOG.md * Update CHANGELOG.md --------- Co-authored-by: Jakub Majercik <57993790+jakubmajercik@users.noreply.github.com> --- CHANGELOG.md | 1 + .../bedtools_bed12tobed6/config.vsh.yaml | 67 +++++++++++++++ src/bedtools/bedtools_bed12tobed6/help.txt | 13 +++ src/bedtools/bedtools_bed12tobed6/script.sh | 15 ++++ src/bedtools/bedtools_bed12tobed6/test.sh | 85 +++++++++++++++++++ 5 files changed, 181 insertions(+) create mode 100644 src/bedtools/bedtools_bed12tobed6/config.vsh.yaml create mode 100644 src/bedtools/bedtools_bed12tobed6/help.txt create mode 100644 src/bedtools/bedtools_bed12tobed6/script.sh create mode 100644 src/bedtools/bedtools_bed12tobed6/test.sh diff --git a/CHANGELOG.md b/CHANGELOG.md index 29fb8cfa..828253f0 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -33,6 +33,7 @@ - `bedtools/bedtools_merge`: Merges overlapping BED/GFF/VCF entries into a single interval (PR #118). - `bedtools/bedtools_bamtofastq`: Convert BAM alignments to FASTQ files (PR #101). - `bedtools/bedtools_bedtobam`: Converts genomic feature records (bed/gff/vcf) to BAM format (PR #111). + - `bedtools/bedtools_bed12tobed6`: Converts BED12 files to BED6 files (PR #140). - `bedtools/bedtools_links`: Creates an HTML file with links to an instance of the UCSC Genome Browser for all features / intervals in a (bed/gff/vcf) file (PR #137). * `qualimap/qualimap_rnaseq`: RNA-seq QC analysis using qualimap (PR #74). 
diff --git a/src/bedtools/bedtools_bed12tobed6/config.vsh.yaml b/src/bedtools/bedtools_bed12tobed6/config.vsh.yaml new file mode 100644 index 00000000..8dd6328c --- /dev/null +++ b/src/bedtools/bedtools_bed12tobed6/config.vsh.yaml @@ -0,0 +1,67 @@ +name: bedtools_bed12tobed6 +namespace: bedtools +description: | + Converts BED features in BED12 (a.k.a. “blocked” BED features such as genes) to discrete BED6 features. + For example, in the case of a gene with six exons, bed12ToBed6 would create six separate BED6 features (i.e., one for each exon). +keywords: [Converts, BED12, BED6] +links: + documentation: https://bedtools.readthedocs.io/en/latest/content/tools/bed12tobed6.html + repository: https://github.com/arq5x/bedtools2 + homepage: https://bedtools.readthedocs.io/en/latest/# + issue_tracker: https://github.com/arq5x/bedtools2/issues +references: + doi: 10.1093/bioinformatics/btq033 +license: MIT +requirements: + commands: [bedtools] +authors: + - __merge__: /src/_authors/theodoro_gasperin.yaml + roles: [ author, maintainer ] + +argument_groups: + + - name: Inputs + arguments: + - name: --input + alternatives: -i + type: file + description: Input BED12 file. + required: true + + - name: Outputs + arguments: + - name: --output + alternatives: -o + type: file + direction: output + description: Output BED6 file to be written. + + - name: Options + arguments: + - name: --n_score + alternatives: -n + type: boolean_true + description: | + Force the score to be the (1-based) block number from the BED12. 
+ +resources: + - type: bash_script + path: script.sh + +test_resources: + - type: bash_script + path: test.sh + +engines: + - type: docker + image: debian:stable-slim + setup: + - type: apt + packages: [bedtools, procps] + - type: docker + run: | + echo "bedtools: \"$(bedtools --version | sed -n 's/^bedtools //p')\"" > /var/software_versions.txt + +runners: + - type: executable + - type: nextflow diff --git a/src/bedtools/bedtools_bed12tobed6/help.txt b/src/bedtools/bedtools_bed12tobed6/help.txt new file mode 100644 index 00000000..17af6983 --- /dev/null +++ b/src/bedtools/bedtools_bed12tobed6/help.txt @@ -0,0 +1,13 @@ +``` +bedtools bed12tobed6 -h +``` + +Tool: bedtools bed12tobed6 (aka bed12ToBed6) +Version: v2.30.0 +Summary: Splits BED12 features into discrete BED6 features. + +Usage: bedtools bed12tobed6 [OPTIONS] -i + +Options: + -n Force the score to be the (1-based) block number from the BED12. + diff --git a/src/bedtools/bedtools_bed12tobed6/script.sh b/src/bedtools/bedtools_bed12tobed6/script.sh new file mode 100644 index 00000000..bbfaddc6 --- /dev/null +++ b/src/bedtools/bedtools_bed12tobed6/script.sh @@ -0,0 +1,15 @@ +#!/bin/bash + +## VIASH START +## VIASH END + +set -eo pipefail + +# Unset parameters +[[ "$par_n_score" == "false" ]] && unset par_n_score + +# Execute bedtools bed12tobed6 conversion +bedtools bed12tobed6 \ + ${par_n_score:+-n} \ + -i "$par_input" \ + > "$par_output" diff --git a/src/bedtools/bedtools_bed12tobed6/test.sh b/src/bedtools/bedtools_bed12tobed6/test.sh new file mode 100644 index 00000000..2ef596d9 --- /dev/null +++ b/src/bedtools/bedtools_bed12tobed6/test.sh @@ -0,0 +1,85 @@ +#!/bin/bash + +# exit on error +set -eo pipefail + +############################################# +# helper functions +assert_file_exists() { + [ -f "$1" ] || { echo "File '$1' does not exist" && exit 1; } +} +assert_file_not_empty() { + [ -s "$1" ] || { echo "File '$1' is empty but shouldn't be" && exit 1; } +} +assert_file_contains() { + grep -q "$2" 
"$1" || { echo "File '$1' does not contain '$2'" && exit 1; } +} +assert_identical_content() { + diff -a "$2" "$1" \ + || (echo "Files are not identical!" && exit 1) +} +############################################# + +# Create directories for tests +echo "Creating Test Data..." +TMPDIR=$(mktemp -d "$meta_temp_dir/XXXXXX") +function clean_up { + [[ -d "$TMPDIR" ]] && rm -r "$TMPDIR" +} +trap clean_up EXIT + +# Create example BED12 file +cat < "$TMPDIR/example.bed12" +chr21 10079666 10120808 uc002yiv.1 0 - 10081686 1 0 1 2 0 6 0 8 0 4 528,91,101,215, 0,1930,39750,40927, +chr21 10080031 10081687 uc002yiw.1 0 - 10080031 1 0 0 8 0 0 3 1 0 2 200,91, 0,1565, +chr21 10081660 10120796 uc002yix.2 0 - 10081660 1 0 0 8 1 6 6 0 0 3 27,101,223, 0,37756,38913, +EOF + +# Expected output bed6 file +cat < "$TMPDIR/expected.bed6" +chr21 10079666 10120808 uc002yiv.1 0 - +chr21 10080031 10081687 uc002yiw.1 0 - +chr21 10081660 10120796 uc002yix.2 0 - +EOF +# Expected output bed6 file with -n option +cat < "$TMPDIR/expected_n.bed6" +chr21 10079666 10120808 uc002yiv.1 1 - +chr21 10080031 10081687 uc002yiw.1 1 - +chr21 10081660 10120796 uc002yix.2 1 - +EOF + +# Test 1: Default conversion BED12 to BED6 +mkdir "$TMPDIR/test1" && pushd "$TMPDIR/test1" > /dev/null + +echo "> Run bedtools_bed12tobed6 on BED12 file" +"$meta_executable" \ + --input "../example.bed12" \ + --output "output.bed6" + +# checks +assert_file_exists "output.bed6" +assert_file_not_empty "output.bed6" +assert_identical_content "output.bed6" "../expected.bed6" +echo "- test1 succeeded -" + +popd > /dev/null + +# Test 2: Conversion BED12 to BED6 with -n option +mkdir "$TMPDIR/test2" && pushd "$TMPDIR/test2" > /dev/null + +echo "> Run bedtools_bed12tobed6 on BED12 file with -n option" +"$meta_executable" \ + --input "../example.bed12" \ + --output "output.bed6" \ + --n_score + +# checks +assert_file_exists "output.bed6" +assert_file_not_empty "output.bed6" +assert_identical_content "output.bed6" "../expected_n.bed6" +echo "- 
test2 succeeded -" + +popd > /dev/null + +echo "---- All tests succeeded! ----" +exit 0 From da3272d0118227ee788cd93b222201f557729397 Mon Sep 17 00:00:00 2001 From: Theodoro Gasperin Terra Camargo <98555209+tgaspe@users.noreply.github.com> Date: Mon, 2 Sep 2024 15:25:41 +0200 Subject: [PATCH 08/28] Bcftools sort (#141) * Initial commit * Update on config file * Update * Update config.vsh.yaml * Update config.vsh.yaml * Update test.sh * Update help.txt * adding meta variables * Adding test for bcf file * Update CHANGELOG.md * Update config.vsh.yaml * requested changes --------- Co-authored-by: Jakub Majercik <57993790+jakubmajercik@users.noreply.github.com> --- CHANGELOG.md | 2 + src/bcftools/bcftools_sort/config.vsh.yaml | 73 +++++++ src/bcftools/bcftools_sort/help.txt | 14 ++ src/bcftools/bcftools_sort/script.sh | 16 ++ src/bcftools/bcftools_sort/test.sh | 185 ++++++++++++++++++ .../bcftools_sort/test_data/example.bcf | Bin 0 -> 1183 bytes 6 files changed, 290 insertions(+) create mode 100644 src/bcftools/bcftools_sort/config.vsh.yaml create mode 100644 src/bcftools/bcftools_sort/help.txt create mode 100644 src/bcftools/bcftools_sort/script.sh create mode 100644 src/bcftools/bcftools_sort/test.sh create mode 100644 src/bcftools/bcftools_sort/test_data/example.bcf diff --git a/CHANGELOG.md b/CHANGELOG.md index 828253f0..11052113 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -40,6 +40,8 @@ * `rsem/rsem_prepare_reference`: Prepare transcript references for RSEM (PR #89). +* `bcftools`: + - `bcftools/bcftools_sort`: Sorts BCF/VCF files by position and other criteria (PR #141). ## MINOR CHANGES diff --git a/src/bcftools/bcftools_sort/config.vsh.yaml b/src/bcftools/bcftools_sort/config.vsh.yaml new file mode 100644 index 00000000..71a15309 --- /dev/null +++ b/src/bcftools/bcftools_sort/config.vsh.yaml @@ -0,0 +1,73 @@ +name: bcftools_sort +namespace: bcftools +description: | + Sorts VCF/BCF files. 
+keywords: [Sort, VCF, BCF] +links: + homepage: https://samtools.github.io/bcftools/ + documentation: https://samtools.github.io/bcftools/bcftools.html#sort + repository: https://github.com/samtools/bcftools + issue_tracker: https://github.com/samtools/bcftools/issues +references: + doi: https://doi.org/10.1093/gigascience/giab008 +license: MIT/Expat, GNU +requirements: + commands: [bcftools] +authors: + - __merge__: /src/_authors/theodoro_gasperin.yaml + roles: [ author, maintainer ] + +argument_groups: + - name: Inputs + arguments: + - name: --input + alternatives: -i + type: file + description: Input VCF/BCF file. + required: true + + - name: Outputs + arguments: + - name: --output + alternatives: -o + direction: output + type: file + description: Output sorted VCF/BCF file. + required: true + + - name: Options + arguments: + - name: --output_type + alternatives: -O + type: string + choices: [b, u, z, v] + description: | + Compresses or uncompresses the output. + The options are: + b: compressed BCF, + u: uncompressed BCF, + z: compressed VCF, + v: uncompressed VCF. + +resources: + - type: bash_script + path: script.sh + +test_resources: + - type: bash_script + path: test.sh + - path: test_data + +engines: + - type: docker + image: debian:stable-slim + setup: + - type: apt + packages: [bcftools, procps] + - type: docker + run: | + echo "bcftools: \"$(bcftools --version | grep 'bcftools' | sed -n 's/^bcftools //p')\"" > /var/software_versions.txt + +runners: + - type: executable + - type: nextflow diff --git a/src/bcftools/bcftools_sort/help.txt b/src/bcftools/bcftools_sort/help.txt new file mode 100644 index 00000000..3b5fa80b --- /dev/null +++ b/src/bcftools/bcftools_sort/help.txt @@ -0,0 +1,14 @@ +``` +bcftools sort +``` + +About: Sort VCF/BCF file. 
+Usage: bcftools sort [OPTIONS] + +Options: + -m, --max-mem FLOAT[kMG] maximum memory to use [768M] + -o, --output FILE output file name [stdout] + -O, --output-type b|u|z|v b: compressed BCF, u: uncompressed BCF, z: compressed VCF, v: uncompressed VCF [v] + -O, --output-type u|b|v|z[0-9] u/b: un/compressed BCF, v/z: un/compressed VCF, 0-9: compression level [v] + -T, --temp-dir DIR temporary files [/tmp/bcftools.XXXXXX] + diff --git a/src/bcftools/bcftools_sort/script.sh b/src/bcftools/bcftools_sort/script.sh new file mode 100644 index 00000000..e9afb223 --- /dev/null +++ b/src/bcftools/bcftools_sort/script.sh @@ -0,0 +1,16 @@ +#!/bin/bash + +## VIASH START +## VIASH END + +# Exit on error +set -eo pipefail + +# Execute bcftools sort with the provided arguments +bcftools sort \ + -o "$par_output" \ + ${par_output_type:+-O "$par_output_type"} \ + ${meta_memory_mb:+-m "${meta_memory_mb}M"} \ + ${meta_temp_dir:+-T "$meta_temp_dir"} \ + $par_input \ + diff --git a/src/bcftools/bcftools_sort/test.sh b/src/bcftools/bcftools_sort/test.sh new file mode 100644 index 00000000..f406b8e2 --- /dev/null +++ b/src/bcftools/bcftools_sort/test.sh @@ -0,0 +1,185 @@ +#!/bin/bash + +## VIASH START +## VIASH END + +# Exit on error +set -eo pipefail + +test_data="$meta_resources_dir/test_data" + +############################################# +# helper functions +assert_file_exists() { + [ -f "$1" ] || { echo "File '$1' does not exist" && exit 1; } +} +assert_file_not_empty() { + [ -s "$1" ] || { echo "File '$1' is empty but shouldn't be" && exit 1; } +} +assert_file_contains() { + grep -q "$2" "$1" || { echo "File '$1' does not contain '$2'" && exit 1; } +} +assert_identical_content() { + diff -a "$2" "$1" \ + || (echo "Files are not identical!" && exit 1) +} +############################################# + +# Create directories for tests +echo "Creating Test Data..." 
+TMPDIR=$(mktemp -d "$meta_temp_dir/XXXXXX") +function clean_up { + [[ -d "$TMPDIR" ]] && rm -r "$TMPDIR" +} +trap clean_up EXIT + +# Create test data +cat < "$TMPDIR/example.vcf" +##fileformat=VCFv4.0 +##fileDate=20090805 +##source=myImputationProgramV3.1 +##reference=1000GenomesPilot-NCBI36 +##contig= +##contig= +##phasing=partial +##INFO= +##INFO= +##INFO= +##INFO= +##INFO= +##INFO= +##INFO= +##INFO= +##FILTER= +##FILTER= +##FORMAT= +##FORMAT= +##FORMAT= +##FORMAT= +##ALT= +##ALT= +#CHROM POS ID REF ALT QUAL FILTER INFO FORMAT NA00001 NA00002 NA00003 +19 112 . A G 10 . . GT:HQ 0|0:10,10 0|0:10,10 0/1:3,3 +19 111 . A C 9.6 . . GT:HQ 0|0:10,10 0|0:10,10 0/1:3,3 +20 1235237 . T . . . . GT 0/0 0|0 ./. +20 14370 rs6054257 G A 29 PASS NS=3;DP=14;AF=0.5;DB;H2 GT:GQ:DP:HQ 0|0:48:1:51,51 1|0:48:8:51,51 1/1:43:5:.,. +20 17330 . T A 3 q10 NS=3;DP=11;AF=0.017 GT:GQ:DP:HQ 0|0:49:3:58,50 0|1:3:5:65,3 0/0:41:3:.,. +20 1110696 rs6040355 A G,T 67 PASS NS=2;DP=10;AF=0.333,0.667;AA=T;DB GT:GQ:DP:HQ 1|2:21:6:23,27 2|1:2:0:18,2 2/2:35:4:.,. +20 1230237 . T . 47 PASS NS=3;DP=13;AA=T GT:GQ:DP:HQ 0|0:54:.:56,60 0|0:48:4:51,51 0/0:61:2:.,. +20 1234567 microsat1 G GA,GAC 50 PASS NS=3;DP=9;AA=G;AN=6;AC=3,1 GT:GQ:DP 0/1:.:4 0/2:17:2 1/1:40:3 +EOF + +# Create expected output +cat < "$TMPDIR/expected_output.vcf" +##fileformat=VCFv4.0 +##FILTER= +##fileDate=20090805 +##source=myImputationProgramV3.1 +##reference=1000GenomesPilot-NCBI36 +##contig= +##contig= +##phasing=partial +##INFO= +##INFO= +##INFO= +##INFO= +##INFO= +##INFO= +##INFO= +##INFO= +##FILTER= +##FILTER= +##FORMAT= +##FORMAT= +##FORMAT= +##FORMAT= +##ALT= +##ALT= +#CHROM POS ID REF ALT QUAL FILTER INFO FORMAT NA00001 NA00002 NA00003 +19 111 . A C 9.6 . . GT:HQ 0|0:10,10 0|0:10,10 0/1:3,3 +19 112 . A G 10 . . GT:HQ 0|0:10,10 0|0:10,10 0/1:3,3 +20 14370 rs6054257 G A 29 PASS NS=3;DP=14;AF=0.5;DB;H2 GT:GQ:DP:HQ 0|0:48:1:51,51 1|0:48:8:51,51 1/1:43:5:.,. +20 17330 . 
T A 3 q10 NS=3;DP=11;AF=0.017 GT:GQ:DP:HQ 0|0:49:3:58,50 0|1:3:5:65,3 0/0:41:3:.,. +20 1110696 rs6040355 A G,T 67 PASS NS=2;DP=10;AF=0.333,0.667;AA=T;DB GT:GQ:DP:HQ 1|2:21:6:23,27 2|1:2:0:18,2 2/2:35:4:.,. +20 1230237 . T . 47 PASS NS=3;DP=13;AA=T GT:GQ:DP:HQ 0|0:54:.:56,60 0|0:48:4:51,51 0/0:61:2:.,. +20 1234567 microsat1 G GA,GAC 50 PASS NS=3;DP=9;AA=G;AN=6;AC=3,1 GT:GQ:DP 0/1:.:4 0/2:17:2 1/1:40:3 +20 1235237 . T . . . . GT 0/0 0|0 ./. +EOF + +cat < "$TMPDIR/expected_bcf.vcf" +##fileformat=VCFv4.0 +##FILTER= +##fileDate=20090805 +##source=myImputationProgramV3.1 +##reference=1000GenomesPilot-NCBI36 +##contig= +##contig= +##phasing=partial +##INFO= +##INFO= +##INFO= +##INFO= +##INFO= +##INFO= +##INFO= +##INFO= +##FILTER= +##FILTER= +##FORMAT= +##FORMAT= +##FORMAT= +##FORMAT= +##ALT= +##ALT= +##bcftools_viewVersion=1.16+htslib-1.16 +##bcftools_viewCommand=view -O b -o example.bcf example.vcf.gz; Date=Mon Aug 26 13:00:22 2024 +#CHROM POS ID REF ALT QUAL FILTER INFO FORMAT NA00001 NA00002 NA00003 +19 111 . A C 9.6 . . GT:HQ 0|0:10,10 0|0:10,10 0/1:3,3 +19 112 . A G 10 . . GT:HQ 0|0:10,10 0|0:10,10 0/1:3,3 +20 14370 rs6054257 G A 29 PASS NS=3;DP=14;AF=0.5;DB;H2 GT:GQ:DP:HQ 0|0:48:1:51,51 1|0:48:8:51,51 1/1:43:5:.,. +20 17330 . T A 3 q10 NS=3;DP=11;AF=0.017 GT:GQ:DP:HQ 0|0:49:3:58,50 0|1:3:5:65,3 0/0:41:3:.,. +20 1110696 rs6040355 A G,T 67 PASS NS=2;DP=10;AF=0.333,0.667;AA=T;DB GT:GQ:DP:HQ 1|2:21:6:23,27 2|1:2:0:18,2 2/2:35:4:.,. +20 1230237 . T . 47 PASS NS=3;DP=13;AA=T GT:GQ:DP:HQ 0|0:54:.:56,60 0|0:48:4:51,51 0/0:61:2:.,. +20 1234567 microsat1 G GA,GAC 50 PASS NS=3;DP=9;AA=G;AN=6;AC=3,1 GT:GQ:DP 0/1:.:4 0/2:17:2 1/1:40:3 +20 1235237 . T . . . . GT 0/0 0|0 ./. 
+EOF + + +# Test 1: Default Use +mkdir "$TMPDIR/test1" && pushd "$TMPDIR/test1" > /dev/null + +echo "> Run bcftools_sort on VCF file" +"$meta_executable" \ + --input "../example.vcf" \ + --output "output.vcf" \ + --output_type "v" \ + &> /dev/null + +# checks +assert_file_exists "output.vcf" +assert_file_not_empty "output.vcf" +assert_identical_content "output.vcf" "../expected_output.vcf" +echo "- test1 succeeded -" + +popd > /dev/null + +# Test 2: BCF file input +mkdir "$TMPDIR/test2" && pushd "$TMPDIR/test2" > /dev/null + +echo "> Run bcftools_sort on BCF file" +"$meta_executable" \ + --input "${test_data}/example.bcf" \ + --output "output.vcf" \ + --output_type "v" \ + &> /dev/null + +# checks +assert_file_exists "output.vcf" +assert_file_not_empty "output.vcf" +assert_identical_content "output.vcf" "../expected_bcf.vcf" +echo "- test2 succeeded -" + +popd > /dev/null + +echo "---- All tests succeeded! ----" +exit 0 diff --git a/src/bcftools/bcftools_sort/test_data/example.bcf b/src/bcftools/bcftools_sort/test_data/example.bcf new file mode 100644 index 0000000000000000000000000000000000000000..d78ae010b4f1b6924f72b296a50e1cab5ca5f0ca GIT binary patch literal 1183 zcmV;Q1Yr9giwFb&00000{{{d;LjnMT1dWu>Z{tK1$6Y%~*IgFjN>zkFt7*dumQACv z)1*JByX!bj8YxZFCR@0nNis<+`PbT$F697`koLlz6^H{jByRi%T=*A|_zyU9ZM-0+9}|*T&~t=c8@RA!$wtY zYn@KMO8vfPxXf^DbxSi%1YV4KK5~ig)4^80QT;k$Y9{nA@!&YHZsPzPHo4ce%mk|w%yVzjba&W`i{+vQ7B2? 
z&zsa9v9Vg(E6clOxqK6U!A!xon#qBgJ0`wik5rd<%pfMbX|!r*p_YEuo)duFIj}h1UW<0A(m@WAAM@DF@n^|)=ii>RBPP@+I1K)kv&Hf)1(2~uu(sl z56}o|!@%ERnL`i>)v}QLBo)1jem?EE86gl1i zlo219hR11o=QxOvo@80G-U%6>%LO%P9%|0SLf z_msHG0y6^b9VL(G3mGRJ&nLt**fr`=(|L(x9J%c;x%s6fws3e2*RSD9sB+8{ksW2{T9FX!A^|rKlpP!ysvQ6uY;=qm(^@H z^x}DVQep97Y`nQQ6Ze;vj)J6Kdwv<0c9ha?ww~1_PFj%!DJ8uvr8mItskF*T_aY4` zA(fKek}!S(>nr!tpr;69IxQ@O_*I{Uc=k+)Lx3J2i!u2=kiXNvJ&f{7^E@zR;W4ZT zX#toAiY82_)dl9yf4>tmUscjToXg(9#ZoCKZnhf==K-=Nr62wC)h|ci{PyuvDFX|u zf%?>HCY{MZqzyg?Mj0-sX7;^xemZ@;5b4L?*W*Zs6(Oo-pnhJJ2O%-ukU)4DtMGc7 z;sLjJ7!;!d&KoiA)gLq5$xOKL+A{i{;}a%l>mSy&-MXgF@gm^skVQrkHieZSPKwWj xMW0F^!Fq}p{sk88Nlg_A001A02m}BC000301^_}s0stET0{{R300000005SPILiP4 literal 0 HcmV?d00001 From e6627ec728761fe63fe75b0a10ba51da2bccec21 Mon Sep 17 00:00:00 2001 From: Theodoro Gasperin Terra Camargo <98555209+tgaspe@users.noreply.github.com> Date: Tue, 3 Sep 2024 10:55:42 +0200 Subject: [PATCH 09/28] FastQC (#92) * Starting Component * Creating Files * update on config file * Update on test.sh * Update on config * Update script.sh * Update on script.sh * trying to figure multiple: true * Update on script.sh * Update on script * Adding some tests * More tests * Update on script.sh * Added more tests * Small Changes * Update test.sh * Update on Script and Test - change the --zip and --html to take wild card '*' * Added one more test * Removed test_data dir * More description * Update CHANGELOG.md * Update on config and script - meta_cpus - meta_tmp_dir * Bug Fixed * unset_if_false * Updating Tests * Update script.sh * debugging * Minor changes * Update config.vsh.yaml * Update config.vsh.yaml * Required Changes - large changes on script.sh * Update config.vsh.yaml * Adding extra links * tmpdir bug * Updating tests * minor changes * Adding extra output options --summary --data * minor change * Update script.sh * small change in config --- CHANGELOG.md | 2 + src/fastqc/config.vsh.yaml | 209 +++++++++++++++++++++++++++++++++ src/fastqc/help.txt | 125 
++++++++++++++++++++ src/fastqc/script.sh | 86 ++++++++++++++ src/fastqc/test.sh | 235 +++++++++++++++++++++++++++++++++++++ 5 files changed, 657 insertions(+) create mode 100644 src/fastqc/config.vsh.yaml create mode 100644 src/fastqc/help.txt create mode 100644 src/fastqc/script.sh create mode 100644 src/fastqc/test.sh diff --git a/CHANGELOG.md b/CHANGELOG.md index 11052113..98e78c17 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -43,6 +43,8 @@ * `bcftools`: - `bcftools/bcftools_sort`: Sorts BCF/VCF files by position and other criteria (PR #141). +* `fastqc`: High throughput sequence quality control analysis tool (PR #92). + ## MINOR CHANGES * `busco` components: update BUSCO to `5.7.1` (PR #72). diff --git a/src/fastqc/config.vsh.yaml b/src/fastqc/config.vsh.yaml new file mode 100644 index 00000000..75b16f36 --- /dev/null +++ b/src/fastqc/config.vsh.yaml @@ -0,0 +1,209 @@ +name: fastqc +description: FastQC - A high throughput sequence QC analysis tool. +keywords: [Quality control, BAM, SAM, FASTQ] +links: + homepage: https://www.bioinformatics.babraham.ac.uk/projects/fastqc/ + documentation: https://www.bioinformatics.babraham.ac.uk/projects/fastqc/Help/ + repository: https://github.com/s-andrews/FastQC + issue_tracker: https://github.com/s-andrews/FastQC/issues +license: GPL-3.0, Apache-2.0 +authors: + - __merge__: /src/_authors/theodoro_gasperin.yaml + roles: [ author, maintainer ] + +argument_groups: + - name: Inputs + arguments: + - name: --input + type: file + direction: input + multiple: true + description: | + FASTQ file(s) to be analyzed. + required: true + example: input.fq + + - name: Outputs + description: | + At least one of the output options (--html, --zip, --summary, --data) must be used. + arguments: + + - name: --html + type: file + direction: output + multiple: true + description: | + Create the HTML report of the results. + '*' wild card must be provided in the output file name. + Wild card will be replaced by the input file basename. + e.g. 
+ --input "sample_1.fq" + --html "*.html" + would create an output html file named sample_1.html + example: "*.html" + + - name: --zip + type: file + direction: output + multiple: true + description: | + Create the zip file(s) containing: html report, data, images, icons, summary, etc. + '*' wild card must be provided in the output file name. + Wild card will be replaced by the input basename. + e.g. + --input "sample_1.fq" + --zip "*.zip" + would create an output zip file named sample_1.zip + example: "*.zip" + + - name: --summary + type: file + direction: output + multiple: true + description: | + Create the summary file(s). + '*' wild card must be provided in the output file name. + Wild card will be replaced by the input basename. + e.g. + --input "sample_1.fq" + --summary "*_summary.txt" + would create an output summary.txt file named sample_1_summary.txt + example: "*_summary.txt" + + - name: --data + type: file + direction: output + multiple: true + description: | + Create the data file(s). + '*' wild card must be provided in the output file name. + Wild card will be replaced by the input basename. + e.g. + --input "sample_1.fq" + --data "*_data.txt" + would create an output data.txt file named sample_1_data.txt + example: "*_data.txt" + + - name: Options + arguments: + - name: --casava + type: boolean_true + description: | + Files come from raw casava output. Files in the same sample + group (differing only by the group number) will be analysed + as a set rather than individually. Sequences with the filter + flag set in the header will be excluded from the analysis. + Files must have the same names given to them by casava + (including being gzipped and ending with .gz) otherwise they + won't be grouped together correctly. + + - name: --nano + type: boolean_true + description: | + Files come from nanopore sequences and are in fast5 format. 
In + this mode you can pass in directories to process and the program + will take in all fast5 files within those directories and produce + a single output file from the sequences found in all files. + + - name: --nofilter + type: boolean_true + description: | + If running with --casava then don't remove read flagged by + casava as poor quality when performing the QC analysis. + + - name: --nogroup + type: boolean_true + description: | + Disable grouping of bases for reads >50bp. + All reports will show data for every base in the read. + WARNING: Using this option will cause fastqc to crash + and burn if you use it on really long reads, and your + plots may end up a ridiculous size. You have been warned! + + - name: --min_length + type: integer + description: | + Sets an artificial lower limit on the length of the + sequence to be shown in the report. As long as you + set this to a value greater or equal to your longest + read length then this will be the sequence length used + to create your read groups. This can be useful for making + directly comparable statistics from datasets with somewhat + variable read lengths. + example: 0 + + - name: --format + alternatives: -f + type: string + description: | + Bypasses the normal sequence file format detection and + forces the program to use the specified format. + Valid formats are bam, sam, bam_mapped, sam_mapped, and fastq. + example: bam + + - name: --contaminants + alternatives: -c + type: file + description: | + Specifies a non-default file which contains the list + of contaminants to screen overrepresented sequences against. + The file must contain sets of named contaminants in the form + name[tab]sequence. Lines prefixed with a hash will be ignored. + example: contaminants.txt + + - name: --adapters + alternatives: -a + type: file + description: | + Specifies a non-default file which contains the list of + adapter sequences which will be explicitly searched against + the library. 
The file must contain sets of named adapters + in the form name[tab]sequence. Lines prefixed with a hash will be ignored. + example: adapters.txt + + - name: --limits + alternatives: -l + type: file + description: | + Specifies a non-default file which contains + a set of criteria which will be used to determine + the warn/error limits for the various modules. + This file can also be used to selectively remove + some modules from the output altogether. The format + needs to mirror the default limits.txt file found in + the Configuration folder. + example: limits.txt + + - name: --kmers + alternatives: -k + type: integer + description: | + Specifies the length of Kmer to look for in the Kmer + content module. Specified Kmer length must be between + 2 and 10. Default length is 7 if not specified. + example: 7 + + - name: --quiet + alternatives: -q + type: boolean_true + description: | + Suppress all progress messages on stdout and only report errors. + +resources: + - type: bash_script + path: script.sh +test_resources: + - type: bash_script + path: test.sh + +engines: + - type: docker + image: biocontainers/fastqc:v0.11.9_cv8 + setup: + - type: docker + run: | + echo "fastqc: $(fastqc --version | sed -n 's/^FastQC //p')" > /var/software_versions.txt + +runners: + - type: executable + - type: nextflow diff --git a/src/fastqc/help.txt b/src/fastqc/help.txt new file mode 100644 index 00000000..502aebc0 --- /dev/null +++ b/src/fastqc/help.txt @@ -0,0 +1,125 @@ +```bash +fastqc --help +``` + + FastQC - A high throughput sequence QC analysis tool + +SYNOPSIS + + fastqc seqfile1 seqfile2 .. seqfileN + + fastqc [-o output dir] [--(no)extract] [-f fastq|bam|sam] + [-c contaminant file] seqfile1 .. seqfileN + +DESCRIPTION + + FastQC reads a set of sequence files and produces from each one a quality + control report consisting of a number of different modules, each one of + which will help to identify a different potential type of problem in your + data. 
+ + If no files to process are specified on the command line then the program + will start as an interactive graphical application. If files are provided + on the command line then the program will run with no user interaction + required. In this mode it is suitable for inclusion into a standardised + analysis pipeline. + + The options for the program as as follows: + + -h --help Print this help file and exit + + -v --version Print the version of the program and exit + + -o --outdir Create all output files in the specified output directory. + Please note that this directory must exist as the program + will not create it. If this option is not set then the + output file for each sequence file is created in the same + directory as the sequence file which was processed. + + --casava Files come from raw casava output. Files in the same sample + group (differing only by the group number) will be analysed + as a set rather than individually. Sequences with the filter + flag set in the header will be excluded from the analysis. + Files must have the same names given to them by casava + (including being gzipped and ending with .gz) otherwise they + won't be grouped together correctly. + + --nano Files come from nanopore sequences and are in fast5 format. In + this mode you can pass in directories to process and the program + will take in all fast5 files within those directories and produce + a single output file from the sequences found in all files. + + --nofilter If running with --casava then don't remove read flagged by + casava as poor quality when performing the QC analysis. + + --extract If set then the zipped output file will be uncompressed in + the same directory after it has been created. By default + this option will be set if fastqc is run in non-interactive + mode. + + -j --java Provides the full path to the java binary you want to use to + launch fastqc. If not supplied then java is assumed to be in + your path. 
+ + --noextract Do not uncompress the output file after creating it. You + should set this option if you do not wish to uncompress + the output when running in non-interactive mode. + + --nogroup Disable grouping of bases for reads >50bp. All reports will + show data for every base in the read. WARNING: Using this + option will cause fastqc to crash and burn if you use it on + really long reads, and your plots may end up a ridiculous size. + You have been warned! + + --min_length Sets an artificial lower limit on the length of the sequence + to be shown in the report. As long as you set this to a value + greater or equal to your longest read length then this will be + the sequence length used to create your read groups. This can + be useful for making directly comaparable statistics from + datasets with somewhat variable read lengths. + + -f --format Bypasses the normal sequence file format detection and + forces the program to use the specified format. Valid + formats are bam,sam,bam_mapped,sam_mapped and fastq + + -t --threads Specifies the number of files which can be processed + simultaneously. Each thread will be allocated 250MB of + memory so you shouldn't run more threads than your + available memory will cope with, and not more than + 6 threads on a 32 bit machine + + -c Specifies a non-default file which contains the list of + --contaminants contaminants to screen overrepresented sequences against. + The file must contain sets of named contaminants in the + form name[tab]sequence. Lines prefixed with a hash will + be ignored. + + -a Specifies a non-default file which contains the list of + --adapters adapter sequences which will be explicity searched against + the library. The file must contain sets of named adapters + in the form name[tab]sequence. Lines prefixed with a hash + will be ignored. + + -l Specifies a non-default file which contains a set of criteria + --limits which will be used to determine the warn/error limits for the + various modules. 
This file can also be used to selectively + remove some modules from the output all together. The format + needs to mirror the default limits.txt file found in the + Configuration folder. + + -k --kmers Specifies the length of Kmer to look for in the Kmer content + module. Specified Kmer length must be between 2 and 10. Default + length is 7 if not specified. + + -q --quiet Supress all progress messages on stdout and only report errors. + + -d --dir Selects a directory to be used for temporary files written when + generating report images. Defaults to system temp directory if + not specified. + +BUGS + + Any bugs in fastqc should be reported either to simon.andrews@babraham.ac.uk + or in www.bioinformatics.babraham.ac.uk/bugzilla/ + + diff --git a/src/fastqc/script.sh b/src/fastqc/script.sh new file mode 100644 index 00000000..5cf55868 --- /dev/null +++ b/src/fastqc/script.sh @@ -0,0 +1,86 @@ +#!/bin/bash + +## VIASH START +## VIASH END + +# exit on error +set -eo pipefail + +# Check if all outputs are empty, at least one must be passed. +if [[ -z "$par_html" ]] && [[ -z "$par_zip" ]] && [[ -z "$par_summary" ]] && [[ -z "$par_data" ]]; then + echo "Error: At least one of the output arguments (--html, --zip, --summary, and --data) must be passed." 
+ exit 1 +fi + +# unset flags +unset_if_false=( + par_casava + par_nano + par_nofilter + par_extract + par_noextract + par_nogroup + par_quiet +) + +for par in ${unset_if_false[@]}; do + test_val="${!par}" + [[ "$test_val" == "false" ]] && unset $par +done + +tmpdir=$(mktemp -d "${meta_temp_dir}/${meta_name}-XXXXXXXX") +function clean_up { + rm -rf "$tmpdir" +} +trap clean_up EXIT + +# Create input array +IFS=";" read -ra input <<< $par_input + +# Run fastqc +fastqc \ + --extract \ + ${par_casava:+--casava} \ + ${par_nano:+--nano} \ + ${par_nofilter:+--nofilter} \ + ${par_nogroup:+--nogroup} \ + ${par_min_length:+--min_length "$par_min_length"} \ + ${par_format:+--format "$par_format"} \ + ${par_contaminants:+--contaminants "$par_contaminants"} \ + ${par_adapters:+--adapters "$par_adapters"} \ + ${par_limits:+--limits "$par_limits"} \ + ${par_kmers:+--kmers "$par_kmers"} \ + ${par_quiet:+--quiet} \ + ${meta_cpus:+--threads "$meta_cpus"} \ + ${meta_temp_dir:+--dir "$meta_temp_dir"} \ + --outdir "${tmpdir}" \ + "${input[@]}" + +# Move output files +for file in "${input[@]}"; do + # Removes everything after the first dot of the basename + sample_name=$(basename "${file}" | sed 's/\..*$//') + if [[ -n "$par_html" ]]; then + input_html="${tmpdir}/${sample_name}_fastqc.html" + html_file="${par_html//\*/$sample_name}" + mv "$input_html" "$html_file" + fi + if [[ -n "$par_zip" ]]; then + input_zip="${tmpdir}/${sample_name}_fastqc.zip" + zip_file="${par_zip//\*/$sample_name}" + mv "$input_zip" "$zip_file" + fi + if [[ -n "$par_summary" ]]; then + summary_file="${tmpdir}/${sample_name}_fastqc/summary.txt" + new_summary="${par_summary//\*/$sample_name}" + mv "$summary_file" "$new_summary" + fi + if [[ -n "$par_data" ]]; then + data_file="${tmpdir}/${sample_name}_fastqc/fastqc_data.txt" + new_data="${par_data//\*/$sample_name}" + mv "$data_file" "$new_data" + fi + # Remove the extracted directory + rm -r "${tmpdir}/${sample_name}_fastqc" +done + diff --git a/src/fastqc/test.sh 
b/src/fastqc/test.sh new file mode 100644 index 00000000..8c581ac8 --- /dev/null +++ b/src/fastqc/test.sh @@ -0,0 +1,235 @@ +#!/bin/bash + +# exit on error +set -eo pipefail + +## VIASH START +# meta_executable="target/executable/fastqc" +# meta_resources_dir="src/fastqc" +## VIASH END + +############################################# +# helper functions +assert_file_exists() { + [ -f "$1" ] || { echo "File '$1' does not exist" && exit 1; } +} +assert_file_not_empty() { + [ -s "$1" ] || { echo "File '$1' is empty but shouldn't be" && exit 1; } +} +assert_file_contains() { + grep -q "$2" "$1" || { echo "File '$1' does not contain '$2'" && exit 1; } +} +assert_identical_content() { + diff -a "$2" "$1" \ + || (echo "Files are not identical!" && exit 1) +} +############################################# + +# Create directories for tests +echo "Creating Test Data..." +TMPDIR=$(mktemp -d "$meta_temp_dir/XXXXXX") +function clean_up { + [[ -d "$TMPDIR" ]] && rm -r "$TMPDIR" +} +trap clean_up EXIT + +# Create and populate input.fasta +cat > "$TMPDIR/input_1.fq" < "$TMPDIR/input_2.fq" < "$TMPDIR/contaminants.txt" +printf "contaminant_sequence2\tGATCTTGG\n" >> "$TMPDIR/contaminants.txt" + +# Create and populate SAM file +printf "@HD\tVN:1.0\tSO:unsorted\n" > "$TMPDIR/example.sam" +printf "@SQ\tSN:chr1\tLN:248956422\n" >> "$TMPDIR/example.sam" +printf "@SQ\tSN:chr2\tLN:242193529\n" >> "$TMPDIR/example.sam" +printf "@PG\tID:bowtie2\tPN:bowtie2\tVN:2.3.4.1\tCL:\"/usr/bin/bowtie2-align-s --wrapper basic-0 -x genome -U reads.fq -S output.sam\"\n" >> "$TMPDIR/example.sam" +printf "read1\t0\tchr1\t100\t255\t50M\t*\t0\t0\tACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGT\tIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII\tAS:i:-10\tXN:i:0\tXM:i:0\tXO:i:0\tXG:i:0\tNM:i:0\tMD:Z:50\tYT:Z:UU\n" >> "$TMPDIR/example.sam" +printf 
"read2\t0\tchr2\t150\t255\t50M\t*\t0\t0\tTGCATGCATGCATGCATGCATGCATGCATGCATGCATGCATGCATGCATGC\tIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII\tAS:i:-8\tXN:i:0\tXM:i:0\tXO:i:0\tXG:i:0\tNM:i:0\tMD:Z:50\tYT:Z:UU\n" >> "$TMPDIR/example.sam" +printf "read3\t16\tchr1\t200\t255\t50M\t*\t0\t0\tGCTAGCTAGCTAGCTAGCTAGCTAGCTAGCTAGCTAGCTAGCTAGCTAGCTA\tIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII\tAS:i:-12\tXN:i:0\tXM:i:0\tXO:i:0\tXG:i:0\tNM:i:0\tMD:Z:50\tYT:Z:UU" >> "$TMPDIR/example.sam" + +cat > "$TMPDIR/expected_summary.txt" < "$TMPDIR/expected_summary2.txt" < "$TMPDIR/expected_summary_sam.txt" < /dev/null + +echo "-> Run Test1: one input" +"$meta_executable" \ + --input "../input_1.fq" \ + --html "*_fastqc.html" \ + --zip "*_fastqc.zip" \ + --summary "*_summary.txt" \ + --data "*_data.txt" \ + --quiet \ + +assert_file_exists "input_1_fastqc.html" +assert_file_exists "input_1_fastqc.zip" +assert_file_exists "input_1_summary.txt" +assert_file_not_empty "input_1_fastqc.html" +assert_file_not_empty "input_1_fastqc.zip" +assert_identical_content "input_1_summary.txt" "../expected_summary.txt" +echo "- test succeeded -" + +popd > /dev/null + + +# Test 2: Run fastqc with multiple inputs +mkdir "$TMPDIR/test2" && pushd "$TMPDIR/test2" > /dev/null + +echo "-> Run Test2: two inputs" +"$meta_executable" \ + --input "../input_1.fq" \ + --input "../input_2.fq" \ + --html "*_fastqc.html" \ + --zip "*_fastqc.zip" \ + --summary "*_summary.txt" \ + --data "*_data.txt" \ + --quiet \ + +# File 1 +assert_file_exists "input_1_fastqc.html" +assert_file_exists "input_1_fastqc.zip" +assert_file_exists "input_1_summary.txt" +assert_file_not_empty "input_1_fastqc.html" +assert_file_not_empty "input_1_fastqc.zip" +assert_identical_content "input_1_summary.txt" "../expected_summary.txt" +# File 2 +assert_file_exists "input_2_fastqc.html" +assert_file_exists "input_2_fastqc.zip" +assert_file_exists "input_2_summary.txt" +assert_file_not_empty "input_2_fastqc.html" +assert_file_not_empty 
"input_2_fastqc.zip" +assert_identical_content "input_2_summary.txt" "../expected_summary2.txt" +echo "- test succeeded -" + +popd > /dev/null + +# Test 3: Run fastqc with contaminants +mkdir "$TMPDIR/test3" && pushd "$TMPDIR/test3" > /dev/null + +echo "-> Run Test3: contaminants" +"$meta_executable" \ + --input "../input_1.fq" \ + --contaminants "../contaminants.txt" \ + --html "*_fastqc.html" \ + --zip "*_fastqc.zip" \ + --summary "*_summary.txt" \ + --data "*_data.txt" \ + --quiet \ + +assert_file_exists "input_1_fastqc.html" +assert_file_exists "input_1_fastqc.zip" +assert_file_exists "input_1_summary.txt" +assert_file_not_empty "input_1_fastqc.html" +assert_file_not_empty "input_1_fastqc.zip" +assert_identical_content "input_1_summary.txt" "../expected_summary.txt" +assert_file_contains "input_1_data.txt" "contaminant" +echo "- test succeeded -" + +popd > /dev/null + +# Test 4: Run fastqc with sam file +mkdir "$TMPDIR/test4" && pushd "$TMPDIR/test4" > /dev/null + +echo "-> Run Test4: sam file" +"$meta_executable" \ + --input "../example.sam" \ + --format "sam" \ + --html "*_fastqc.html" \ + --zip "*_fastqc.zip" \ + --summary "*_summary.txt" \ + --data "*_data.txt" \ + --quiet \ + +assert_file_exists "example_fastqc.html" +assert_file_exists "example_fastqc.zip" +assert_file_exists "example_summary.txt" +assert_file_not_empty "example_fastqc.html" +assert_file_not_empty "example_fastqc.zip" +assert_identical_content "example_summary.txt" "../expected_summary_sam.txt" +echo "- test succeeded -" + +popd > /dev/null + +# Test 5: Run fastqc with multiple options +mkdir "$TMPDIR/test5" && pushd "$TMPDIR/test5" > /dev/null + +echo "-> Run Test5: multiple options" +"$meta_executable" \ + --input "../input_1.fq" \ + --contaminants "../contaminants.txt" \ + --format "fastq" \ + --nofilter \ + --nogroup \ + --min_length 10 \ + --kmers 5 \ + --html "*_fastqc.html" \ + --zip "*_fastqc.zip" \ + --summary "*_summary.txt" \ + --data "*_data.txt" \ + --quiet \ +# --casava \ + 
+assert_file_exists "input_1_fastqc.html" +assert_file_exists "input_1_fastqc.zip" +assert_file_exists "input_1_summary.txt" +assert_file_not_empty "input_1_fastqc.html" +assert_file_not_empty "input_1_fastqc.zip" +assert_identical_content "input_1_summary.txt" "../expected_summary.txt" +assert_file_contains "input_1_data.txt" "contaminant" +echo "- test succeeded -" + +popd > /dev/null + +echo "All tests succeeded!" +exit 0 From 99dec5923bfb3da165601a3f13502d498395b14d Mon Sep 17 00:00:00 2001 From: Toni Verbeiren Date: Fri, 6 Sep 2024 23:46:11 +0200 Subject: [PATCH 10/28] Bedtools genomecov (#150 and #128) * Initial commit * Update config.vsh.yaml * Update script.sh * update on test.sh * bug fixing * adding ibam option tests * depthzero and strand option tests * 5prime and max tests * more tests * Changelog * Update config.vsh.yaml * Update config.vsh.yaml * Update script.sh * Update test.sh * TMPDIR * Unset Variables * par_trackopts multiple: true * Minor update to CHANGELOG --------- Co-authored-by: tgaspe --- CHANGELOG.md | 2 + .../bedtools_genomecov/config.vsh.yaml | 208 +++++++++++ src/bedtools/bedtools_genomecov/help.txt | 101 ++++++ src/bedtools/bedtools_genomecov/script.sh | 55 +++ src/bedtools/bedtools_genomecov/test.sh | 333 ++++++++++++++++++ .../bedtools_genomecov/test_data/example.bam | Bin 0 -> 334 bytes 6 files changed, 699 insertions(+) create mode 100644 src/bedtools/bedtools_genomecov/config.vsh.yaml create mode 100644 src/bedtools/bedtools_genomecov/help.txt create mode 100644 src/bedtools/bedtools_genomecov/script.sh create mode 100644 src/bedtools/bedtools_genomecov/test.sh create mode 100644 src/bedtools/bedtools_genomecov/test_data/example.bam diff --git a/CHANGELOG.md b/CHANGELOG.md index 98e78c17..8f772450 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -29,6 +29,7 @@ * `bedtools`: - `bedtools/bedtools_intersect`: Allows one to screen for overlaps between two sets of genomic features (PR #94). 
- `bedtools/bedtools_sort`: Sorts a feature file (bed/gff/vcf) by chromosome and other criteria (PR #98). + - `bedtools/bedtools_genomecov`: Compute the coverage of a feature file (bed/gff/vcf/bam) among a genome (PR #128). - `bedtools/bedtools_groupby`: Summarizes a dataset column based upon common column groupings. Akin to the SQL "group by" command (PR #123). - `bedtools/bedtools_merge`: Merges overlapping BED/GFF/VCF entries into a single interval (PR #118). - `bedtools/bedtools_bamtofastq`: Convert BAM alignments to FASTQ files (PR #101). @@ -45,6 +46,7 @@ * `fastqc`: High throughput sequence quality control analysis tool (PR #92). + ## MINOR CHANGES * `busco` components: update BUSCO to `5.7.1` (PR #72). diff --git a/src/bedtools/bedtools_genomecov/config.vsh.yaml b/src/bedtools/bedtools_genomecov/config.vsh.yaml new file mode 100644 index 00000000..775587de --- /dev/null +++ b/src/bedtools/bedtools_genomecov/config.vsh.yaml @@ -0,0 +1,208 @@ +name: bedtools_genomecov +namespace: bedtools +description: | + Compute the coverage of a feature file among a genome. +keywords: [genome coverage, BED, GFF, VCF, BAM] +links: + homepage: https://bedtools.readthedocs.io/en/latest/# + documentation: https://bedtools.readthedocs.io/en/latest/content/tools/genomecov.html + repository: https://github.com/arq5x/bedtools2 + issue_tracker: https://github.com/arq5x/bedtools2/issues +references: + doi: 10.1093/bioinformatics/btq033 +license: MIT +requirements: + commands: [bedtools] +authors: + - __merge__: /src/_authors/theodoro_gasperin.yaml + roles: [ author, maintainer ] + +argument_groups: + - name: Inputs + arguments: + - name: --input + alternatives: -i + type: file + direction: input + description: | + The input file (BED/GFF/VCF) to be used. + example: input.bed + + - name: --input_bam + alternatives: -ibam + type: file + description: | + The input file is in BAM format. + Note: BAM _must_ be sorted by positions. 
+ '--genome' option is ignored if you use '--input_bam' option! + + - name: --genome + alternatives: -g + type: file + direction: input + description: | + The genome file to be used. + example: genome.txt + + - name: Outputs + arguments: + - name: --output + type: file + direction: output + description: | + The output BED file. + required: true + example: output.bed + + - name: Options + arguments: + + - name: --depth + alternatives: -d + type: boolean_true + description: | + Report the depth at each genome position (with one-based coordinates). + Default behavior is to report a histogram. + + - name: --depth_zero + alternatives: -dz + type: boolean_true + description: | + Report the depth at each genome position (with zero-based coordinates). + Reports only non-zero positions. + Default behavior is to report a histogram. + + - name: --bed_graph + alternatives: -bg + type: boolean_true + description: | + Report depth in BedGraph format. For details, see: + genome.ucsc.edu/goldenPath/help/bedgraph.html + + - name: --bed_graph_zero_coverage + alternatives: -bga + type: boolean_true + description: | + Report depth in BedGraph format, as above (-bg). + However with this option, regions with zero + coverage are also reported. This allows one to + quickly extract all regions of a genome with 0 + coverage by applying: "grep -w 0$" to the output. + + - name: --split + type: boolean_true + description: | + Treat "split" BAM or BED12 entries as distinct BED intervals. + when computing coverage. + For BAM files, this uses the CIGAR "N" and "D" operations + to infer the blocks for computing coverage. + For BED12 files, this uses the BlockCount, BlockStarts, and BlockEnds + fields (i.e., columns 10,11,12). + + - name: --ignore_deletion + alternatives: -ignoreD + type: boolean_true + description: | + Ignore local deletions (CIGAR "D" operations) in BAM entries + when computing coverage. 
+ + - name: --strand + type: string + choices: ["+", "-"] + description: | + Calculate coverage of intervals from a specific strand. + With BED files, requires at least 6 columns (strand is column 6). + + - name: --pair_end_coverage + alternatives: -pc + type: boolean_true + description: | + Calculate coverage of pair-end fragments. + Works for BAM files only + + - name: --fragment_size + alternatives: -fs + type: boolean_true + description: | + Force to use provided fragment size instead of read length + Works for BAM files only + + - name: --du + type: boolean_true + description: | + Change strand of the mate read (so both reads from the same strand) useful for strand specific + Works for BAM files only + + - name: --five_prime + alternatives: -5 + type: boolean_true + description: | + Calculate coverage of 5' positions (instead of entire interval). + + - name: --three_prime + alternatives: -3 + type: boolean_true + description: | + Calculate coverage of 3' positions (instead of entire interval). + + - name: --max + type: integer + min: 0 + description: | + Combine all positions with a depth >= max into + a single bin in the histogram. Irrelevant + for -d and -bedGraph + - (INTEGER) + + - name: --scale + type: double + min: 0 + description: | + Scale the coverage by a constant factor. + Each coverage value is multiplied by this factor before being reported. + Useful for normalizing coverage by, e.g., reads per million (RPM). + - Default is 1.0; i.e., unscaled. + - (FLOAT) + + - name: --trackline + type: boolean_true + description: | + Adds a UCSC/Genome-Browser track line definition in the first line of the output. + - See here for more details about track line definition: + http://genome.ucsc.edu/goldenPath/help/bedgraph.html + - NOTE: When adding a trackline definition, the output BedGraph can be easily + uploaded to the Genome Browser as a custom track, + BUT CAN NOT be converted into a BigWig file (w/o removing the first line). 
+ + - name: --trackopts + type: string + description: | + Writes additional track line definition parameters in the first line. + - Example: + -trackopts 'name="My Track" visibility=2 color=255,30,30' + Note the use of single-quotes if you have spaces in your parameters. + - (TEXT) + multiple: true + +resources: + - type: bash_script + path: script.sh + +test_resources: + - type: bash_script + path: test.sh + - path: test_data + +engines: + - type: docker + image: debian:stable-slim + setup: + - type: apt + packages: [bedtools, procps] + - type: docker + run: | + echo "bedtools: \"$(bedtools --version | sed -n 's/^bedtools //p')\"" > /var/software_versions.txt + +runners: + - type: executable + - type: nextflow \ No newline at end of file diff --git a/src/bedtools/bedtools_genomecov/help.txt b/src/bedtools/bedtools_genomecov/help.txt new file mode 100644 index 00000000..f13a71d3 --- /dev/null +++ b/src/bedtools/bedtools_genomecov/help.txt @@ -0,0 +1,101 @@ +```bash +bedtools genomecov +``` + +Tool: bedtools genomecov (aka genomeCoverageBed) +Version: v2.30.0 +Summary: Compute the coverage of a feature file among a genome. + +Usage: bedtools genomecov [OPTIONS] -i -g + +Options: + -ibam The input file is in BAM format. + Note: BAM _must_ be sorted by position + + -d Report the depth at each genome position (with one-based coordinates). + Default behavior is to report a histogram. + + -dz Report the depth at each genome position (with zero-based coordinates). + Reports only non-zero positions. + Default behavior is to report a histogram. + + -bg Report depth in BedGraph format. For details, see: + genome.ucsc.edu/goldenPath/help/bedgraph.html + + -bga Report depth in BedGraph format, as above (-bg). + However with this option, regions with zero + coverage are also reported. This allows one to + quickly extract all regions of a genome with 0 + coverage by applying: "grep -w 0$" to the output. + + -split Treat "split" BAM or BED12 entries as distinct BED intervals. 
+ when computing coverage. + For BAM files, this uses the CIGAR "N" and "D" operations + to infer the blocks for computing coverage. + For BED12 files, this uses the BlockCount, BlockStarts, and BlockEnds + fields (i.e., columns 10,11,12). + + -ignoreD Ignore local deletions (CIGAR "D" operations) in BAM entries + when computing coverage. + + -strand Calculate coverage of intervals from a specific strand. + With BED files, requires at least 6 columns (strand is column 6). + - (STRING): can be + or - + + -pc Calculate coverage of pair-end fragments. + Works for BAM files only + -fs Force to use provided fragment size instead of read length + Works for BAM files only + -du Change strand af the mate read (so both reads from the same strand) useful for strand specific + Works for BAM files only + -5 Calculate coverage of 5" positions (instead of entire interval). + + -3 Calculate coverage of 3" positions (instead of entire interval). + + -max Combine all positions with a depth >= max into + a single bin in the histogram. Irrelevant + for -d and -bedGraph + - (INTEGER) + + -scale Scale the coverage by a constant factor. + Each coverage value is multiplied by this factor before being reported. + Useful for normalizing coverage by, e.g., reads per million (RPM). + - Default is 1.0; i.e., unscaled. + - (FLOAT) + + -trackline Adds a UCSC/Genome-Browser track line definition in the first line of the output. + - See here for more details about track line definition: + http://genome.ucsc.edu/goldenPath/help/bedgraph.html + - NOTE: When adding a trackline definition, the output BedGraph can be easily + uploaded to the Genome Browser as a custom track, + BUT CAN NOT be converted into a BigWig file (w/o removing the first line). + + -trackopts Writes additional track line definition parameters in the first line. + - Example: + -trackopts 'name="My Track" visibility=2 color=255,30,30' + Note the use of single-quotes if you have spaces in your parameters. 
+ - (TEXT) + +Notes: + (1) The genome file should tab delimited and structured as follows: + + + For example, Human (hg19): + chr1 249250621 + chr2 243199373 + ... + chr18_gl000207_random 4262 + + (2) The input BED (-i) file must be grouped by chromosome. + A simple "sort -k 1,1 > .sorted" will suffice. + + (3) The input BAM (-ibam) file must be sorted by position. + A "samtools sort " should suffice. + +Tips: + One can use the UCSC Genome Browser's MySQL database to extract + chromosome sizes. For example, H. sapiens: + + mysql --user=genome --host=genome-mysql.cse.ucsc.edu -A -e \ + "select chrom, size from hg19.chromInfo" > hg19.genome + diff --git a/src/bedtools/bedtools_genomecov/script.sh b/src/bedtools/bedtools_genomecov/script.sh new file mode 100644 index 00000000..20fbd968 --- /dev/null +++ b/src/bedtools/bedtools_genomecov/script.sh @@ -0,0 +1,55 @@ +#!/bin/bash + +## VIASH START +## VIASH END + +# Exit on error +set -eo pipefail + +# Unset variables +unset_if_false=( + par_input_bam + par_depth + par_depth_zero + par_bed_graph + par_bed_graph_zero_coverage + par_split + par_ignore_deletion + par_pair_end_coverage + par_fragment_size + par_du + par_five_prime + par_three_prime + par_trackline +) + +for par in ${unset_if_false[@]}; do + test_val="${!par}" + [[ "$test_val" == "false" ]] && unset $par +done + +# Create input array +IFS=";" read -ra trackopts <<< $par_trackopts + +bedtools genomecov \ + ${par_depth:+-d} \ + ${par_depth_zero:+-dz} \ + ${par_bed_graph:+-bg} \ + ${par_bed_graph_zero_coverage:+-bga} \ + ${par_split:+-split} \ + ${par_ignore_deletion:+-ignoreD} \ + ${par_du:+-du} \ + ${par_five_prime:+-5} \ + ${par_three_prime:+-3} \ + ${par_trackline:+-trackline} \ + ${par_strand:+-strand "$par_strand"} \ + ${par_max:+-max "$par_max"} \ + ${par_scale:+-scale "$par_scale"} \ + ${par_trackopts:+-trackopts "${trackopts[*]}"} \ + ${par_input_bam:+-ibam "$par_input_bam"} \ + ${par_input:+-i "$par_input"} \ + ${par_genome:+-g "$par_genome"} \ + 
${par_pair_end_coverage:+-pc} \ + ${par_fragment_size:+-fs} \ + > "$par_output" + \ No newline at end of file diff --git a/src/bedtools/bedtools_genomecov/test.sh b/src/bedtools/bedtools_genomecov/test.sh new file mode 100644 index 00000000..7e4487da --- /dev/null +++ b/src/bedtools/bedtools_genomecov/test.sh @@ -0,0 +1,333 @@ +#!/bin/bash + +# exit on error +set -eo pipefail + +## VIASH START +meta_executable="target/executable/bedtools/bedtools_intersect/bedtools_intersect" +meta_resources_dir="src/bedtools/bedtools_intersect" +## VIASH END + +# directory of the bam file +test_data="$meta_resources_dir/test_data" + +############################################# +# helper functions +assert_file_exists() { + [ -f "$1" ] || { echo "File '$1' does not exist" && exit 1; } +} +assert_file_not_empty() { + [ -s "$1" ] || { echo "File '$1' is empty but shouldn't be" && exit 1; } +} +assert_file_contains() { + grep -q "$2" "$1" || { echo "File '$1' does not contain '$2'" && exit 1; } +} +assert_identical_content() { + diff -a "$2" "$1" \ + || (echo "Files are not identical!" && exit 1) +} +############################################# + +# Create directories for tests +echo "Creating Test Data..." 
+TMPDIR=$(mktemp -d "$meta_temp_dir/XXXXXX") +function clean_up { + [[ -d "$TMPDIR" ]] && rm -r "$TMPDIR" +} +trap clean_up EXIT + +# Create and populate input files +printf "chr1\t248956422\nchr2\t198295559\nchr3\t242193529\n" > "$TMPDIR/genome.txt" +printf "chr2\t128\t228\tmy_read/1\t37\t+\nchr2\t428\t528\tmy_read/2\t37\t-\n" > "$TMPDIR/example.bed" +printf "chr2\t128\t228\tmy_read/1\t60\t+\t128\t228\t255,0,0\t1\t100\t0\nchr2\t428\t528\tmy_read/2\t60\t-\t428\t528\t255,0,0\t1\t100\t0\n" > "$TMPDIR/example.bed12" +printf "chr2\t100\t103\n" > "$TMPDIR/example_dz.bed" + +# expected outputs +cat > "$TMPDIR/expected_default.bed" < "$TMPDIR/expected_ibam.bed" < "$TMPDIR/expected_ibam_pc.bed" < "$TMPDIR/expected_ibam_fs.bed" < "$TMPDIR/expected_dz.bed" < "$TMPDIR/expected_strand.bed" < "$TMPDIR/expected_5.bed" < "$TMPDIR/expected_bg_scale.bed" < "$TMPDIR/expected_trackopts.bed" < "$TMPDIR/expected_split.bed" < "$TMPDIR/expected_ignoreD_du.bed" < /dev/null + +echo "> Run bedtools_genomecov on BED file" +"$meta_executable" \ + --input "../example.bed" \ + --genome "../genome.txt" \ + --output "output.bed" + +# checks +assert_file_exists "output.bed" +assert_file_not_empty "output.bed" +assert_identical_content "output.bed" "../expected_default.bed" +echo "- test1 succeeded -" + +popd > /dev/null + +# Test 2: ibam option +mkdir "$TMPDIR/test2" && pushd "$TMPDIR/test2" > /dev/null + +echo "> Run bedtools_genomecov on BAM file with -ibam" +"$meta_executable" \ + --input_bam "$test_data/example.bam" \ + --output "output.bed" \ + +# checks +assert_file_exists "output.bed" +assert_file_not_empty "output.bed" +assert_identical_content "output.bed" "../expected_ibam.bed" +echo "- test2 succeeded -" + +popd > /dev/null + +# Test 3: depth option +mkdir "$TMPDIR/test3" && pushd "$TMPDIR/test3" > /dev/null + +echo "> Run bedtools_genomecov on BED file with -dz" +"$meta_executable" \ + --input "../example_dz.bed" \ + --genome "../genome.txt" \ + --output "output.bed" \ + --depth_zero + 
+# checks +assert_file_exists "output.bed" +assert_file_not_empty "output.bed" +assert_identical_content "output.bed" "../expected_dz.bed" +echo "- test3 succeeded -" + +popd > /dev/null + +# Test 4: strand option +mkdir "$TMPDIR/test4" && pushd "$TMPDIR/test4" > /dev/null + +echo "> Run bedtools_genomecov on BED file with -strand" +"$meta_executable" \ + --input "../example.bed" \ + --genome "../genome.txt" \ + --output "output.bed" \ + --strand "-" \ + +# checks +assert_file_exists "output.bed" +assert_file_not_empty "output.bed" +assert_identical_content "output.bed" "../expected_strand.bed" +echo "- test4 succeeded -" + +popd > /dev/null + +# Test 5: 5' end option +mkdir "$TMPDIR/test5" && pushd "$TMPDIR/test5" > /dev/null + +echo "> Run bedtools_genomecov on BED file with -5" +"$meta_executable" \ + --input "../example.bed" \ + --genome "../genome.txt" \ + --output "output.bed" \ + --five_prime \ + +# checks +assert_file_exists "output.bed" +assert_file_not_empty "output.bed" +assert_identical_content "output.bed" "../expected_5.bed" +echo "- test5 succeeded -" + +popd > /dev/null + +# Test 6: max option +mkdir "$TMPDIR/test6" && pushd "$TMPDIR/test6" > /dev/null + +echo "> Run bedtools_genomecov on BED file with -max" +"$meta_executable" \ + --input "../example.bed" \ + --genome "../genome.txt" \ + --output "output.bed" \ + --max 100 \ + +# checks +assert_file_exists "output.bed" +assert_file_not_empty "output.bed" +assert_identical_content "output.bed" "../expected_default.bed" +echo "- test6 succeeded -" + +popd > /dev/null + +# Test 7: bedgraph and scale option +mkdir "$TMPDIR/test7" && pushd "$TMPDIR/test7" > /dev/null + +echo "> Run bedtools_genomecov on BED file with -bg and -scale" +"$meta_executable" \ + --input "../example.bed" \ + --genome "../genome.txt" \ + --output "output.bed" \ + --bed_graph \ + --scale 100 \ + +# checks +assert_file_exists "output.bed" +assert_file_not_empty "output.bed" +assert_identical_content "output.bed" 
"../expected_bg_scale.bed" +echo "- test7 succeeded -" + +popd > /dev/null + +# Test 8: trackopts option +mkdir "$TMPDIR/test8" && pushd "$TMPDIR/test8" > /dev/null + +echo "> Run bedtools_genomecov on BED file with -bg and -trackopts" +"$meta_executable" \ + --input "../example.bed" \ + --genome "../genome.txt" \ + --output "output.bed" \ + --bed_graph \ + --trackopts "name=example" \ + --trackopts "llama=Alpaco" \ + +# checks +assert_file_exists "output.bed" +assert_file_not_empty "output.bed" +assert_identical_content "output.bed" "../expected_trackopts.bed" +echo "- test8 succeeded -" + +popd > /dev/null + +# Test 9: ibam pc options +mkdir "$TMPDIR/test9" && pushd "$TMPDIR/test9" > /dev/null + +echo "> Run bedtools_genomecov on BAM file with -ibam, -pc" +"$meta_executable" \ + --input_bam "$test_data/example.bam" \ + --output "output.bed" \ + --fragment_size \ + --pair_end_coverage \ + +# checks +assert_file_exists "output.bed" +assert_file_not_empty "output.bed" +assert_identical_content "output.bed" "../expected_ibam_pc.bed" +echo "- test9 succeeded -" + +popd > /dev/null + +# Test 10: ibam fs options +mkdir "$TMPDIR/test10" && pushd "$TMPDIR/test10" > /dev/null + +echo "> Run bedtools_genomecov on BAM file with -ibam, -fs" +"$meta_executable" \ + --input_bam "$test_data/example.bam" \ + --output "output.bed" \ + --fragment_size \ + +# checks +assert_file_exists "output.bed" +assert_file_not_empty "output.bed" +assert_identical_content "output.bed" "../expected_ibam_fs.bed" +echo "- test10 succeeded -" + +popd > /dev/null + +# Test 11: split +mkdir "$TMPDIR/test11" && pushd "$TMPDIR/test11" > /dev/null + +echo "> Run bedtools_genomecov on BED12 file with -split" +"$meta_executable" \ + --input "../example.bed12" \ + --genome "../genome.txt" \ + --output "output.bed" \ + --split \ + +# checks +assert_file_exists "output.bed" +assert_file_not_empty "output.bed" +assert_identical_content "output.bed" "../expected_split.bed" +echo "- test11 succeeded -" + +popd > 
/dev/null + +# Test 12: ignore deletion and du +mkdir "$TMPDIR/test12" && pushd "$TMPDIR/test12" > /dev/null + +echo "> Run bedtools_genomecov on BAM file with -ignoreD and -du" +"$meta_executable" \ + --input_bam "$test_data/example.bam" \ + --output "output.bed" \ + --ignore_deletion \ + --du \ + +# checks +assert_file_exists "output.bed" +assert_file_not_empty "output.bed" +assert_identical_content "output.bed" "../expected_ignoreD_du.bed" +echo "- test12 succeeded -" + +popd > /dev/null + +echo "---- All tests succeeded! ----" +exit 0 diff --git a/src/bedtools/bedtools_genomecov/test_data/example.bam b/src/bedtools/bedtools_genomecov/test_data/example.bam new file mode 100644 index 0000000000000000000000000000000000000000..ffc075ab83a83a98ed1edbf88b26cc27ad8946c6 GIT binary patch literal 334 zcmb2|=3rp}f&Xj_PR>jWAq>SuUsA6mBqS7Y@IB%Aw%O~PhS4S?6Z1_bX2zRMuCZ>` z;o;@Ato^fw$CpQUheTtRYNNz-r#8JXHa3Ry>s4lk0?m>~GxQF_-U<7&m>dP#pU;|5 z)~CHK)-&PMX8(zQnRkjz7tzu&Q_9lpm^;_nXXDZb**~)OH9hZA+GbYw!F2!1eU^u& z=6?J8db>^n+vnS58VqGOpQde!^LhT^FPno$sK1R;RVb&i_o5|>-LG;Kg??MHCx&;~ ziZww?R#r16X1LX_ZFYQZ=WBLl9Y-y@V*W$>-;Wo3eOwoN_@m-GsXhDI Date: Mon, 9 Sep 2024 08:19:44 +0200 Subject: [PATCH 11/28] Fq subsample (#147) --- CHANGELOG.md | 2 + src/fq_subsample/config.vsh.yaml | 68 ++++++++++++++++++++++++ src/fq_subsample/help.txt | 20 +++++++ src/fq_subsample/script.sh | 26 +++++++++ src/fq_subsample/test.sh | 36 +++++++++++++ src/fq_subsample/test_data/a.3.fastq.gz | Bin 0 -> 292 bytes src/fq_subsample/test_data/a.4.fastq.gz | Bin 0 -> 301 bytes 7 files changed, 152 insertions(+) create mode 100644 src/fq_subsample/config.vsh.yaml create mode 100644 src/fq_subsample/help.txt create mode 100755 src/fq_subsample/script.sh create mode 100644 src/fq_subsample/test.sh create mode 100644 src/fq_subsample/test_data/a.3.fastq.gz create mode 100644 src/fq_subsample/test_data/a.4.fastq.gz diff --git a/CHANGELOG.md b/CHANGELOG.md index 8f772450..6534eed1 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ 
-142,6 +142,8 @@ - `bedtools_getfasta`: extract sequences from a FASTA file for each of the intervals defined in a BED/GFF/VCF file (PR #59). +* `fq_subsample`: Sample a subset of records from single or paired FASTQ files (PR #147). + ## MINOR CHANGES * Uniformize component metadata (PR #23). diff --git a/src/fq_subsample/config.vsh.yaml b/src/fq_subsample/config.vsh.yaml new file mode 100644 index 00000000..2455a341 --- /dev/null +++ b/src/fq_subsample/config.vsh.yaml @@ -0,0 +1,68 @@ +name: fq_subsample +description: fq subsample outputs a subset of records from single or paired FASTQ files. +keywords: [fastq, subsample, subset] +links: + homepage: https://github.com/stjude-rust-labs/fq/blob/master/README.md + documentation: https://github.com/stjude-rust-labs/fq/blob/master/README.md + repository: https://github.com/stjude-rust-labs/fq +license: MIT + +argument_groups: +- name: "Input" + arguments: + - name: "--input_1" + type: file + required: true + description: First input fastq file to subsample. Accepts both raw and gzipped FASTQ inputs. + - name: "--input_2" + type: file + description: Second input fastq files to subsample. Accepts both raw and gzipped FASTQ inputs. + +- name: "Output" + arguments: + - name: "--output_1" + type: file + direction: output + description: Sampled read 1 fastq files. Output will be gzipped if ends in `.gz`. + - name: "--output_2" + type: file + direction: output + description: Sampled read 2 fastq files. Output will be gzipped if ends in `.gz`. + +- name: "Options" + arguments: + - name: "--probability" + type: double + description: The probability a record is kept, as a percentage (0.0, 1.0). Cannot be used with `record-count` + - name: "--record_count" + type: integer + description: The exact number of records to keep. 
Cannot be used with `probability` + - name: "--seed" + type: integer + description: Seed to use for the random number generator + +resources: + - type: bash_script + path: script.sh + +test_resources: + - type: bash_script + path: test.sh + - path: test_data + +engines: + - type: docker + image: rust:1.81-slim + setup: + - type: docker + run: | + apt-get update && apt-get install -y git procps && \ + git clone --depth 1 --branch v0.12.0 https://github.com/stjude-rust-labs/fq.git && \ + cd fq && \ + cargo install --locked --path . && \ + mv target/release/fq /usr/local/bin/ && \ + cd / && rm -rf /fq + +runners: + - type: executable + - type: nextflow diff --git a/src/fq_subsample/help.txt b/src/fq_subsample/help.txt new file mode 100644 index 00000000..6f4a9acf --- /dev/null +++ b/src/fq_subsample/help.txt @@ -0,0 +1,20 @@ +``` +fq subsample -h +``` + +Outputs a subset of records + +Usage: fq subsample [OPTIONS] --r1-dst <--probability |--record-count > [R2_SRC] + +Arguments: + Read 1 source. Accepts both raw and gzipped FASTQ inputs + [R2_SRC] Read 2 source. Accepts both raw and gzipped FASTQ inputs + +Options: + -p, --probability The probability a record is kept, as a percentage (0.0, 1.0). Cannot be used with `record-count` + -n, --record-count The exact number of records to keep. Cannot be used with `probability` + -s, --seed Seed to use for the random number generator + --r1-dst Read 1 destination. Output will be gzipped if ends in `.gz` + --r2-dst Read 2 destination. 
Output will be gzipped if ends in `.gz` + -h, --help Print help + -V, --version \ No newline at end of file diff --git a/src/fq_subsample/script.sh b/src/fq_subsample/script.sh new file mode 100755 index 00000000..bcc81b40 --- /dev/null +++ b/src/fq_subsample/script.sh @@ -0,0 +1,26 @@ +#!/bin/bash + +## VIASH START +## VIASH END + +set -eo pipefail + + +required_args=("-p" "--probability" "-n" "--record_count") + +# exclusive OR for required arguments $par_probability and $par_record_count +if [[ -n $par_probability && -n $par_record_count ]] || [[ -z $par_probability && -z $par_record_count ]]; then + echo "FQ/SUBSAMPLE requires either --probability or --record_count to be specified" + exit 1 +fi + + +fq subsample \ + ${par_output_1:+--r1-dst "${par_output_1}"} \ + ${par_output_2:+--r2-dst "${par_output_2}"} \ + ${par_probability:+--probability "${par_probability}"} \ + ${par_record_count:+--record-count "${par_record_count}"} \ + ${par_seed:+--seed "${par_seed}"} \ + ${par_input_1} \ + ${par_input_2} + diff --git a/src/fq_subsample/test.sh b/src/fq_subsample/test.sh new file mode 100644 index 00000000..1de48e95 --- /dev/null +++ b/src/fq_subsample/test.sh @@ -0,0 +1,36 @@ +#!/bin/bash + +echo ">>> Testing $meta_executable" + +echo ">>> Testing for paired-end reads" +"$meta_executable" \ + --input_1 $meta_resources_dir/test_data/a.3.fastq.gz \ + --input_2 $meta_resources_dir/test_data/a.4.fastq.gz \ + --record_count 3 \ + --seed 1 \ + --output_1 a.1.subsampled.fastq \ + --output_2 a.2.subsampled.fastq + +echo ">> Checking if the correct files are present" +[ ! -f "a.1.subsampled.fastq" ] && echo "Subsampled FASTQ file for read 1 is missing!" && exit 1 +[ $(wc -l < a.1.subsampled.fastq) -ne 12 ] && echo "Subsampled FASTQ file for read 1 does not contain the expected number of records" && exit 1 +[ ! 
-f "a.2.subsampled.fastq" ] && echo "Subsampled FASTQ file for read 2 is missing" && exit 1 +[ $(wc -l < a.2.subsampled.fastq) -ne 12 ] && echo "Subsampled FASTQ file for read 2 does not contain the expected number of records" && exit 1 + +rm a.1.subsampled.fastq a.2.subsampled.fastq + +echo ">>> Testing for single-end reads" +"$meta_executable" \ + --input_1 $meta_resources_dir/test_data/a.3.fastq.gz \ + --record_count 3 \ + --seed 1 \ + --output_1 a.1.subsampled.fastq + + +echo ">> Checking if the correct files are present" +[ ! -f "a.1.subsampled.fastq" ] && echo "Subsampled FASTQ file is missing" && exit 1 +[ $(wc -l < a.1.subsampled.fastq) -ne 12 ] && echo "Subsampled FASTQ file does not contain the expected number of records" && exit 1 + +echo ">>> Tests finished successfully" +exit 0 + diff --git a/src/fq_subsample/test_data/a.3.fastq.gz b/src/fq_subsample/test_data/a.3.fastq.gz new file mode 100644 index 0000000000000000000000000000000000000000..3e38d06dc5213e2b60cf8feab54214ef6ae72095 GIT binary patch literal 292 zcmV+<0o(o`iwFopgw{AqTPku(#7Qf5avvPnFZM+Apx**Oa!?txr3QV-KZjV->mD8KoHv7#-z3Z4r{+?NKtIv%ALBp1?Z)Zpi_@6{>nZp{l zpnEW6Vuo5f3k)_CS+Enz@c0PCa|geeO1hrf6{+00VDfRbahk2}!7qJ+#!w%N%nT$& q=?+h;4lV-+mG4KNiZN0-nItN^Op$ogq>5BhG3pb)YY7nG0ssKu1BeL# literal 0 HcmV?d00001 diff --git a/src/fq_subsample/test_data/a.4.fastq.gz b/src/fq_subsample/test_data/a.4.fastq.gz new file mode 100644 index 0000000000000000000000000000000000000000..3164c6148650e36532545b7946efa9a16055db5d GIT binary patch literal 301 zcmV+|0n+{-iwFpdgVkmL17R*SE@okKba4QclD}%iFbs$HJcai{?fi98G@K%^_p4su zpdHG=4W&beK793aE?fgCy*k8_6@xxLe!66TNB^7^ZV)idU^Ud zeZIYXbyM3^!osXsSp}(z+L2i-5vyb?=8QX%SyifsYQ{=`tlNd^@PlcHb+G8JahHhM z8WtRM#0J7;1BHM|@mewSy+pUQA?nAj9oxxW<1dbKR7_YiGA zZiwo>i^DWVwk#hC~KOM9F)iI44T9dZJXoXJ35-@igEx-~s>uZ@7@Z literal 0 HcmV?d00001 From 320d044fe45e565fbc9772640ebf6f39c5584b4a Mon Sep 17 00:00:00 2001 From: emmarousseau Date: Mon, 9 Sep 2024 08:49:14 +0200 Subject: [PATCH 12/28] Sortmerna 
(#146) --- CHANGELOG.md | 3 + src/sortmerna/config.vsh.yaml | 290 ++++++++++++++++++++ src/sortmerna/help.txt | 319 ++++++++++++++++++++++ src/sortmerna/script.sh | 108 ++++++++ src/sortmerna/test.sh | 101 +++++++ src/sortmerna/test_data/rRNA/database1.fa | 24 ++ src/sortmerna/test_data/rRNA/database2.fa | 16 ++ src/sortmerna/test_data/reads_1.fq.gz | Bin 0 -> 189 bytes src/sortmerna/test_data/reads_2.fq.gz | Bin 0 -> 147 bytes src/sortmerna/test_data/script.sh | 8 + 10 files changed, 869 insertions(+) create mode 100644 src/sortmerna/config.vsh.yaml create mode 100644 src/sortmerna/help.txt create mode 100755 src/sortmerna/script.sh create mode 100644 src/sortmerna/test.sh create mode 100644 src/sortmerna/test_data/rRNA/database1.fa create mode 100644 src/sortmerna/test_data/rRNA/database2.fa create mode 100644 src/sortmerna/test_data/reads_1.fq.gz create mode 100644 src/sortmerna/test_data/reads_2.fq.gz create mode 100755 src/sortmerna/test_data/script.sh diff --git a/CHANGELOG.md b/CHANGELOG.md index 6534eed1..5041f082 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -142,6 +142,9 @@ - `bedtools_getfasta`: extract sequences from a FASTA file for each of the intervals defined in a BED/GFF/VCF file (PR #59). +* `sortmerna`: Local sequence alignment tool for mapping, clustering, and filtering rRNA from metatranscriptomic + data. (PR #146) + * `fq_subsample`: Sample a subset of records from single or paired FASTQ files (PR #147). ## MINOR CHANGES diff --git a/src/sortmerna/config.vsh.yaml b/src/sortmerna/config.vsh.yaml new file mode 100644 index 00000000..6477660f --- /dev/null +++ b/src/sortmerna/config.vsh.yaml @@ -0,0 +1,290 @@ +name: sortmerna +description: | + Local sequence alignment tool for filtering, mapping and clustering. The main + application of SortMeRNA is filtering rRNA from metatranscriptomic data. 
+keywords: [sort, mRNA, rRNA, alignment, filtering, mapping, clustering] +links: + homepage: https://sortmerna.readthedocs.io/en/latest/ + documentation: https://sortmerna.readthedocs.io/en/latest/manual4.0.html + repository: https://github.com/sortmerna/sortmerna +references: + doi: 10.1093/bioinformatics/bts611 +license: GPL-3.0 + +argument_groups: +- name: "Input" + arguments: + - name: "--paired" + type: boolean_true + description: | + Reads are paired-end. If a single reads file is provided, use this option + to indicate the file contains interleaved paired reads when neither + 'paired_in' | 'paired_out' | 'out2' | 'sout' are specified. + - name: "--input" + type: file + multiple: true + description: Input fastq + - name: "--ref" + type: file + multiple: true + description: Reference fasta file(s) for rRNA database. + - name: "--ribo_database_manifest" + type: file + description: Text file containing paths to fasta files (one per line) that will be used to create the database for SortMeRNA. + +- name: "Output" + arguments: + - name: "--log" + type: file + direction: output + must_exist: false + example: $id.sortmerna.log + description: Sortmerna log file. + - name: "--output" + alternatives: ["--aligned"] + type: string + description: | + Directory and file prefix for aligned output. The appropriate extension: + (fasta|fastq|blast|sam|etc) is automatically added. + If 'dir' is not specified, the output is created in the WORKDIR/out/. + If 'pfx' is not specified, the prefix 'aligned' is used. + - name: "--other" + type: string + description: Create Non-aligned reads output file with this path/prefix. Must be used with fastx. + +- name: "Options" + arguments: + - name: "--kvdb" + type: string + description: Path to directory of the key-value database file, used for storing the alignment results. + - name: "--idx_dir" + type: string + description: Path to the directory for storing the reference index files. 
+ - name: "--readb" + type: string + description: Path to the directory for storing pre-processed reads. + - name: "--fastx" + type: boolean_true + description: Output aligned reads into FASTA/FASTQ file + - name: "--sam" + type: boolean_true + description: Output SAM alignment for aligned reads. + - name: "--sq" + type: boolean_true + description: Add SQ tags to the SAM file + - name: "--blast" + type: string + description: | + Blast options: + * '0' - pairwise + * '1' - tabular(Blast - m 8 format) + * '1 cigar' - tabular + column for CIGAR + * '1 cigar qcov' - tabular + columns for CIGAR and query coverage + * '1 cigar qcov qstrand' - tabular + columns for CIGAR, query coverage and strand + choices: ['0', '1', '1 cigar', '1 cigar qcov', '1 cigar qcov qstrand'] + - name: "--num_alignments" + type: integer + description: | + Report first INT alignments per read reaching E-value. If Int = 0, all alignments will be output. Default: '0' + example: 0 + - name: "--min_lis" + type: integer + description: | + search all alignments having the first INT longest LIS. LIS stands for Longest Increasing Subsequence, it is + computed using seeds’ positions to expand hits into longer matches prior to Smith-Waterman alignment. Default: '2'. + example: 2 + - name: "--print_all_reads" + type: boolean_true + description: output null alignment strings for non-aligned reads to SAM and/or BLAST tabular files. + - name: "--paired_in" + type: boolean_true + description: | + In the case where a pair of reads is aligned with a score above the threshold, the output of the reads is controlled + by the following options: + * --paired_in and --paired_out are both false: Only one read per pair is output to the aligned fasta file. + * --paired_in is true and --paired_out is false: Both reads of the pair are output to the aligned fasta file. + * --paired_in is false and --paired_out is true: Both reads are output the the other fasta file (if it is specified). 
+ - name: "--paired_out" + type: boolean_true + description: See description of --paired_in. + - name: "--out2" + type: boolean_true + description: | + Output paired reads into separate files. Must be used with '--fastx'. If a single reads file is provided, this options + implies interleaved paired reads. When used with 'sout', four (4) output files for aligned reads will be generated: + 'aligned-paired-fwd, aligned-paired-rev, aligned-singleton-fwd, aligned-singleton-rev'. If 'other' option is also used, + eight (8) output files will be generated. + - name: "--sout" + type: boolean_true + description: | + Separate paired and singleton aligned reads. Must be used with '--fastx'. If a single reads file is provided, + this options implies interleaved paired reads. Cannot be used with '--paired_in' or '--paired_out'. + - name: "--zip_out" + type: string + description: | + Compress the output files. The possible values are: + * '1/true/t/yes/y' + * '0/false/f/no/n' + *'-1' (the same format as input - default) + The values are Not case sensitive. + choices: ['1', 'true', 't', 'yes', 'y', '0', 'false', 'f', 'no', 'n', '-1'] + example: "-1" + - name: "--match" + type: integer + description: | + Smith-Waterman score for a match (positive integer). Default: '2'. + example: 2 + - name: "--mismatch" + type: integer + description: | + Smith-Waterman penalty for a mismatch (negative integer). Default: '-3'. + example: -3 + - name: "--gap_open" + type: integer + description: | + Smith-Waterman penalty for introducing a gap (positive integer). Default: '5'. + example: 5 + - name: "--gap_ext" + type: integer + description: | + Smith-Waterman penalty for extending a gap (positive integer). Default: '2'. + example: 2 + - name: "--N" + type: integer + description: | + Smith-Waterman penalty for ambiguous letters (N’s) scored as --mismatch. Default: '-1'.\ + example: -1 + - name: "--a" + type: integer + description: | + Number of threads to use. Default: '1'. 
+ example: 1 + - name: "--e" + type: double + description: | + E-value threshold. Default: '1'. + example: 1 + - name: "--F" + type: boolean_true + description: Search only the forward strand. + - name: "--R" + type: boolean_true + description: Search only the reverse-complementary strand. + - name: "--num_alignment" + type: integer + description: | + Report first INT alignments per read reaching E-value (--num_alignments 0 signifies all alignments will be output). + Default: '-1' + example: -1 + - name: "--best" + type: integer + description: | + Report INT best alignments per read reaching E-value by searching --min_lis INT candidate alignments (--best 0 + signifies all candidate alignments will be searched) Default: '1'. + example: 1 + - name: "--verbose" + alternatives: ["-v"] + type: boolean_true + description: Verbose output. + +- name: "OTU picking options" + arguments: + - name: "--id" + type: double + description: | + %id similarity threshold (the alignment must still pass the E-value threshold). Default: '0.97'. + example: 0.97 + - name: "--coverage" + type: double + description: | + %query coverage threshold (the alignment must still pass the E-value threshold). Default: '0.97'. + example: 0.97 + - name: "--de_novo" + type: boolean_true + description: | + FASTA/FASTQ file for reads matching database < %id off (set using --id) and < %cov (set using --coverage) + (alignment must still pass the E-value threshold). + - name: "--otu_map" + type: boolean_true + description: | + Output OTU map (input to QIIME’s make_otu_table.py). + +- name: "Advanced options" + arguments: + - name: "--num_seed" + type: integer + description: | + Number of seeds matched before searching for candidate LIS. Default: '2'. + example: 2 + - name: "--passes" + type: integer + multiple: true + description: | + Three intervals at which to place the seed on the read L,L/2,3 (L is the seed length set in ./indexdb_rna). 
+ - name: "--edge" + type: string + description: | + The number (or percentage if followed by %) of nucleotides to add to each edge of the alignment region on the + reference sequence before performing Smith-Waterman alignment. Default: '4'. + example: 4 + - name: "--full_search" + type: boolean_true + description: | + Search for all 0-error and 1-error seed off matches in the index rather than stopping after finding a 0-error match + (<1% gain in sensitivity with up four-fold decrease in speed). + +- name: "Indexing Options" + arguments: + - name: "--index" + type: integer + description: | + Create index files for the reference database. By default when this option is not used, the program checks the + reference index and builds it if not already existing. + This can be changed by using '-index' as follows: + * '-index 0' - skip indexing. If the index does not exist, the program will terminate + and warn to build the index prior performing the alignment + * '-index 1' - only perform the indexing and terminate + * '-index 2' - the default behaviour, the same as when not using this option at all + example: 2 + choices: [0, 1, 2] + - name: "-L" + type: double + description: | + Indexing seed length. Default: '18' + example: 18 + - name: "--interval" + type: integer + description: | + Index every Nth L-mer in the reference database. Default: '1' + example: 1 + - name: "--max_pos" + type: integer + description: | + Maximum number of positions to store for each unique L-mer. Set to 0 to store all positions. 
Default: '1000' + example: 1000 + + + +resources: + - type: bash_script + path: script.sh + +test_resources: + - type: bash_script + path: test.sh + - path: test_data + +engines: +- type: docker + image: ubuntu:22.04 + setup: + - type: docker + run: | + apt-get update && \ + apt-get install -y --no-install-recommends gzip cmake g++ wget && \ + apt-get clean && \ + wget --no-check-certificate https://github.com/sortmerna/sortmerna/releases/download/v4.3.6/sortmerna-4.3.6-Linux.sh && \ + bash sortmerna-4.3.6-Linux.sh --skip-license +runners: +- type: executable +- type: nextflow \ No newline at end of file diff --git a/src/sortmerna/help.txt b/src/sortmerna/help.txt new file mode 100644 index 00000000..f0842707 --- /dev/null +++ b/src/sortmerna/help.txt @@ -0,0 +1,319 @@ +``` +sortmerna -h +``` + + + Program: SortMeRNA version 4.3.6 + Copyright: 2016-2020 Clarity Genomics BVBA: + Turnhoutseweg 30, 2340 Beerse, Belgium + 2014-2016 Knight Lab: + Department of Pediatrics, UCSD, La Jolla + 2012-2014 Bonsai Bioinformatics Research Group: + LIFL, University Lille 1, CNRS UMR 8022, INRIA Nord-Europe + Disclaimer: SortMeRNA comes with ABSOLUTELY NO WARRANTY; without even the + implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. + See the GNU Lesser General Public License for more details. 
+ Contributors: Jenya Kopylova jenya.kopylov@gmail.com + Laurent Noé laurent.noe@lifl.fr + Pierre Pericard pierre.pericard@lifl.fr + Daniel McDonald wasade@gmail.com + Mikaël Salson mikael.salson@lifl.fr + Hélène Touzet helene.touzet@lifl.fr + Rob Knight robknight@ucsd.edu + + Usage: sortmerna -ref FILE [-ref FILE] -reads FWD_READS [-reads REV_READS] [OPTIONS]: + ------------------------------------------------------------------------------------------------------------- + | option type-format description default | + ------------------------------------------------------------------------------------------------------------- + + [REQUIRED] + --ref PATH Required Reference file (FASTA) absolute or relative path. + + Use mutliple times, once per a reference file + + + --reads PATH Required Raw reads file (FASTA/FASTQ/FASTA.GZ/FASTQ.GZ). + + Use twice for files with paired reads. + The file extensions are Not important. The program automatically + recognizes the file format as flat/compressed, fasta/fastq + + + + [COMMON] + --workdir PATH Optional Workspace directory USRDIR/sortmerna/run/ + + Default structure: WORKDIR/ + idx/ (References index) + kvdb/ (Key-value storage for alignments) + out/ (processing output) + readb/ (pre-processed reads/index) + + + --kvdb PATH Optional Directory for Key-value database WORKDIR/kvdb + + KVDB is used for storing the alignment results. + + + --idx-dir PATH Optional Directory for storing Reference index. WORKDIR/idx + + + --readb PATH Optional Storage for pre-processed reads WORKDIR/readb/ + + Directory storing the split reads, or the random access index of compressed reads + + + --fastx BOOL Optional Output aligned reads into FASTA/FASTQ file + --sam BOOL Optional Output SAM alignment for aligned reads. 
+ + + --SQ BOOL Optional Add SQ tags to the SAM file + + + --blast STR Optional output alignments in various Blast-like formats + + Sample values: '0' - pairwise + '1' - tabular (Blast - m 8 format) + '1 cigar' - tabular + column for CIGAR + '1 cigar qcov' - tabular + columns for CIGAR and query coverage + '1 cigar qcov qstrand' - tabular + columns for CIGAR, query coverage, + and strand + + + --aligned STR/BOOL Optional Aligned reads file prefix [dir/][pfx] WORKDIR/out/aligned + + Directory and file prefix for aligned output i.e. each + output file goes into the specified directory with the given prefix. + The appropriate extension: (fasta|fastq|blast|sam|etc) is automatically added. + Both 'dir' and 'pfx' are optional. + The 'dir' can be a relative or an absolute path. + If 'dir' is not specified, the output is created in the WORKDIR/out/ + If 'pfx' is not specified, the prefix 'aligned' is used + Examples: + '-aligned $MYDIR/dir_1/dir_2/1' -> $MYDIR/dir_1/dir_2/1.fasta + '-aligned dir_1/apfx' -> $PWD/dir_1/apfx.fasta + '-aligned dir_1/' -> $PWD/aligned.fasta + '-aligned apfx' -> $PWD/apfx.fasta + '-aligned (no argument)' -> WORKDIR/out/aligned.fasta + + + --other STR/BOOL Optional Non-aligned reads file prefix [dir/][pfx] WORKDIR/out/other + + Directory and file prefix for non-aligned output i.e. each + output file goes into the specified directory with the given prefix. + The appropriate extension: (fasta|fastq|blast|sam|etc) is automatically added. + Must be used with 'fastx'. + Both 'dir' and 'pfx' are optional. + The 'dir' can be a relative or an absolute path. + If 'dir' is not specified, the output is created in the WORKDIR/out/ + If 'pfx' is not specified, the prefix 'other' is used + Examples: + '-other $MYDIR/dir_1/dir_2/1' -> $MYDIR/dir_1/dir_2/1.fasta + '-other dir_1/apfx' -> $PWD/dir_1/apfx.fasta + '-other dir_1/' -> $PWD/dir_1/other.fasta + '-other apfx' -> $PWD/apfx.fasta + '-other (no argument)' -> aligned_out/other.fasta + i.e. 
the same output directory + as used for aligned output + + + --num_alignments INT Optional Positive integer (INT >=0). + + If used with '-no-best' reports first INT alignments per read reaching + E-value threshold, which allows to lower the CPU time and memory use. + Otherwise outputs INT best alignments. + If INT = 0, all alignments are output + + + --no-best BOOL Optional Disable best alignments search False + + The 'best' alignment is the highest scoring alignment out of All alignments of a read, + and the read can potentially be aligned (reaching E-value threshold) to multiple reference + sequences. + By default the program searches for best alignments i.e. performs an exhaustive search + over all references. Using '-no-best' will make the program to search just + the first N alignments, where N is set using '-num_alignments' i.e. 1 by default. + + + --min_lis INT Optional Search only alignments that have the LIS 2 + of at least N seeds long + + LIS stands for Longest Increasing Subsequence. It is computed using seeds, which + are k-mers common to the read and the reference sequence. Sorted sequences of such seeds + are used to filter the candidate references prior performing the Smith-Waterman alignment. + + + --print_all_reads BOOL Optional Output null alignment strings for non-aligned reads False + to SAM and/or BLAST tabular files + + --paired BOOL Optional Flags paired reads False + + If a single reads file is provided, use this option to indicate + the file contains interleaved paired reads when neither + 'paired_in' | 'paired_out' | 'out2' | 'sout' are specified. + + + --paired_in BOOL Optional Flags the paired-end reads as Aligned, False + when either of them is Aligned. + + With this option both reads are output into Aligned FASTA/Q file + Must be used with 'fastx'. + Mutually exclusive with 'paired_out'. + + + --paired_out BOOL Optional Flags the paired-end reads as Non-aligned, False + when either of them is non-aligned. 
+ + With this option both reads are output into Non-Aligned FASTA/Q file + Must be used with 'fastx'. + Mutually exclusive with 'paired_in'. + + + --out2 BOOL Optional Output paired reads into separate files. False + + Must be used with 'fastx'. + If a single reads file is provided, this options implies interleaved paired reads + When used with 'sout', four (4) output files for aligned reads will be generated: + 'aligned-paired-fwd, aligned-paired-rev, aligned-singleton-fwd, aligned-singleton-rev'. + If 'other' option is also used, eight (8) output files will be generated. + + + --sout BOOL Optional Separate paired and singleton aligned reads. False + + To be used with 'fastx'. + If a single reads file is provided, this options implies interleaved paired reads + Cannot be used with 'paired_in' | 'paired_out' + + + --zip-out STR/BOOL Optional Controls the output compression '-1' + + By default the report files are produced in the same format as the input i.e. + if the reads files are compressed (gz), the output is also compressed. + The default behaviour can be overriden by using '-zip-out'. + The possible values: '1/true/t/yes/y' + '0/false/f/no/n' + '-1' (the same format as input - default) + The values are Not case sensitive i.e. 'Yes, YES, yEs, Y, y' are all OK + Examples: + '-reads freads.gz -zip-out n' : generate flat output when the input is compressed + '-reads freads.flat -zip-out' : compress the output when the input files are flat + + + --match INT Optional SW score (positive integer) for a match. 2 + + --mismatch INT Optional SW penalty (negative integer) for a mismatch. -3 + + --gap_open INT Optional SW penalty (positive integer) for introducing a gap. 5 + + --gap_ext INT Optional SW penalty (positive integer) for extending a gap. 2 + + -e DOUBLE Optional E-value threshold. 1 + + Defines the 'statistical significance' of a local alignment. + Exponentially correllates with the Minimal Alignment score. + Higher E-values (100, 1000, ...) 
cause More reads to Pass the alignment threshold + + + -F BOOL Optional Search only the forward strand. False + + -N BOOL Optional SW penalty for ambiguous letters (N's) scored + as --mismatch + + -R BOOL Optional Search only the reverse-complementary strand. False + + + [OTU_PICKING] + --id INT Optional %%id similarity threshold (the alignment 0.97 + must still pass the E-value threshold). + + --coverage INT Optional %%query coverage threshold (the alignment must 0.97 + still pass the E-value threshold) + + --de_novo_otu BOOL Optional Output FASTA file with 'de novo' reads False + + Read is 'de novo' if its alignment score passes E-value threshold, but both the identity + '-id', and the '-coverage' are below their corresponding thresholds + i.e. ID < %%id and COV < %%cov + + + --otu_map BOOL Optional Output OTU map (input to QIIME's make_otu_table.py). False + Cannot be used with 'no-best because + the grouping is done around the best alignment' + + + [ADVANCED] + --passes INT,INT,INT Optional Three intervals at which to place the seed on L,L/2,3 + the read (L is the seed length) + + --edges INT Optional Number (or percent if INT followed by %% sign) of 4 + nucleotides to add to each edge of the read + prior to SW local alignment + + --num_seeds BOOL Optional Number of seeds matched before searching 2 + for candidate LIS + + --full_search INT Optional Search for all 0-error and 1-error seed False + matches in the index rather than stopping + after finding a 0-error match (<1%% gain in + sensitivity with up four-fold decrease in speed) + + --pid BOOL Optional Add pid to output file names. False + + -a INT Optional DEPRECATED in favour of '-threads'. Number of numCores + processing threads to use. 
+ Automatically redirects to '-threads' + + --threads INT Optional Number of Processing threads to use 2 + + + [INDEXING] + --index INT Optional Build reference database index 2 + + By default when this option is not used, the program checks the reference index and + builds it if not already existing. + This can be changed by using '-index' as follows: + '-index 0' - skip indexing. If the index does not exist, the program will terminate + and warn to build the index prior performing the alignment + '-index 1' - only perform the indexing and terminate + '-index 2' - the default behaviour, the same as when not using this option at all + + + -L DOUBLE Optional Indexing: seed length. 18 + + -m DOUBLE Optional Indexing: the amount of memory (in Mbytes) for 3072 + building the index. + + -v BOOL Optional Produce verbose output when building the index True + + --interval INT Optional Indexing: Positive integer: index every Nth L-mer in 1 + the reference database e.g. '-interval 2'. + + --max_pos INT Optional Indexing: maximum (integer) number of positions to 1000 + store for each unique L-mer. + If 0 - all positions are stored. + + + [HELP] + -h BOOL Optional Print help information + + --version BOOL Optional Print SortMeRNA version number + + + [DEVELOPER] + --dbg_put_db BOOL Optional + --cmd BOOL Optional Launch an interactive session (command prompt) False + + --task INT Optional Processing Task 4 + + Possible values: 0 - align. Only perform alignment + 1 - post-processing (log writing) + 2 - generate reports + 3 - align and post-process + 4 - all + + + --dbg-level INT Optional Debug level 0 + + Controls verbosity of the execution trace. Default value of 0 corresponds to + the least verbose output. + The highest value currently is 2. 
diff --git a/src/sortmerna/script.sh b/src/sortmerna/script.sh new file mode 100755 index 00000000..8dda3d60 --- /dev/null +++ b/src/sortmerna/script.sh @@ -0,0 +1,108 @@ +#!/bin/bash + +## VIASH START +## VIASH END + +set -eo pipefail + +unset_if_false=( par_fastx par_sq par_fastx par_print_all_reads par_paired_in par_paired_out + par_F par_R par_verbose par_de_novo par_otu_map par_full_search par_out2 + par_sout par_sam par_paired ) + + +for var in "${unset_if_false[@]}"; do + if [ "${!var}" == "false" ]; then + unset $var + fi +done + +reads=() +IFS=";" read -ra input <<< "$par_input" +if [ "${#input[@]}" -eq 2 ]; then + reads="--reads ${input[0]} --reads ${input[1]}" + # set paired to true in case it's not + par_paired=true +else + reads="--reads ${input[0]}" + par_paired=false +fi + +refs=() + +# check if references are input normally or through a manifest file +if [[ ! -z "$par_ribo_database_manifest" ]]; then + while IFS= read -r path || [[ -n $path ]]; do + refs=$refs" --ref $path" + done < $par_ribo_database_manifest + +elif [[ ! -z "$par_ref" ]]; then + IFS=";" read -ra ref <<< "$par_ref" + # check if length is 2 and par_paired is set to true + if [[ "${#ref[@]}" -eq 2 && "$par_paired" == "true" ]]; then + refs="--ref ${ref[0]} --ref ${ref[1]}" + # check if length is 1 and par_paired is set to false + elif [[ "${#ref[@]}" -eq 1 && "$par_paired" == "false" ]]; then + refs="--ref $par_ref" + else # if one reference provided but paired is set to true: + echo "Two reference fasta files are required for paired-end reads" + exit 1 + fi +else + echo "No reference fasta file(s) provided" + exit 1 +fi + + +sortmerna \ + $refs \ + $reads \ + --workdir . 
\ + ${par_output:+--aligned "${par_output}"} \ + ${par_fastx:+--fastx} \ + ${par_other:+--other "${par_other}"} \ + ${par_kvdb:+--kvdb "${par_kvdb}"} \ + ${par_idx_dir:+--idx-dir "${par_idx_dir}"} \ + ${par_readb:+--readb "${par_readb}"} \ + ${par_sam:+--sam} \ + ${par_sq:+--sq} \ + ${par_blast:+--blast "${par_blast}"} \ + ${par_num_alignments:+--num_alignments "${par_num_alignments}"} \ + ${par_min_lis:+--min_lis "${par_min_lis}"} \ + ${par_print_all_reads:+--print_all_reads} \ + ${par_paired_in:+--paired_in} \ + ${par_paired_out:+--paired_out} \ + ${par_out2:+--out2} \ + ${par_sout:+--sout} \ + ${par_zip_out:+--zip-out "${par_zip_out}"} \ + ${par_match:+--match "${par_match}"} \ + ${par_mismatch:+--mismatch "${par_mismatch}"} \ + ${par_gap_open:+--gap_open "${par_gap_open}"} \ + ${par_gap_ext:+--gap_ext "${par_gap_ext}"} \ + ${par_N:+-N "${par_N}"} \ + ${par_a:+-a "${par_a}"} \ + ${par_e:+-e "${par_e}"} \ + ${par_F:+-F} \ + ${par_R:+-R} \ + ${par_num_alignment:+--num_alignment "${par_num_alignment}"} \ + ${par_best:+--best "${par_best}"} \ + ${par_verbose:+--verbose} \ + ${par_id:+--id "${par_id}"} \ + ${par_coverage:+--coverage "${par_coverage}"} \ + ${par_de_novo:+--de_novo} \ + ${par_otu_map:+--otu_map} \ + ${par_num_seed:+--num_seed "${par_num_seed}"} \ + ${par_passes:+--passes "${par_passes}"} \ + ${par_edge:+--edge "${par_edge}"} \ + ${par_full_search:+--full_search} \ + ${par_index:+--index "${par_index}"} \ + ${par_L:+-L $par_L} \ + ${par_interval:+--interval "${par_interval}"} \ + ${par_max_pos:+--max_pos "${par_max_pos}"} + + +if [ ! 
-z $par_log ]; then + mv "${par_output}.log" $par_log +fi + +exit 0 + diff --git a/src/sortmerna/test.sh b/src/sortmerna/test.sh new file mode 100644 index 00000000..390b9307 --- /dev/null +++ b/src/sortmerna/test.sh @@ -0,0 +1,101 @@ +#!/bin/bash + +echo ">>> Testing $meta_functionality_name" + +find $meta_resources_dir/test_data/rRNA -type f > test_data/rrna-db.txt + +echo ">>> Testing for paired-end reads and database manifest" +# out2 separates the read pairs into two files (one fwd and one rev) +# paired_in outputs both reads of a pair +# other is the output file for non-rRNA reads +"$meta_executable" \ + --output "rRNA_reads" \ + --other "non_rRNA_reads" \ + --input "$meta_resources_dir/test_data/reads_1.fq.gz;$meta_resources_dir/test_data/reads_2.fq.gz" \ + --ribo_database_manifest test_data/rrna-db.txt \ + --log test_log.log \ + --paired_in \ + --fastx \ + --out2 + + +echo ">> Checking if the correct files are present" +[[ -f "rRNA_reads_fwd.fq.gz" ]] || [[ -f "rRNA_reads_rev.fq.gz" ]] || { echo "rRNA output fastq file is missing!"; exit 1; } +[[ -s "rRNA_reads_fwd.fq.gz" ]] && [[ -s "rRNA_reads_rev.fq.gz" ]] || { echo "rRNA output fastq file is empty!"; exit 1; } +[[ -f "non_rRNA_reads_fwd.fq.gz" ]] || [[ -f "non_rRNA_reads_rev.fq.gz" ]] || { echo "Non-rRNA output fastq file is missing!"; exit 1;} +gzip -dk non_rRNA_reads_fwd.fq.gz +gzip -dk non_rRNA_reads_rev.fq.gz +[[ ! -s "non_rRNA_reads_fwd.fq" ]] && [[ ! 
-s "non_rRNA_reads_rev.fq" ]] || { echo "Non-rRNA output fastq file is not empty!"; exit 1;} + +rm -f rRNA_reads_fwd.fq.gz rRNA_reads_rev.fq.gz non_rRNA_reads_fwd.fq.gz non_rRNA_reads_rev.fq.gz test_log.log +rm -rf kvdb/ + +################################################################################ +echo ">>> Testing for paired-end reads and --ref and --paired_out argumens" +"$meta_executable" \ + --output "rRNA_reads" \ + --other "non_rRNA_reads" \ + --input "$meta_resources_dir/test_data/reads_1.fq.gz;$meta_resources_dir/test_data/reads_2.fq.gz" \ + --ref "$meta_resources_dir/test_data/rRNA/database1.fa;$meta_resources_dir/test_data/rRNA/database2.fa" \ + --log test_log.log \ + --paired_out \ + --fastx \ + --out2 + +echo ">> Checking if the correct files are present" +[[ -f "rRNA_reads_fwd.fq.gz" ]] || [[ -f "rRNA_reads_rev.fq.gz" ]] || { echo "rRNA output fastq file is missing!"; exit 1; } +gzip -dkf rRNA_reads_fwd.fq.gz +[[ ! -s "rRNA_reads_fwd.fq" ]] && [[ ! -s "rRNA_reads_rev.fq" ]] || { echo "rRNA output fastq file is not empty!"; exit 1; } +[[ -f "non_rRNA_reads_fwd.fq.gz" ]] || [[ -f "non_rRNA_reads_rev.fq.gz" ]] || { echo "Non-rRNA output fastq file is missing!"; exit 1;} +gzip -dkf non_rRNA_reads_fwd.fq.gz +gzip -dkf non_rRNA_reads_rev.fq.gz +[[ -s "non_rRNA_reads_fwd.fq" ]] && [[ -s "non_rRNA_reads_rev.fq" ]] || { echo "Non-rRNA output fastq file is empty!"; exit 1; } + +rm -f rRNA_reads_fwd.fq.gz rRNA_reads_rev.fq.gz non_rRNA_reads_fwd.fq.gz non_rRNA_reads_rev.fq.gz test_log.log +rm -rf kvdb/ + +################################################################################ + +echo ">>> Testing for single-end reads and --ref argument" +"$meta_executable" \ + --aligned "rRNA_reads" \ + --other "non_rRNA_reads" \ + --input $meta_resources_dir/test_data/reads_1.fq.gz \ + --ref $meta_resources_dir/test_data/rRNA/database1.fa \ + --log test_log.log \ + --fastx + +echo ">> Checking if the correct files are present" +[[ ! 
-f "rRNA_reads.fq.gz" ]] && echo "rRNA output fastq file is missing!" && exit 1 +gzip -dk rRNA_reads.fq.gz +[[ -s "rRNA_reads.fq" ]] && echo "rRNA output fastq file is not empty!" && exit 1 +[[ ! -f "non_rRNA_reads.fq.gz" ]] && echo "Non-rRNA output fastq file is missing!" && exit 1 +[[ ! -s "non_rRNA_reads.fq.gz" ]] && echo "Non-rRNA output fastq file is empty!" && exit 1 + +rm -f rRNA_reads.fq.gz non_rRNA_reads.fq.gz test_log.log +rm -rf kvdb/ + +################################################################################ + +echo ">>> Testing for single-end reads with singleton output files" +"$meta_executable" \ + --aligned "rRNA_reads" \ + --other "non_rRNA_reads" \ + --input "$meta_resources_dir/test_data/reads_1.fq.gz;$meta_resources_dir/test_data/reads_2.fq.gz" \ + --ribo_database_manifest test_data/rrna-db.txt \ + --log test_log.log \ + --fastx \ + --sout + +echo ">> Checking if the correct files are present" +[[ ! -f "rRNA_reads_paired.fq.gz" ]] && echo "Aligned paired fwd output fastq file is missing!" && exit 1 +[[ ! -f "rRNA_reads_singleton.fq.gz" ]] && echo "Aligned singleton fwd output fastq file is missing!" && exit 1 +[[ ! -f "non_rRNA_reads_fwd.fq" ]] && echo "Non-rRNA fwd output fastq file is missing!" && exit 1 +[[ ! -f "non_rRNA_reads_rev.fq" ]] && echo "Non-rRNA rev output fastq file is missing!" && exit 1 +[[ ! -f "non_rRNA_reads_singleton.fq.gz" ]] && echo "Non-rRNA singleton output fastq file is missing!" && exit 1 +[[ ! -f "non_rRNA_reads_paired.fq.gz" ]] && echo "Non-rRNA paired output fastq file is missing!" 
&& exit 1 + + + +echo ">>> All tests passed" +exit 0 \ No newline at end of file diff --git a/src/sortmerna/test_data/rRNA/database1.fa b/src/sortmerna/test_data/rRNA/database1.fa new file mode 100644 index 00000000..bae23aba --- /dev/null +++ b/src/sortmerna/test_data/rRNA/database1.fa @@ -0,0 +1,24 @@ +>AY846379.1.1791 Eukaryota;Archaeplastida;Chloroplastida;Chlorophyta;Chlorophyceae;Sphaeropleales;Monoraphidium;Monoraphidium sp. Itas 9/21 14-6w +CCUGGUUGAUCCUGCCAGUAGUCAUAUGCUUGUCUCAAAGAUUAAGCCAUGCAUGUCUAAGUAUAAACUGCUUAUACUGU +GAAACUGCGAAUGGCUCAUUAAAUCAGUUAUAGUUUAUUUGAUGGUACCUCUACACGGAUAACCGUAGUAAUUCUAGAGC +UAAUACGUGCGUAAAUCCCGACUUCUGGAAGGGACGUAUUUAUUAGAUAAAAGGCCGACCGAGCUUUGCUCGACCCGCGG +UGAAUCAUGAUAACUUCACGAAUCGCAUAGCCUUGUGCUGGCGAUGUUUCAUUCAAAUUUCUGCCCUAUCAACUUUCGAU +GGUAGGAUAGAGGCCUACCAUGGUGGUAACGGGUGACGGAGGAUUAGGGUUCGAUUCCGGAGAGGGAGCCUGAGAAACGG +CUACCACAUCCAAGGAAGGCAGCAGGCGCGCAAAUUACCCAAUCCUGAUACGGGGAGGUAGUGACAAUAAAUAACAAUGC +CGGGCAUUUCAUGUCUGGCAAUUGGAAUGAGUACAAUCUAAAUCCCUUAACGAGGAUCAAUUGGAGGGCAAGUCUGGUGC +CAGCAGCCGCGGUAAUUCCAGCUCCAAUAGCGUAUAUUUAAGUUGUUGCAGUUAAAAAGCUCGUAGUUGGAUUUCGGGUG +GGUUCCAGCGGUCCGCCUAUGGUGAGUACUGCUGUGGCCCUCCUUUUUGUCGGGGACGGGCUCCUGGGCUUCAUUGUCCG +GGACUCGGAGUCGACGAUGAUACUUUGAGUAAAUUAGAGUGUUCAAAGCAAGCCUACGCUCUGAAUACUUUAGCAUGGAA +UAUCGCGAUAGGACUCUGGCCUAUCUCGUUGGUCUGUAGGACCGGAGUAAUGAUUAAGAGGGACAGUCGGGGGCAUUCGU +AUUUCAUUGUCAGAGGUGAAAUUCUUGGAUUUAUGAAAGACGAACUACUGCGAAAGCAUUUGCCAAGGAUGUUUUCAUUA +AUCAAGAACGAAAGUUGGGGGCUCGAAGACGAUUAGAUACCGUCGUAGUCUCAACCAUAAACGAUGCCGACUAGGGAUUG +GAGGAUGUUCUUUUGAUGACUUCUCCAGCACCUUAUGAGAAAUCAAAGUUUUUGGGUUCCGGGGGGAGUAUGGUCGCAAG +GCUGAAACUUAAAGGAAUUGACGGAAGGGCACCACCAGGCGUGGAGCCUGCGGCUUAAUUUGACUCAACACGGGAAAACU +UACCAGGUCCAGACAUAGUGAGGAUUGACAGAUUGAGAGCUCUUUCUUGAUUCUAUGGGUGGUGGUGCAUGGCCGUUCUU +AGUUGGUGGGUUGCCUUGUCAGGUUGAUUCCGGUAACGAACGAGACCUCAGCCUGCUAAAUAUGUCACAUUCGCUUUUUG +CGGAUGGCCGACUUCUUAGAGGGACUAUUGGCGUUUAGUCAAUGGAAGUAUGAGGCAAUAACAGGUCUGUGAUGCCCUUA 
+GAUGUUCUGGGCCGCACGCGCGCUACACUGACGCAUUCAGCAAGCCUAUCCUUGACCGAGAGGUCUGGGUAAUCUUUGAA +ACUGCGUCGUGAUGGGGAUAGAUUAUUGCAAUUAUUAGUCUUCAACGAGGAAUGCCUAGUAAGCGCAAGUCAUCAGCUUG +CGUUGAUUACGUCCCUGCCCUUUGUACACACCGCCCGUCGCUCCUACCGAUUGGGUGUGCUGGUGAAGUGUUCGGAUUGG +CAGAGCGGGUGGCAACACUUGCUUUUGCCGAGAAGUUCAUUAAACCCUCCCACCUAGAGGAAGGAGAAGUCGUAACAAGG +UUUCCGUAGGUGAACCUGCAGAAG \ No newline at end of file diff --git a/src/sortmerna/test_data/rRNA/database2.fa b/src/sortmerna/test_data/rRNA/database2.fa new file mode 100644 index 00000000..87b5bc99 --- /dev/null +++ b/src/sortmerna/test_data/rRNA/database2.fa @@ -0,0 +1,16 @@ +>AB001445.1.1538 Bacteria;Proteobacteria;Gammaproteobacteria;Pseudomonadales;Pseudomonadaceae;Pseudomonas;Pseudomonas amygdali pv. morsprunorum +AGAGUUUGAUCAUGGCUCAGAUUGAACGCUGGCGGCAGGCCUAACACAUGCAAGUCGAGCGGCAGCACGGGUACUUGUAC +CUGGUGGCGAGCGGCGGACGGGUGAGUAAUGCCUAGGAAUCUGCCUGGUAGUGGGGGAUAACGCUCGGAAACGGACGCUA +AUACCGCAUACGUCCUACGGGAGAAAGCAGGGGACCUUCGGGCCUUGCGCUAUCAGAUGAGCCUAGGUCGGAUUAGCUAG +UUGGUGAGGUAAUGGCUCACCAAGGCGACGAUCCGUAACUGGUCUGAGAGGAUGAUCAGUCACACUGGAACUGAGACACG +GUCCAGACUCCUACGGGAGGCAGCAGUGGGGAAUAUUGGACAAUGGGCGAAAGCCUGAUCCAGCCAUGCCGCGUGUGUGA +AGAAGGUCUUCGGAUUGUAAAGCACUUUAAGUUGGGAGGAAGGGCAGUUACCUAAUACGUAUCUGUUUUGACGUUACCGA +CAGAAUAAGCACCGGCUAACUCUGUGCCAGCAGCCGCGGUAAUACAGAGGGUGCAAGCGUUAAUCGGAAUUACUGGGCGU +AAAGCGCGCGUAGGUGGUUUGUUAAGUUGAAUGUGAAAUCCCCGGGCUCAACCUGGGAACUGCAUCCAAAACUGGCAAGC +UAGAGUAUGGUAGAGGGUGGUGGAAUUUCCUGUGUAGCGGUGAAAUGCGUAGAUAUAGGAAGGAACACCAGUGGCGAAGG +CGACCACCUGGACUGAUACUGACACUGAGGUGCGAAAGCGUGGGGAGCAAACAGGAUUAGAUACCCUGGUAGUCCACGCC +GUAAACGAUGUCAACUAGCCGUUGGGAGCCUUGAGCUCUUAGUGGCGCAGCUAACGCAUUAAGUUGACCGCCUGGGGAGU +ACGGCCGCAAGGUUAAAACUCAAAUGAAUUGACGGGGGCCCGCACAAGCGGUGGAGCAUGUGGUUUAAUUCGAAGCAACG +CGAAGAACCUUACCAGGCCUUGACAUCCAAUGAAUCCUUUAGAGAUAGAGGAGUGCCUUCGGGAGCAUUGAGACAGGUGC +UGCAUGGCUGUCGUCAGCUCGUGUCGUGAGAUGUUGGGUUAAGUCCCGUAACGAGCGCAACCCUUGUCCUUAGUUACCAG +CACGUCAUGGUGGGCACUCUAAGGAGACUGCCGGUGACAAACCGGAGGAAGGUGGGGAUGACGUCAAGUCAUCAUGGCCC diff --git 
a/src/sortmerna/test_data/reads_1.fq.gz b/src/sortmerna/test_data/reads_1.fq.gz new file mode 100644 index 0000000000000000000000000000000000000000..41c02a22dbbae13db84acf1e79bc4fc3fa8589e6 GIT binary patch literal 189 zcmV;u07CyCiwFo$iqvKR19D|yWOH9JE@p86wU0dx!Y~Yl_nZQWu>(o}P^}JqwIX+b zPO-%OPl6LsC<}stmpJjW<4E7M&Ycg1S#Apj3L$tq`=RbA_@+MuTFFyl z78alq=EMofi84dLb}3jyAYujED%nAymVqrZpNFl>Dky^%D&v_(cRIcb rrwC*NyuH|rwZ}M?)J Date: Tue, 10 Sep 2024 15:51:12 +0200 Subject: [PATCH 13/28] Bcftools stats (#142) * Initial Commit * Adding options to config * Update on script * update * Adding test 2 and 3 * Update on config and test * adding more tests * debugging and adding tests * Adding last tests * removing test_data dir * Update CHANGELOG.md * small changes * small change in help file * Requested changes --------- Co-authored-by: Jakub Majercik <57993790+jakubmajercik@users.noreply.github.com> --- CHANGELOG.md | 1 + src/bcftools/bcftools_stats/config.vsh.yaml | 240 +++++++++++++++ src/bcftools/bcftools_stats/help.txt | 35 +++ src/bcftools/bcftools_stats/script.sh | 56 ++++ src/bcftools/bcftools_stats/test.sh | 306 ++++++++++++++++++++ 5 files changed, 638 insertions(+) create mode 100644 src/bcftools/bcftools_stats/config.vsh.yaml create mode 100644 src/bcftools/bcftools_stats/help.txt create mode 100644 src/bcftools/bcftools_stats/script.sh create mode 100644 src/bcftools/bcftools_stats/test.sh diff --git a/CHANGELOG.md b/CHANGELOG.md index 5041f082..2dd152bb 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -42,6 +42,7 @@ * `rsem/rsem_prepare_reference`: Prepare transcript references for RSEM (PR #89). * `bcftools`: + - `bcftools/bcftools_stats`: Parses VCF or BCF and produces a txt stats file which can be plotted using plot-vcfstats (PR #142). - `bcftools/bcftools_sort`: Sorts BCF/VCF files by position and other criteria (PR #141). * `fastqc`: High throughput sequence quality control analysis tool (PR #92). 
diff --git a/src/bcftools/bcftools_stats/config.vsh.yaml b/src/bcftools/bcftools_stats/config.vsh.yaml new file mode 100644 index 00000000..8fb57f7a --- /dev/null +++ b/src/bcftools/bcftools_stats/config.vsh.yaml @@ -0,0 +1,240 @@ +name: bcftools_stats +namespace: bcftools +description: | + Parses VCF or BCF and produces a txt stats file which can be plotted using plot-vcfstats. + When two files are given, the program generates separate stats for intersection + and the complements. By default only sites are compared, -s/-S must given to include + also sample columns. +keywords: [Stats, VCF, BCF] +links: + homepage: https://samtools.github.io/bcftools/ + documentation: https://samtools.github.io/bcftools/bcftools.html#stats + repository: https://github.com/samtools/bcftools + issue_tracker: https://github.com/samtools/bcftools/issues +references: + doi: https://doi.org/10.1093/gigascience/giab008 +license: MIT/Expat, GNU +requirements: + commands: [bcftools] +authors: + - __merge__: /src/_authors/theodoro_gasperin.yaml + roles: [ author ] + +argument_groups: + - name: Inputs + arguments: + - name: --input + alternatives: -i + type: file + multiple: true + description: Input VCF/BCF file. Maximum of two files. + required: true + + - name: Outputs + arguments: + - name: --output + alternatives: -o + direction: output + type: file + description: Output txt statistics file. + required: true + + - name: Options + arguments: + + - name: --allele_frequency_bins + alternatives: --af_bins + type: string + description: | + Allele frequency bins, a list of bin values (0.1,0.5,1). + example: 0.1,0.5,1 + + - name: --allele_frequency_bins_file + alternatives: --af_bins_file + type: file + description: | + Same as allele_frequency_bins, but in a file. + Format of file is one value per line. + e.g. + 0.1 + 0.5 + 1 + + - name: --allele_frequency_tag + alternatives: --af_tag + type: string + description: | + Allele frequency tag to use, by default estimated from AN,AC or GT. 
+ + - name: --first_allele_only + alternatives: --first_only + type: boolean_true + description: | + Include only 1st allele at multiallelic sites. + + - name: --collapse + alternatives: --c + type: string + choices: [ snps, indels, both, all, some, none ] + description: | + Treat as identical records with . + See https://samtools.github.io/bcftools/bcftools.html#common_options for details. + + - name: --depth + alternatives: --d + type: string + description: | + Depth distribution: min,max,bin size. + example: 0,500,1 + + - name: --exclude + alternatives: --e + type: string + description: | + Exclude sites for which the expression is true. + See https://samtools.github.io/bcftools/bcftools.html#expressions for details. + example: 'QUAL < 30 && DP < 10' + + - name: --exons + alternatives: --E + type: file + description: | + tab-delimited file with exons for indel frameshifts statistics. + The columns of the file are CHR, FROM, TO, with 1-based, inclusive, positions. + The file is BGZF-compressed and indexed with tabix (e.g. tabix -s1 -b2 -e3 file.gz). + + - name: --apply_filters + alternatives: --f + type: string + description: | + Require at least one of the listed FILTER strings (e.g. "PASS,."). + + - name: --fasta_reference + alternatives: --F + type: file + description: | + Faidx indexed reference sequence file to determine INDEL context. + + - name: --include + alternatives: --i + type: string + description: | + Select sites for which the expression is true. + See https://samtools.github.io/bcftools/bcftools.html#expressions for details. + example: 'QUAL >= 30 && DP >= 10' + + - name: --split_by_ID + alternatives: --I + type: boolean_true + description: | + Collect stats for sites with ID separately (known vs novel). + + - name: --regions + alternatives: --r + type: string + description: | + Restrict to comma-separated list of regions. + Following formats are supported: chr|chr:pos|chr:beg-end|chr:beg-[,…​]. 
+ example: '20:1000000-2000000' + + - name: --regions_file + alternatives: --R + type: file + description: | + Restrict to regions listed in a file. + Regions can be specified either on a VCF, BED, or tab-delimited file (the default). + For more information check manual. + + - name: --regions_overlap + type: string + choices: ['pos', 'record', 'variant', '0', '1', '2'] + description: | + This option controls how overlapping records are determined: + set to 'pos' or '0' if the VCF record has to have POS inside a region (this corresponds to the default behavior of -t/-T); + set to 'record' or '1' if also overlapping records with POS outside a region should be included (this is the default behavior of -r/-R, + and includes indels with POS at the end of a region, which are technically outside the region); + or set to 'variant' or '2' to include only true overlapping variation (compare the full VCF representation "TA>T-" vs the true sequence variation "A>-"). + + - name: --samples + alternatives: --s + type: string + description: | + List of samples for sample stats, "-" to include all samples. + + - name: --samples_file + alternatives: --S + type: file + description: | + File of samples to include. + e.g. + sample1 1 + sample2 2 + sample3 2 + + - name: --targets + alternatives: --t + type: string + description: | + Similar as -r, --regions, but the next position is accessed by streaming the whole VCF/BCF + rather than using the tbi/csi index. Both -r and -t options can be applied simultaneously: -r uses the + index to jump to a region and -t discards positions which are not in the targets. Unlike -r, targets + can be prefixed with "^" to request logical complement. For example, "^X,Y,MT" indicates that + sequences X, Y and MT should be skipped. 
Yet another difference between the -t/-T and -r/-R is + that -r/-R checks for proper overlaps and considers both POS and the end position of an indel, + while -t/-T considers the POS coordinate only (by default; see also --regions-overlap and --targets-overlap). + Note that -t cannot be used in combination with -T. + Following formats are supported: chr|chr:pos|chr:beg-end|chr:beg-[,…​]. + example: '20:1000000-2000000' + + - name: --targets_file + alternatives: --T + type: file + description: | + Similar to --regions_file option but streams rather than index-jumps. + + - name: --targets_overlaps + type: string + choices: ['pos', 'record', 'variant', '0', '1', '2'] + description: | + Include if POS in the region (0), record overlaps (1), variant overlaps (2). + + - name: --user_tstv + alternatives: --u + type: string + description: | + Collect Ts/Tv stats for any tag using the given binning [0:1:100]. + Format is . + A subfield can be selected as e.g. 'PV4[0]', here the first value of the PV4 tag. + + + - name: --verbose + alternatives: --v + type: boolean_true + description: | + Produce verbose per-site and per-sample output. + +resources: + - type: bash_script + path: script.sh + +test_resources: + - type: bash_script + path: test.sh + +engines: + - type: docker + image: debian:stable-slim + setup: + - type: apt + packages: [bcftools, procps] + - type: docker + run: | + echo "bcftools: \"$(bcftools --version | grep 'bcftools' | sed -n 's/^bcftools //p')\"" > /var/software_versions.txt + test_setup: + - type: apt + packages: [tabix] + +runners: + - type: executable + - type: nextflow + diff --git a/src/bcftools/bcftools_stats/help.txt b/src/bcftools/bcftools_stats/help.txt new file mode 100644 index 00000000..e702e838 --- /dev/null +++ b/src/bcftools/bcftools_stats/help.txt @@ -0,0 +1,35 @@ +``` +bcftools stats -h +``` + +About: Parses VCF or BCF and produces stats which can be plotted using plot-vcfstats. 
+ When two files are given, the program generates separate stats for intersection + and the complements. By default only sites are compared, -s/-S must given to include + also sample columns. +Usage: bcftools stats [options] [] + +Options: + --af-bins LIST Allele frequency bins, a list (0.1,0.5,1) or a file (0.1\n0.5\n1) + --af-tag STRING Allele frequency tag to use, by default estimated from AN,AC or GT + -1, --1st-allele-only Include only 1st allele at multiallelic sites + -c, --collapse STRING Treat as identical records with , see man page for details [none] + -d, --depth INT,INT,INT Depth distribution: min,max,bin size [0,500,1] + -e, --exclude EXPR Exclude sites for which the expression is true (see man page for details) + -E, --exons FILE.gz Tab-delimited file with exons for indel frameshifts (chr,beg,end; 1-based, inclusive, bgzip compressed) + -f, --apply-filters LIST Require at least one of the listed FILTER strings (e.g. "PASS,.") + -F, --fasta-ref FILE Faidx indexed reference sequence file to determine INDEL context + -i, --include EXPR Select sites for which the expression is true (see man page for details) + -I, --split-by-ID Collect stats for sites with ID separately (known vs novel) + -r, --regions REGION Restrict to comma-separated list of regions + -R, --regions-file FILE Restrict to regions listed in a file + --regions-overlap 0|1|2 Include if POS in the region (0), record overlaps (1), variant overlaps (2) [1] + -s, --samples LIST List of samples for sample stats, "-" to include all samples + -S, --samples-file FILE File of samples to include + -t, --targets REGION Similar to -r but streams rather than index-jumps + -T, --targets-file FILE Similar to -R but streams rather than index-jumps + --targets-overlap 0|1|2 Include if POS in the region (0), record overlaps (1), variant overlaps (2) [0] + -u, --user-tstv TAG[:min:max:n] Collect Ts/Tv stats for any tag using the given binning [0:1:100] + A subfield can be selected as e.g. 
'PV4[0]', here the first value of the PV4 tag + --threads INT Use multithreading with worker threads [0] + -v, --verbose Produce verbose per-site and per-sample output + diff --git a/src/bcftools/bcftools_stats/script.sh b/src/bcftools/bcftools_stats/script.sh new file mode 100644 index 00000000..119502fd --- /dev/null +++ b/src/bcftools/bcftools_stats/script.sh @@ -0,0 +1,56 @@ +#!/bin/bash + +## VIASH START +## VIASH END + +# Exit on error +set -eo pipefail + +# Unset parameters +unset_if_false=( + par_first_allele_only + par_split_by_ID + par_verbose +) + +for par in ${unset_if_false[@]}; do + test_val="${!par}" + [[ "$test_val" == "false" ]] && unset $par +done + +# Create input array +IFS=";" read -ra input <<< $par_input + +# Check the size of the input array +if [[ ${#input[@]} -gt 2 ]]; then + echo "Error: --input only takes a max of two files!" + exit 1 +fi + +# Execute bcftools stats with the provided arguments +bcftools stats \ + ${par_first_allele_only:+--1st-allele-only} \ + ${par_split_by_ID:+--split-by-ID} \ + ${par_verbose:+--verbose} \ + ${par_allele_frequency_bins:+--af-bins "${par_allele_frequency_bins}"} \ + ${par_allele_frequency_bins_file:+--af-bins "${par_allele_frequency_bins_file}"} \ + ${par_allele_frequency_tag:+--af-tag "${par_allele_frequency_tag}"} \ + ${par_collapse:+-c "${par_collapse}"} \ + ${par_depth:+-d "${par_depth}"} \ + ${par_exclude:+-e "${par_exclude}"} \ + ${par_exons:+-E "${par_exons}"} \ + ${par_apply_filters:+-f "${par_apply_filters}"} \ + ${par_fasta_reference:+-F "${par_fasta_reference}"} \ + ${par_include:+-i "${par_include}"} \ + ${par_regions:+-r "${par_regions}"} \ + ${par_regions_file:+-R "${par_regions_file}"} \ + ${par_regions_overlap:+--regions-overlap "${par_regions_overlap}"} \ + ${par_samples:+-s "${par_samples}"} \ + ${par_samples_file:+-S "${par_samples_file}"} \ + ${par_targets:+-t "${par_targets}"} \ + ${par_targets_file:+-T "${par_targets_file}"} \ + ${par_targets_overlaps:+--targets-overlap 
"${par_targets_overlaps}"} \ + ${par_user_tstv:+-u "${par_user_tstv}"} \ + "${input[@]}" \ + > $par_output + diff --git a/src/bcftools/bcftools_stats/test.sh b/src/bcftools/bcftools_stats/test.sh new file mode 100644 index 00000000..18f0256b --- /dev/null +++ b/src/bcftools/bcftools_stats/test.sh @@ -0,0 +1,306 @@ +#!/bin/bash + +## VIASH START +## VIASH END + +# Exit on error +set -eo pipefail + +#test_data="$meta_resources_dir/test_data" + +############################################# +# helper functions +assert_file_exists() { + [ -f "$1" ] || { echo "File '$1' does not exist" && exit 1; } +} +assert_file_not_empty() { + [ -s "$1" ] || { echo "File '$1' is empty but shouldn't be" && exit 1; } +} +assert_file_contains() { + grep -q "$2" "$1" || { echo "File '$1' does not contain '$2'" && exit 1; } +} +assert_identical_content() { + diff -a "$2" "$1" \ + || (echo "Files are not identical!" && exit 1) +} +############################################# + +# Create directories for tests +echo "Creating Test Data..." +TMPDIR=$(mktemp -d "$meta_temp_dir/XXXXXX") +function clean_up { + [[ -d "$TMPDIR" ]] && rm -r "$TMPDIR" +} +trap clean_up EXIT + +# Create test data +cat < "$TMPDIR/example.vcf" +##fileformat=VCFv4.0 +##fileDate=20090805 +##source=myImputationProgramV3.1 +##reference=1000GenomesPilot-NCBI36 +##contig= +##contig= +##phasing=partial +##INFO= +##INFO= +##INFO= +##INFO= +##INFO= +##INFO= +##INFO= +##INFO= +##FILTER= +##FILTER= +##FORMAT= +##FORMAT= +##FORMAT= +##FORMAT= +##ALT= +##ALT= +#CHROM POS ID REF ALT QUAL FILTER INFO FORMAT NA00001 NA00002 NA00003 +19 111 . A C 9.6 . . GT:HQ 0|0:10,10 0|0:10,10 0/1:3,3 +19 112 . A G 10 . . GT:HQ 0|0:10,10 0|0:10,10 0/1:3,3 +20 14370 rs6054257 G A 29 PASS NS=3;DP=14;AF=0.5;DB;H2 GT:GQ:DP:HQ 0|0:48:1:51,51 1|0:48:8:51,51 1/1:43:5:.,. +20 17330 . T A 3 q10 NS=3;DP=11;AF=0.017 GT:GQ:DP:HQ 0|0:49:3:58,50 0|1:3:5:65,3 0/0:41:3:.,. 
+20 1110696 rs6040355 A G,T 67 PASS NS=2;DP=10;AF=0.333,0.667;AA=T;DB GT:GQ:DP:HQ 1|2:21:6:23,27 2|1:2:0:18,2 2/2:35:4:.,. +20 1230237 . T . 47 PASS NS=3;DP=13;AA=T GT:GQ:DP:HQ 0|0:54:.:56,60 0|0:48:4:51,51 0/0:61:2:.,. +20 1234567 microsat1 G GA,GAC 50 PASS NS=3;DP=9;AA=G;AN=6;AC=3,1 GT:GQ:DP 0/1:.:4 0/2:17:2 1/1:40:3 +20 1235237 . T . . . . GT 0/0 0|0 ./. +EOF + +bgzip -c $TMPDIR/example.vcf > $TMPDIR/example.vcf.gz +tabix -p vcf $TMPDIR/example.vcf.gz + +cat < "$TMPDIR/exons.bed" +chr19 12345 12567 +chr20 23456 23789 +EOF + +# Compressing and indexing the exons file +bgzip -c $TMPDIR/exons.bed > $TMPDIR/exons.bed.gz +tabix -s1 -b2 -e3 $TMPDIR/exons.bed.gz + +# Create fai test file +# cat < "$TMPDIR/reference.fasta.fai" +# 19 100 895464957 60 61 +# 20 10000 1083893029 60 61 +# EOF + +# Create allele frequency bins file +cat < "$TMPDIR/allele_frequency_bins.txt" +0.1 +0.2 +0.3 +0.4 +0.5 +0.6 +0.7 +0.8 +0.9 +EOF + +# Test 1: Default Use +mkdir "$TMPDIR/test1" && pushd "$TMPDIR/test1" > /dev/null + +echo "> Run bcftools_stats on VCF file" +"$meta_executable" \ + --input "../example.vcf" \ + --output "stats.txt" \ + +# checks +assert_file_exists "stats.txt" +assert_file_not_empty "stats.txt" +assert_file_contains "stats.txt" "bcftools stats ../example.vcf" +echo "- test1 succeeded -" + +popd > /dev/null + +# Test 2: First allele only +mkdir "$TMPDIR/test2" && pushd "$TMPDIR/test2" > /dev/null + +echo "> Run bcftools_stats on VCF file with first allele only" +"$meta_executable" \ + --input "../example.vcf" \ + --output "stats.txt" \ + --first_allele_only \ + --allele_frequency_bins "0.1,0.2,0.3,0.4,0.5,0.6,0.7,0.8,0.9" \ + --allele_frequency_tag "AF" \ + +# checks +assert_file_exists "stats.txt" +assert_file_not_empty "stats.txt" +assert_file_contains "stats.txt" "bcftools stats --1st-allele-only --af-bins 0.1,0.2,0.3,0.4,0.5,0.6,0.7,0.8,0.9 --af-tag AF ../example.vcf" +echo "- test2 succeeded -" + +popd > /dev/null + +# Test 3: Split by ID +mkdir "$TMPDIR/test3" && 
pushd "$TMPDIR/test3" > /dev/null + +echo "> Run bcftools_stats on VCF file with split by ID" +"$meta_executable" \ + --input "../example.vcf" \ + --output "stats.txt" \ + --split_by_ID \ + +# checks +assert_file_exists "stats.txt" +assert_file_not_empty "stats.txt" +assert_file_contains "stats.txt" "bcftools stats --split-by-ID ../example.vcf" +echo "- test3 succeeded -" + +popd > /dev/null + +# Test 4: Collapse, Depth, Exclude +mkdir "$TMPDIR/test4" && pushd "$TMPDIR/test4" > /dev/null + +echo "> Run bcftools_stats on VCF file with collapse, depth, and exclude" +"$meta_executable" \ + --input "../example.vcf" \ + --output "stats.txt" \ + --depth "0,500,1" \ + --exclude "GT='mis'" \ + --collapse "snps" \ + +# checks +assert_file_exists "stats.txt" +assert_file_not_empty "stats.txt" +assert_file_contains "stats.txt" "bcftools stats -c snps -d 0,500,1 -e GT='mis' ../example.vcf" +echo "- test4 succeeded -" + +popd > /dev/null + +# Test 5: Exons, Apply Filters +mkdir "$TMPDIR/test5" && pushd "$TMPDIR/test5" > /dev/null + +echo "> Run bcftools_stats on VCF file with exons, apply filters, and fasta reference" +"$meta_executable" \ + --input "../example.vcf" \ + --output "stats.txt" \ + --exons "../exons.bed.gz" \ + --apply_filters "PASS" \ +# --fasta_reference "../reference.fasta.fai" \ + +# NOTE: fasta_reference option not included in testing because of error from bcftools stats. 
+ +# checks +assert_file_exists "stats.txt" +assert_file_not_empty "stats.txt" +assert_file_contains "stats.txt" "bcftools stats -E ../exons.bed.gz -f PASS ../example.vcf" +#assert_file_contains "stats.txt" "bcftools stats -E ../exons.bed.gz -f PASS -F ../reference.fasta.fai ../example.vcf" +echo "- test5 succeeded -" + +popd > /dev/null + +# Test 6: Include, Regions +mkdir "$TMPDIR/test6" && pushd "$TMPDIR/test6" > /dev/null + +echo "> Run bcftools_stats on VCF file with include and regions options" +"$meta_executable" \ + --input "../example.vcf.gz" \ + --output "stats.txt" \ + --include "GT='mis'" \ + --regions "20:1000000-2000000" \ + +# checks +assert_file_exists "stats.txt" +assert_file_not_empty "stats.txt" +assert_file_contains "stats.txt" "bcftools stats -i GT='mis' -r 20:1000000-2000000 ../example.vcf.gz" +echo "- test6 succeeded -" + +popd > /dev/null + +# Test 7: Regions Overlap, Samples +mkdir "$TMPDIR/test7" && pushd "$TMPDIR/test7" > /dev/null + +echo "> Run bcftools_stats on VCF file with regions overlap, and samples options" +"$meta_executable" \ + --input "../example.vcf" \ + --output "stats.txt" \ + --regions_overlap "record" \ + --samples "NA00001,NA00002" \ + +# checks +assert_file_exists "stats.txt" +assert_file_not_empty "stats.txt" +assert_file_contains "stats.txt" "bcftools stats --regions-overlap record -s NA00001,NA00002 ../example.vcf" +echo "- test7 succeeded -" + +popd > /dev/null + +# Test 8: Targets, Targets File, Targets Overlaps +mkdir "$TMPDIR/test8" && pushd "$TMPDIR/test8" > /dev/null + +echo "> Run bcftools_stats on VCF file with targets, targets file, and targets overlaps" +"$meta_executable" \ + --input "../example.vcf" \ + --output "stats.txt" \ + --targets "20:1000000-2000000" \ + --targets_overlaps "pos" \ + +# checks +assert_file_exists "stats.txt" +assert_file_not_empty "stats.txt" +assert_file_contains "stats.txt" "bcftools stats -t 20:1000000-2000000 --targets-overlap pos ../example.vcf" +echo "- test8 succeeded -" + 
+popd > /dev/null + +# Test 9: User TSTV and Verbose +mkdir "$TMPDIR/test9" && pushd "$TMPDIR/test9" > /dev/null + +echo "> Run bcftools_stats on VCF file with user TSTV and verbose" +"$meta_executable" \ + --input "../example.vcf" \ + --output "stats.txt" \ + --user_tstv "DP" \ + --verbose \ + +# checks +assert_file_exists "stats.txt" +assert_file_not_empty "stats.txt" +assert_file_contains "stats.txt" "bcftools stats --verbose -u DP ../example.vcf" +echo "- test9 succeeded -" + +popd > /dev/null + +# Test 10: Two vcf files +mkdir "$TMPDIR/test10" && pushd "$TMPDIR/test10" > /dev/null + +echo "> Run bcftools_stats on two VCF files" +"$meta_executable" \ + --input "../example.vcf.gz" \ + --input "../example.vcf.gz" \ + --output "stats.txt" \ + +# checks +assert_file_exists "stats.txt" +assert_file_not_empty "stats.txt" +assert_file_contains "stats.txt" "bcftools stats ../example.vcf.gz ../example.vcf.gz" +echo "- test10 succeeded -" + +popd > /dev/null + +# Test 11: with allele frequency bins file option +mkdir "$TMPDIR/test11" && pushd "$TMPDIR/test11" > /dev/null + +echo "> Run bcftools_stats on VCF file with allele frequency bins file option" +"$meta_executable" \ + --input "../example.vcf" \ + --output "stats.txt" \ + --allele_frequency_bins "../allele_frequency_bins.txt" \ + +# checks +assert_file_exists "stats.txt" +assert_file_not_empty "stats.txt" +assert_file_contains "stats.txt" "bcftools stats --af-bins ../allele_frequency_bins.txt ../example.vcf" +echo "- test11 succeeded -" + +popd > /dev/null + + +echo "---- All tests succeeded! 
----" +exit 0 + + From c3ba4a78497f7518725bb7d3e213b2a9bcee511e Mon Sep 17 00:00:00 2001 From: Theodoro Gasperin Terra Camargo <98555209+tgaspe@users.noreply.github.com> Date: Tue, 10 Sep 2024 15:53:13 +0200 Subject: [PATCH 14/28] Bcftools annotate (#143) * Initial commit * Update config.vsh.yaml * changes in config file * Update script.sh * Help File * Update script.sh * Update test.sh * bug fixing and adding tests * Update test.sh * Update test.sh * adding 3rd test * More tests * Moreee tests * Update test.sh * small changes * Update CHANGELOG.md * Update config.vsh.yaml * bug fixing on config * Requested changes --------- Co-authored-by: Jakub Majercik <57993790+jakubmajercik@users.noreply.github.com> --- CHANGELOG.md | 2 +- .../bcftools_annotate/config.vsh.yaml | 250 ++++++++++++++ src/bcftools/bcftools_annotate/help.txt | 41 +++ src/bcftools/bcftools_annotate/script.sh | 54 ++++ src/bcftools/bcftools_annotate/test.sh | 305 ++++++++++++++++++ 5 files changed, 651 insertions(+), 1 deletion(-) create mode 100644 src/bcftools/bcftools_annotate/config.vsh.yaml create mode 100644 src/bcftools/bcftools_annotate/help.txt create mode 100644 src/bcftools/bcftools_annotate/script.sh create mode 100644 src/bcftools/bcftools_annotate/test.sh diff --git a/CHANGELOG.md b/CHANGELOG.md index 2dd152bb..bb640d50 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -42,12 +42,12 @@ * `rsem/rsem_prepare_reference`: Prepare transcript references for RSEM (PR #89). * `bcftools`: + - `bcftools_annotate`: Add or remove annotations from a VCF/BCF file (PR #143). - `bcftools/bcftools_stats`: Parses VCF or BCF and produces a txt stats file which can be plotted using plot-vcfstats (PR #142). - `bcftools/bcftools_sort`: Sorts BCF/VCF files by position and other criteria (PR #141). * `fastqc`: High throughput sequence quality control analysis tool (PR #92). - ## MINOR CHANGES * `busco` components: update BUSCO to `5.7.1` (PR #72). 
diff --git a/src/bcftools/bcftools_annotate/config.vsh.yaml b/src/bcftools/bcftools_annotate/config.vsh.yaml new file mode 100644 index 00000000..67e8f46e --- /dev/null +++ b/src/bcftools/bcftools_annotate/config.vsh.yaml @@ -0,0 +1,250 @@ +name: bcftools_annotate +namespace: bcftools +description: | + Add or remove annotations from a VCF/BCF file. +keywords: [Annotate, VCF, BCF] +links: + homepage: https://samtools.github.io/bcftools/ + documentation: https://samtools.github.io/bcftools/bcftools.html#annotate + repository: https://github.com/samtools/bcftools + issue_tracker: https://github.com/samtools/bcftools/issues +references: + doi: https://doi.org/10.1093/gigascience/giab008 +license: MIT/Expat, GNU +requirements: + commands: [bcftools] +authors: + - __merge__: /src/_authors/theodoro_gasperin.yaml + roles: [author] + +argument_groups: + - name: Inputs + arguments: + - name: --input + alternatives: -i + type: file + multiple: true + description: Input VCF/BCF file. + required: true + + - name: Outputs + arguments: + - name: --output + alternatives: -o + direction: output + type: file + description: Output annotated file. + required: true + + - name: Options + description: | + For examples on how to use use bcftools annotate see http://samtools.github.io/bcftools/howtos/annotate.html. + For more details on the options see https://samtools.github.io/bcftools/bcftools.html#annotate. + arguments: + + - name: --annotations + alternatives: --a + type: file + description: | + VCF file or tabix-indexed FILE with annotations: CHR\tPOS[\tVALUE]+ . + + - name: --columns + alternatives: --c + type: string + description: | + List of columns in the annotation file, e.g. CHROM,POS,REF,ALT,-,INFO/TAG. + See man page for details. + + - name: --columns_file + alternatives: --C + type: file + description: | + Read -c columns from FILE, one name per row, with optional --merge_logic TYPE: NAME[ TYPE]. 
+ + - name: --exclude + alternatives: --e + type: string + description: | + Exclude sites for which the expression is true. + See https://samtools.github.io/bcftools/bcftools.html#expressions for details. + example: 'QUAL >= 30 && DP >= 10' + + - name: --force + type: boolean_true + description: | + continue even when parsing errors, such as undefined tags, are encountered. + Note this can be an unsafe operation and can result in corrupted BCF files. + If this option is used, make sure to sanity check the result thoroughly. + + - name: --header_line + alternatives: --H + type: string + description: | + Header line which should be appended to the VCF header, can be given multiple times. + + - name: --header_lines + alternatives: --h + type: file + description: | + File with header lines to append to the VCF header. + For example: + ##INFO= + ##INFO= + + - name: --set_id + alternatives: --I + type: string + description: | + Set ID column using a `bcftools query`-like expression, see man page for details. + + - name: --include + type: string + description: | + Select sites for which the expression is true. + See https://samtools.github.io/bcftools/bcftools.html#expressions for details. + example: 'QUAL >= 30 && DP >= 10' + + - name: --keep_sites + alternatives: --k + type: boolean_true + description: | + Leave --include/--exclude sites unchanged instead of discarding them. + + - name: --merge_logic + alternatives: --l + type: string + choices: + description: | + When multiple regions overlap a single record, this option defines how to treat multiple annotation values. + See man page for more details. + + - name: --mark_sites + alternatives: --m + type: string + description: | + Annotate sites which are present ("+") or absent ("-") in the -a file with a new INFO/TAG flag. 
+ + - name: --min_overlap + type: string + description: | + Minimum overlap required as a fraction of the variant in the annotation -a file (ANN), + in the target VCF file (:VCF), or both for reciprocal overlap (ANN:VCF). + By default overlaps of arbitrary length are sufficient. + The option can be used only with the tab-delimited annotation -a file and with BEG and END columns present. + + - name: --no_version + type: boolean_true + description: | + Do not append version and command line information to the output VCF header. + + - name: --output_type + alternatives: --O + type: string + choices: ['u', 'z', 'b', 'v'] + description: | + Output type: + u: uncompressed BCF + z: compressed VCF + b: compressed BCF + v: uncompressed VCF + + - name: --pair_logic + type: string + choices: ['snps', 'indels', 'both', 'all', 'some', 'exact'] + description: | + Controls how to match records from the annotation file to the target VCF. + Effective only when -a is a VCF or BCF file. + The option replaces the former uninuitive --collapse. + See Common Options for more. + + - name: --regions + alternatives: --r + type: string + description: | + Restrict to comma-separated list of regions. + Following formats are supported: chr|chr:pos|chr:beg-end|chr:beg-[,…​]. + example: '20:1000000-2000000' + + - name: --regions_file + alternatives: --R + type: file + description: | + Restrict to regions listed in a file. + Regions can be specified either on a VCF, BED, or tab-delimited file (the default). + For more information check manual. 
+ + - name: --regions_overlap + type: string + choices: ['pos', 'record', 'variant', '0', '1', '2'] + description: | + This option controls how overlapping records are determined: + set to 'pos' or '0' if the VCF record has to have POS inside a region (this corresponds to the default behavior of -t/-T); + set to 'record' or '1' if also overlapping records with POS outside a region should be included (this is the default behavior of -r/-R, + and includes indels with POS at the end of a region, which are technically outside the region); + or set to 'variant' or '2' to include only true overlapping variation (compare the full VCF representation "TA>T-" vs the true sequence variation "A>-"). + + - name: --rename_annotations + type: file + description: | + Rename annotations: TYPE/old\tnew, where TYPE is one of FILTER,INFO,FORMAT. + + - name: --rename_chromosomes + type: file + description: | + Rename chromosomes according to the map in file, with "old_name new_name\n" pairs + separated by whitespaces, each on a separate line. + + - name: --samples + type: string + description: | + Subset of samples to annotate. + See also https://samtools.github.io/bcftools/bcftools.html#common_options. + + - name: --samples_file + type: file + description: | + Subset of samples to annotate in file format. + See also https://samtools.github.io/bcftools/bcftools.html#common_options. + + - name: --single_overlaps + type: boolean_true + description: | + Use this option to keep memory requirements low with very large annotation files. + Note, however, that this comes at a cost, only single overlapping intervals are considered in this mode. + This was the default mode until the commit af6f0c9 (Feb 24 2019). + + - name: --remove + alternatives: --x + type: string + description: | + List of annotations to remove. + Use "FILTER" to remove all filters or "FILTER/SomeFilter" to remove a specific filter. 
+ Similarly, "INFO" can be used to remove all INFO tags and "FORMAT" to remove all FORMAT tags except GT. + To remove all INFO tags except "FOO" and "BAR", use "^INFO/FOO,INFO/BAR" (and similarly for FORMAT and FILTER). + "INFO" can be abbreviated to "INF" and "FORMAT" to "FMT". + +resources: + - type: bash_script + path: script.sh + +test_resources: + - type: bash_script + path: test.sh + +engines: + - type: docker + image: debian:stable-slim + setup: + - type: apt + packages: [bcftools, procps] + - type: docker + run: | + echo "bcftools: \"$(bcftools --version | grep 'bcftools' | sed -n 's/^bcftools //p')\"" > /var/software_versions.txt + test_setup: + - type: apt + packages: [tabix] + +runners: + - type: executable + - type: nextflow + diff --git a/src/bcftools/bcftools_annotate/help.txt b/src/bcftools/bcftools_annotate/help.txt new file mode 100644 index 00000000..2d1c7807 --- /dev/null +++ b/src/bcftools/bcftools_annotate/help.txt @@ -0,0 +1,41 @@ +``` +bcftools annotate -h +``` + +annotate: option requires an argument -- 'h' + +About: Annotate and edit VCF/BCF files. +Usage: bcftools annotate [options] VCF + +Options: + -a, --annotations FILE VCF file or tabix-indexed FILE with annotations: CHR\tPOS[\tVALUE]+ + -c, --columns LIST List of columns in the annotation file, e.g. CHROM,POS,REF,ALT,-,INFO/TAG. See man page for details + -C, --columns-file FILE Read -c columns from FILE, one name per row, with optional --merge-logic TYPE: NAME[ TYPE] + -e, --exclude EXPR Exclude sites for which the expression is true (see man page for details) + --force Continue despite parsing error (at your own risk!) 
+ -H, --header-line STR Header line which should be appended to the VCF header, can be given multiple times + -h, --header-lines FILE Lines which should be appended to the VCF header + -I, --set-id [+]FORMAT Set ID column using a `bcftools query`-like expression, see man page for details + -i, --include EXPR Select sites for which the expression is true (see man page for details) + -k, --keep-sites Leave -i/-e sites unchanged instead of discarding them + -l, --merge-logic TAG:TYPE Merge logic for multiple overlapping regions (see man page for details), EXPERIMENTAL + -m, --mark-sites [+-]TAG Add INFO/TAG flag to sites which are ("+") or are not ("-") listed in the -a file + --min-overlap ANN:VCF Required overlap as a fraction of variant in the -a file (ANN), the VCF (:VCF), or reciprocal (ANN:VCF) + --no-version Do not append version and command line to the header + -o, --output FILE Write output to a file [standard output] + -O, --output-type u|b|v|z[0-9] u/b: un/compressed BCF, v/z: un/compressed VCF, 0-9: compression level [v] + --pair-logic STR Matching records by , see man page for details [some] + -r, --regions REGION Restrict to comma-separated list of regions + -R, --regions-file FILE Restrict to regions listed in FILE + --regions-overlap 0|1|2 Include if POS in the region (0), record overlaps (1), variant overlaps (2) [1] + --rename-annots FILE Rename annotations: TYPE/old\tnew, where TYPE is one of FILTER,INFO,FORMAT + --rename-chrs FILE Rename sequences according to the mapping: old\tnew + -s, --samples [^]LIST Comma separated list of samples to annotate (or exclude with "^" prefix) + -S, --samples-file [^]FILE File of samples to annotate (or exclude with "^" prefix) + --single-overlaps Keep memory low by avoiding complexities arising from handling multiple overlapping intervals + -x, --remove LIST List of annotations (e.g. ID,INFO/DP,FORMAT/DP,FILTER) to remove (or keep with "^" prefix). 
See man page for details + --threads INT Number of extra output compression threads [0] + +Examples: + http://samtools.github.io/bcftools/howtos/annotate.html + diff --git a/src/bcftools/bcftools_annotate/script.sh b/src/bcftools/bcftools_annotate/script.sh new file mode 100644 index 00000000..18137bbf --- /dev/null +++ b/src/bcftools/bcftools_annotate/script.sh @@ -0,0 +1,54 @@ +#!/bin/bash + +## VIASH START +## VIASH END + +# Exit on error +set -eo pipefail + +# Unset parameters +unset_if_false=( + par_force + par_keep_sites + par_no_version + par_single_overlaps +) + +for par in ${unset_if_false[@]}; do + test_val="${!par}" + [[ "$test_val" == "false" ]] && unset $par +done + +# Execute bcftools annotate with the provided arguments +bcftools annotate \ + ${par_annotations:+-a "$par_annotations"} \ + ${par_columns:+-c "$par_columns"} \ + ${par_columns_file:+-C "$par_columns_file"} \ + ${par_exclude:+-e "$par_exclude"} \ + ${par_force:+--force} \ + ${par_header_line:+-H "$par_header_line"} \ + ${par_header_lines:+-h "$par_header_lines"} \ + ${par_set_id:+-I "$par_set_id"} \ + ${par_include:+-i "$par_include"} \ + ${par_keep_sites:+-k} \ + ${par_merge_logic:+-l "$par_merge_logic"} \ + ${par_mark_sites:+-m "$par_mark_sites"} \ + ${par_min_overlap:+--min-overlap "$par_min_overlap"} \ + ${par_no_version:+--no-version} \ + ${par_samples_file:+-S "$par_samples_file"} \ + ${par_output_type:+-O "$par_output_type"} \ + ${par_pair_logic:+--pair-logic "$par_pair_logic"} \ + ${par_regions:+-r "$par_regions"} \ + ${par_regions_file:+-R "$par_regions_file"} \ + ${par_regions_overlap:+--regions-overlap "$par_regions_overlap"} \ + ${par_rename_annotations:+--rename-annots "$par_rename_annotations"} \ + ${par_rename_chromosomes:+--rename-chrs "$par_rename_chromosomes"} \ + ${par_samples:+-s "$par_samples"} \ + ${par_single_overlaps:+--single-overlaps} \ + ${par_threads:+--threads "$par_threads"} \ + ${par_remove:+-x "$par_remove"} \ + -o $par_output \ + $par_input + + + \ No 
newline at end of file diff --git a/src/bcftools/bcftools_annotate/test.sh b/src/bcftools/bcftools_annotate/test.sh new file mode 100644 index 00000000..39835c82 --- /dev/null +++ b/src/bcftools/bcftools_annotate/test.sh @@ -0,0 +1,305 @@ +#!/bin/bash + +## VIASH START +## VIASH END + +# Exit on error +set -eo pipefail + +#test_data="$meta_resources_dir/test_data" + +############################################# +# helper functions +assert_file_exists() { + [ -f "$1" ] || { echo "File '$1' does not exist" && exit 1; } +} +assert_file_not_empty() { + [ -s "$1" ] || { echo "File '$1' is empty but shouldn't be" && exit 1; } +} +assert_file_contains() { + grep -q "$2" "$1" || { echo "File '$1' does not contain '$2'" && exit 1; } +} +assert_identical_content() { + diff -a "$2" "$1" \ + || (echo "Files are not identical!" && exit 1) +} +############################################# + +# Create directories for tests +echo "Creating Test Data..." +TMPDIR=$(mktemp -d "$meta_temp_dir/XXXXXX") +function clean_up { + [[ -d "$TMPDIR" ]] && rm -r "$TMPDIR" +} +trap clean_up EXIT + +# Create test data +cat < "$TMPDIR/example.vcf" +##fileformat=VCFv4.1 +##contig= +#CHROM POS ID REF ALT QUAL FILTER INFO FORMAT SAMPLE1 +1 752567 llama A C . . . . . +1 752722 . G A . . . . . +EOF + +bgzip -c $TMPDIR/example.vcf > $TMPDIR/example.vcf.gz +tabix -p vcf $TMPDIR/example.vcf.gz + +cat < "$TMPDIR/annots.tsv" +1 752567 752567 FooValue1 12345 +1 752722 752722 FooValue2 67890 +EOF + +cat < "$TMPDIR/rename.tsv" +INFO/. 
Luigi +EOF + +bgzip $TMPDIR/annots.tsv +tabix -s1 -b2 -e3 $TMPDIR/annots.tsv.gz + +cat < "$TMPDIR/header.hdr" +##FORMAT= +##INFO= +EOF + +cat < "$TMPDIR/rename_chrm.tsv" +1 chr1 +2 chr2 +EOF + +# Test 1: Remove ID annotations +mkdir "$TMPDIR/test1" && pushd "$TMPDIR/test1" > /dev/null + +echo "> Run bcftools_annotate remove annotations" +"$meta_executable" \ + --input "../example.vcf" \ + --output "annotated.vcf" \ + --remove "ID" \ + +# checks +assert_file_exists "annotated.vcf" +assert_file_not_empty "annotated.vcf" +assert_file_contains "annotated.vcf" "1 752567 . A C" +echo "- test1 succeeded -" + +popd > /dev/null + +# Test 2: Annotate with -a, -c and -h options +mkdir "$TMPDIR/test2" && pushd "$TMPDIR/test2" > /dev/null + +echo "> Run bcftools_annotate with -a, -c and -h options" +"$meta_executable" \ + --input "../example.vcf" \ + --output "annotated.vcf" \ + --annotations "../annots.tsv.gz" \ + --header_lines "../header.hdr" \ + --columns "CHROM,FROM,TO,FMT/FOO,BAR" \ + --mark_sites "BAR" \ + +# checks +assert_file_exists "annotated.vcf" +assert_file_not_empty "annotated.vcf" +assert_file_contains "annotated.vcf" $(echo -e "1\t752567\tllama\tA\tC\t.\t.\tBAR=12345\tFOO\tFooValue1") +echo "- test2 succeeded -" + +popd > /dev/null + +# Test 3: +mkdir "$TMPDIR/test3" && pushd "$TMPDIR/test3" > /dev/null + +echo "> Run bcftools_annotate with --set_id option" +"$meta_executable" \ + --input "../example.vcf" \ + --output "annotated.vcf" \ + --set_id "+'%CHROM\_%POS\_%REF\_%FIRST_ALT'" \ + +# checks +assert_file_exists "annotated.vcf" +assert_file_not_empty "annotated.vcf" +assert_file_contains "annotated.vcf" "'1_752722_G_A'" +echo "- test3 succeeded -" + +popd > /dev/null + +# Test 4: +mkdir "$TMPDIR/test4" && pushd "$TMPDIR/test4" > /dev/null + +echo "> Run bcftools_annotate with --rename-annotations option" +"$meta_executable" \ + --input "../example.vcf" \ + --output "annotated.vcf" \ + --rename_annotations "../rename.tsv" + +# checks +assert_file_exists 
"annotated.vcf" +assert_file_not_empty "annotated.vcf" +assert_file_contains "annotated.vcf" "##bcftools_annotateCommand=annotate --rename-annots ../rename.tsv -o annotated.vcf" +echo "- test4 succeeded -" + +popd > /dev/null + +# Test 5: Rename chromosomes +mkdir "$TMPDIR/test5" && pushd "$TMPDIR/test5" > /dev/null + +echo "> Run bcftools_annotate with --rename-chromosomes option" +"$meta_executable" \ + --input "../example.vcf" \ + --output "annotated.vcf" \ + --rename_chromosomes "../rename_chrm.tsv" + +# checks +assert_file_exists "annotated.vcf" +assert_file_not_empty "annotated.vcf" +assert_file_contains "annotated.vcf" "chr1" +echo "- test5 succeeded -" + +popd > /dev/null + +# Test 6: Sample option +mkdir "$TMPDIR/test6" && pushd "$TMPDIR/test6" > /dev/null + +echo "> Run bcftools_annotate with -s option" +"$meta_executable" \ + --input "../example.vcf" \ + --output "annotated.vcf" \ + --samples "SAMPLE1" + +# checks +assert_file_exists "annotated.vcf" +assert_file_not_empty "annotated.vcf" +assert_file_contains "annotated.vcf" "##bcftools_annotateCommand=annotate -s SAMPLE1 -o annotated.vcf ../example.vcf" +echo "- test6 succeeded -" + +popd > /dev/null + +# Test 7: Single overlaps +mkdir "$TMPDIR/test7" && pushd "$TMPDIR/test7" > /dev/null + +echo "> Run bcftools_annotate with --single-overlaps option" +"$meta_executable" \ + --input "../example.vcf" \ + --output "annotated.vcf" \ + --single_overlaps \ + --keep_sites \ + +# checks +assert_file_exists "annotated.vcf" +assert_file_not_empty "annotated.vcf" +assert_file_contains "annotated.vcf" "annotate -k --single-overlaps -o annotated.vcf ../example.vcf" +echo "- test7 succeeded -" + +popd > /dev/null + +# Test 8: Min overlap +mkdir "$TMPDIR/test8" && pushd "$TMPDIR/test8" > /dev/null + +echo "> Run bcftools_annotate with --min-overlap option" +"$meta_executable" \ + --input "../example.vcf" \ + --output "annotated.vcf" \ + --annotations "../annots.tsv.gz" \ + --columns "CHROM,FROM,TO,FMT/FOO,BAR" \ + 
--header_lines "../header.hdr" \ + --min_overlap "1" + +# checks +assert_file_exists "annotated.vcf" +assert_file_not_empty "annotated.vcf" +assert_file_contains "annotated.vcf" "annotate -a ../annots.tsv.gz -c CHROM,FROM,TO,FMT/FOO,BAR -h ../header.hdr --min-overlap 1 -o annotated.vcf ../example.vcf" +echo "- test8 succeeded -" + +popd > /dev/null + +# Test 9: Regions +mkdir "$TMPDIR/test9" && pushd "$TMPDIR/test9" > /dev/null + +echo "> Run bcftools_annotate with -r option" +"$meta_executable" \ + --input "../example.vcf.gz" \ + --output "annotated.vcf" \ + --regions "1:752567-752722" + +# checks +assert_file_exists "annotated.vcf" +assert_file_not_empty "annotated.vcf" +assert_file_contains "annotated.vcf" "annotate -r 1:752567-752722 -o annotated.vcf ../example.vcf.gz" +echo "- test9 succeeded -" + +popd > /dev/null + +# Test 10: pair-logic +mkdir "$TMPDIR/test10" && pushd "$TMPDIR/test10" > /dev/null + +echo "> Run bcftools_annotate with --pair-logic option" +"$meta_executable" \ + --input "../example.vcf" \ + --output "annotated.vcf" \ + --pair_logic "all" + +# checks +assert_file_exists "annotated.vcf" +assert_file_not_empty "annotated.vcf" +assert_file_contains "annotated.vcf" "annotate --pair-logic all -o annotated.vcf ../example.vcf" +echo "- test10 succeeded -" + +popd > /dev/null + +# Test 11: regions-overlap +mkdir "$TMPDIR/test11" && pushd "$TMPDIR/test11" > /dev/null + +echo "> Run bcftools_annotate with --regions-overlap option" +"$meta_executable" \ + --input "../example.vcf" \ + --output "annotated.vcf" \ + --regions_overlap "1" + +# checks +assert_file_exists "annotated.vcf" +assert_file_not_empty "annotated.vcf" +assert_file_contains "annotated.vcf" "annotate --regions-overlap 1 -o annotated.vcf ../example.vcf" +echo "- test11 succeeded -" + +popd > /dev/null + +# Test 12: include +mkdir "$TMPDIR/test12" && pushd "$TMPDIR/test12" > /dev/null + +echo "> Run bcftools_annotate with -i option" +"$meta_executable" \ + --input "../example.vcf" \ + 
--output "annotated.vcf" \ + --include "FILTER='PASS'" \ + +# checks +assert_file_exists "annotated.vcf" +assert_file_not_empty "annotated.vcf" +assert_file_contains "annotated.vcf" "annotate -i FILTER='PASS' -o annotated.vcf ../example.vcf" +echo "- test12 succeeded -" + +popd > /dev/null + +# Test 13: exclude +mkdir "$TMPDIR/test13" && pushd "$TMPDIR/test13" > /dev/null + +echo "> Run bcftools_annotate with -e option" +"$meta_executable" \ + --annotations "../annots.tsv.gz" \ + --input "../example.vcf" \ + --output "annotated.vcf" \ + --exclude "FILTER='PASS'" \ + --header_lines "../header.hdr" \ + --columns "CHROM,FROM,TO,FMT/FOO,BAR" \ + --merge_logic "FOO:first" \ + +# checks +assert_file_exists "annotated.vcf" +assert_file_not_empty "annotated.vcf" +assert_file_contains "annotated.vcf" "annotate -a ../annots.tsv.gz -c CHROM,FROM,TO,FMT/FOO,BAR -e FILTER='PASS' -h ../header.hdr -l FOO:first -o annotated.vcf ../example.vcf" +echo "- test13 succeeded -" + +popd > /dev/null + + +echo "---- All tests succeeded! 
----" +exit 0 + From dc7b33d51f274cb156b1f1b0fbdc6fed0b757720 Mon Sep 17 00:00:00 2001 From: Theodoro Gasperin Terra Camargo <98555209+tgaspe@users.noreply.github.com> Date: Tue, 10 Sep 2024 16:15:44 +0200 Subject: [PATCH 15/28] Bcftools Norm (#144) * Initial Commit * config and help.txt * script.sh * test template * More tests and debugging * test 5 and 6 * test 7, 8, 9 * Update test.sh * fixing bug on config * Changelog * Update config.vsh.yaml * Requested changes * Bug fixing --------- Co-authored-by: Jakub Majercik <57993790+jakubmajercik@users.noreply.github.com> --- CHANGELOG.md | 1 + src/bcftools/bcftools_norm/config.vsh.yaml | 194 +++++++++++++++++ src/bcftools/bcftools_norm/help.txt | 41 ++++ src/bcftools/bcftools_norm/script.sh | 49 +++++ src/bcftools/bcftools_norm/test.sh | 231 +++++++++++++++++++++ 5 files changed, 516 insertions(+) create mode 100644 src/bcftools/bcftools_norm/config.vsh.yaml create mode 100644 src/bcftools/bcftools_norm/help.txt create mode 100644 src/bcftools/bcftools_norm/script.sh create mode 100644 src/bcftools/bcftools_norm/test.sh diff --git a/CHANGELOG.md b/CHANGELOG.md index bb640d50..25850193 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -42,6 +42,7 @@ * `rsem/rsem_prepare_reference`: Prepare transcript references for RSEM (PR #89). * `bcftools`: + - `bcftools_norm`: Left-align and normalize indels, check if REF alleles match the reference, split multiallelic sites into multiple rows; recover multiallelics from multiple rows (PR #144). - `bcftools_annotate`: Add or remove annotations from a VCF/BCF file (PR #143). - `bcftools/bcftools_stats`: Parses VCF or BCF and produces a txt stats file which can be plotted using plot-vcfstats (PR #142). - `bcftools/bcftools_sort`: Sorts BCF/VCF files by position and other criteria (PR #141). 
diff --git a/src/bcftools/bcftools_norm/config.vsh.yaml b/src/bcftools/bcftools_norm/config.vsh.yaml new file mode 100644 index 00000000..5c525d3a --- /dev/null +++ b/src/bcftools/bcftools_norm/config.vsh.yaml @@ -0,0 +1,194 @@ +name: bcftools_norm +namespace: bcftools +description: | + Left-align and normalize indels, check if REF alleles match the reference, split multiallelic sites into multiple rows; + recover multiallelics from multiple rows. +keywords: [Normalize, VCF, BCF] +links: + homepage: https://samtools.github.io/bcftools/ + documentation: https://samtools.github.io/bcftools/bcftools.html#norm + repository: https://github.com/samtools/bcftools + issue_tracker: https://github.com/samtools/bcftools/issues +references: + doi: https://doi.org/10.1093/gigascience/giab008 +license: MIT/Expat, GNU +requirements: + commands: [bcftools] +authors: + - __merge__: /src/_authors/theodoro_gasperin.yaml + roles: [author] + +argument_groups: + - name: Inputs + arguments: + - name: --input + alternatives: -i + type: file + description: Input VCF/BCF file. + required: true + + - name: Outputs + arguments: + - name: --output + alternatives: -o + direction: output + type: file + description: Output normalized VCF/BCF file. + required: true + + - name: Options + arguments: + + - name: --atomize + alternatives: -a + type: boolean_true + description: | + Decompose complex variants (e.g., MNVs become consecutive SNVs). + + - name: --atom_overlaps + type: string + choices: [".", "*"] + description: | + Use the star allele (*) for overlapping alleles or set to missing (.). + + - name: --check_ref + alternatives: -c + type: string + choices: ['e', 'w', 'x', 's'] + description: | + Check REF alleles and exit (e), warn (w), exclude (x), or set (s) bad sites. + + - name: --remove_duplicates + alternatives: -d + type: string + choices: ['snps', 'indels', 'both', 'all', 'exact', 'none'] + description: Remove duplicate snps, indels, both, all, exact matches, or none (old -D option). 
+ + - name: --fasta_ref + alternatives: -f + type: file + description: Reference fasta sequence file. + + - name: --force + type: boolean_true + description: | + Try to proceed even if malformed tags are encountered. + Experimental, use at your own risk. + + - name: --keep_sum + type: string + description: | + Keep vector sum constant when splitting multiallelics (see github issue #360). + + - name: --multiallelics + alternatives: -m + type: string + choices: ['+snps', '+indels', '+both', '+any', '-snps', '-indels', '-both', '-any'] + description: | + Split multiallelics (-) or join biallelics (+), type: snps, indels, both, any [default: both]. + + - name: --no_version + type: boolean_true + description: Do not append version and command line information to the header. + + - name: --do_not_normalize + alternatives: -N + type: boolean_true + description: Do not normalize indels (with -m or -c s). + + - name: --output_type + alternatives: --O + type: string + choices: ['u', 'z', 'b', 'v'] + description: | + Output type: + u: uncompressed BCF + z: compressed VCF + b: compressed BCF + v: uncompressed VCF + + - name: --old_rec_tag + type: string + description: Annotate modified records with INFO/STR indicating the original variant. + + - name: --regions + alternatives: --r + type: string + description: | + Restrict to comma-separated list of regions. + Following formats are supported: chr|chr:pos|chr:beg-end|chr:beg-[,…​]. + example: '20:1000000-2000000' + + - name: --regions_file + alternatives: --R + type: file + description: | + Restrict to regions listed in a file. + Regions can be specified either on a VCF, BED, or tab-delimited file (the default). + For more information check manual. 
+ + - name: --regions_overlap + type: string + choices: ['pos', 'record', 'variant', '0', '1', '2'] + description: | + This option controls how overlapping records are determined: + set to 'pos' or '0' if the VCF record has to have POS inside a region (this corresponds to the default behavior of -t/-T); + set to 'record' or '1' if also overlapping records with POS outside a region should be included (this is the default behavior of -r/-R, + and includes indels with POS at the end of a region, which are technically outside the region); + or set to 'variant' or '2' to include only true overlapping variation (compare the full VCF representation "TA>T-" vs the true sequence variation "A>-"). + + - name: --site_win + alternatives: -w + type: integer + description: | + Buffer for sorting lines that changed position during realignment. + + - name: --strict_filter + alternatives: -s + type: boolean_true + description: When merging (-m+), merged site is PASS only if all sites being merged PASS. + + - name: --targets + alternatives: -t + type: string + description: Similar to --regions but streams rather than index-jumps. + example: '20:1000000-2000000' + + - name: --targets_file + alternatives: -T + type: file + description: Similar to --regions_file but streams rather than index-jumps. + + - name: --targets_overlap + type: string + choices: ['pos', 'record', 'variant', '0', '1', '2'] + description: | + Include if POS in the region (0), record overlaps (1), variant overlaps (2). + Similar to --regions_overlap. 
+ +resources: + - type: bash_script + path: script.sh + +test_resources: + - type: bash_script + path: test.sh + +engines: + - type: docker + image: debian:stable-slim + setup: + - type: apt + packages: [bcftools, procps] + - type: docker + run: | + echo "bcftools: \"$(bcftools --version | grep 'bcftools' | sed -n 's/^bcftools //p')\"" > /var/software_versions.txt + test_setup: + - type: apt + packages: [tabix] + +runners: + - type: executable + - type: nextflow + + diff --git a/src/bcftools/bcftools_norm/help.txt b/src/bcftools/bcftools_norm/help.txt new file mode 100644 index 00000000..02e9761a --- /dev/null +++ b/src/bcftools/bcftools_norm/help.txt @@ -0,0 +1,41 @@ +``` +bcftools norm -h +``` + +About: Left-align and normalize indels; check if REF alleles match the reference; + split multiallelic sites into multiple rows; recover multiallelics from + multiple rows. +Usage: bcftools norm [options] + +Options: + -a, --atomize Decompose complex variants (e.g. MNVs become consecutive SNVs) + --atom-overlaps '*'|. Use the star allele (*) for overlapping alleles or set to missing (.) [*] + -c, --check-ref e|w|x|s Check REF alleles and exit (e), warn (w), exclude (x), or set (s) bad sites [e] + -D, --remove-duplicates Remove duplicate lines of the same type. + -d, --rm-dup TYPE Remove duplicate snps|indels|both|all|exact + -f, --fasta-ref FILE Reference sequence + --force Try to proceed even if malformed tags are encountered. Experimental, use at your own risk + --keep-sum TAG,.. 
Keep vector sum constant when splitting multiallelics (see github issue #360) + -m, --multiallelics -|+TYPE Split multiallelics (-) or join biallelics (+), type: snps|indels|both|any [both] + --no-version Do not append version and command line to the header + -N, --do-not-normalize Do not normalize indels (with -m or -c s) + --old-rec-tag STR Annotate modified records with INFO/STR indicating the original variant + -o, --output FILE Write output to a file [standard output] + -O, --output-type u|b|v|z[0-9] u/b: un/compressed BCF, v/z: un/compressed VCF, 0-9: compression level [v] + -r, --regions REGION Restrict to comma-separated list of regions + -R, --regions-file FILE Restrict to regions listed in a file + --regions-overlap 0|1|2 Include if POS in the region (0), record overlaps (1), variant overlaps (2) [1] + -s, --strict-filter When merging (-m+), merged site is PASS only if all sites being merged PASS + -t, --targets REGION Similar to -r but streams rather than index-jumps + -T, --targets-file FILE Similar to -R but streams rather than index-jumps + --targets-overlap 0|1|2 Include if POS in the region (0), record overlaps (1), variant overlaps (2) [0] + --threads INT Use multithreading with worker threads [0] + -w, --site-win INT Buffer for sorting lines which changed position during realignment [1000] + +Examples: + # normalize and left-align indels + bcftools norm -f ref.fa in.vcf + + # split multi-allelic sites + bcftools norm -m- in.vcf + diff --git a/src/bcftools/bcftools_norm/script.sh b/src/bcftools/bcftools_norm/script.sh new file mode 100644 index 00000000..0f43e593 --- /dev/null +++ b/src/bcftools/bcftools_norm/script.sh @@ -0,0 +1,49 @@ +#!/bin/bash + +## VIASH START +## VIASH END + +# Exit on error +set -eo pipefail + +# Unset parameters +unset_if_false=( + par_atomize + par_remove_duplicates + par_force + par_no_version + par_do_not_normalize + par_strict_filter +) + +for par in ${unset_if_false[@]}; do + test_val="${!par}" + [[ "$test_val" == 
"false" ]] && unset $par +done + +# Execute bcftools norm with the provided arguments +bcftools norm \ + ${par_atomize:+--atomize} \ + ${par_atom_overlaps:+--atom-overlaps "$par_atom_overlaps"} \ + ${par_check_ref:+-c "$par_check_ref"} \ + ${par_remove_duplicates:+-d "$par_remove_duplicates"} \ + ${par_fasta_ref:+-f "$par_fasta_ref"} \ + ${par_force:+--force} \ + ${par_keep_sum:+--keep-sum "$par_keep_sum"} \ + ${par_multiallelics:+-m "$par_multiallelics"} \ + ${par_no_version:+--no-version} \ + ${par_do_not_normalize:+-N} \ + ${par_old_rec_tag:+--old-rec-tag "$par_old_rec_tag"} \ + ${par_regions:+-r "$par_regions"} \ + ${par_regions_file:+-R "$par_regions_file"} \ + ${par_regions_overlap:+--regions-overlap "$par_regions_overlap"} \ + ${par_site_win:+-w "$par_site_win"} \ + ${par_strict_filter:+-s} \ + ${par_targets:+-t "$par_targets"} \ + ${par_targets_file:+-T "$par_targets_file"} \ + ${par_targets_overlap:+--targets-overlap "$par_targets_overlap"} \ + ${meta_cpus:+--threads "$meta_cpus"} \ + ${par_output_type:+-O "$par_output_type"} \ + -o $par_output \ + $par_input + diff --git a/src/bcftools/bcftools_norm/test.sh b/src/bcftools/bcftools_norm/test.sh new file mode 100644 index 00000000..254c7176 --- /dev/null +++ b/src/bcftools/bcftools_norm/test.sh @@ -0,0 +1,231 @@ +#!/bin/bash + +## VIASH START +## VIASH END + +# Exit on error +set -eo pipefail + +#test_data="$meta_resources_dir/test_data" + +############################################# +# helper functions +assert_file_exists() { + [ -f "$1" ] || { echo "File '$1' does not exist" && exit 1; } +} +assert_file_not_empty() { + [ -s "$1" ] || { echo "File '$1' is empty but shouldn't be" && exit 1; } +} +assert_file_contains() { + grep -q "$2" "$1" || { echo "File '$1' does not contain '$2'" && exit 1; } +} +assert_identical_content() { + diff -a "$2" "$1" \ + || (echo "Files are not identical!" 
&& exit 1) +} +############################################# + +# Create directories for tests +echo "Creating Test Data..." +TMPDIR=$(mktemp -d "$meta_temp_dir/XXXXXX") +function clean_up { + [[ -d "$TMPDIR" ]] && rm -r "$TMPDIR" +} +trap clean_up EXIT + +# Create test data +cat <<EOF > "$TMPDIR/example.vcf" +##fileformat=VCFv4.1 +##contig=<ID=1> +#CHROM POS ID REF ALT QUAL FILTER INFO FORMAT SAMPLE1 +1 752567 llama G C,A . . . . 1/2 +1 752722 . G A,AAA . . . . ./. +EOF + +bgzip -c $TMPDIR/example.vcf > $TMPDIR/example.vcf.gz +tabix -p vcf $TMPDIR/example.vcf.gz + +cat <<EOF > "$TMPDIR/reference.fa" +>1 +ATCGATCGATCGATCGATCGATCGATCGATCGATCGATCGATCGATCGATCGATCGATCG +>2 +CGATCGATCGATCGATCGATCGATCGATCGATCGATCGATCGATCGATCGATCGATCGAT +EOF + +# Test 1: Remove ID annotations +mkdir "$TMPDIR/test1" && pushd "$TMPDIR/test1" > /dev/null + +echo "> Run bcftools_norm" +"$meta_executable" \ + --input "../example.vcf" \ + --output "normalized.vcf" \ + --atomize \ + --atom_overlaps "." \ + &> /dev/null + +# checks +assert_file_exists "normalized.vcf" +assert_file_not_empty "normalized.vcf" +assert_file_contains "normalized.vcf" "bcftools_normCommand=norm --atomize --atom-overlaps .
-o normalized.vcf ../example.vcf" +echo "- test1 succeeded -" + +popd > /dev/null + +# Test 2: Check reference +mkdir "$TMPDIR/test2" && pushd "$TMPDIR/test2" > /dev/null + +echo "> Run bcftools_norm with remove duplicates" +"$meta_executable" \ + --input "../example.vcf" \ + --output "normalized.vcf" \ + --atomize \ + --remove_duplicates 'all' \ + &> /dev/null + +# checks +assert_file_exists "normalized.vcf" +assert_file_not_empty "normalized.vcf" +assert_file_contains "normalized.vcf" "norm --atomize -d all -o normalized.vcf ../example.vcf" +echo "- test2 succeeded -" + +popd > /dev/null + +# Test 3: Check reference and fasta reference +mkdir "$TMPDIR/test3" && pushd "$TMPDIR/test3" > /dev/null + +echo "> Run bcftools_norm with check reference and fasta reference" +"$meta_executable" \ + --input "../example.vcf" \ + --output "normalized.vcf" \ + --atomize \ + --fasta_ref "../reference.fa" \ + --check_ref "e" \ + &> /dev/null + +# checks +assert_file_exists "normalized.vcf" +assert_file_not_empty "normalized.vcf" +assert_file_contains "normalized.vcf" "norm --atomize -c e -f ../reference.fa -o normalized.vcf ../example.vcf" +echo "- test3 succeeded -" + +popd > /dev/null + +# Test 4: Multiallelics +mkdir "$TMPDIR/test4" && pushd "$TMPDIR/test4" > /dev/null + +echo "> Run bcftools_norm with multiallelics" +"$meta_executable" \ + --input "../example.vcf" \ + --output "normalized.vcf" \ + --multiallelics "-any" \ + --old_rec_tag "wazzaaa" \ + &> /dev/null + +# checks +assert_file_exists "normalized.vcf" +assert_file_not_empty "normalized.vcf" +assert_file_contains "normalized.vcf" "norm -m -any --old-rec-tag wazzaaa -o normalized.vcf ../example.vcf" +echo "- test4 succeeded -" + +popd > /dev/null + +# Test 5: Regions +mkdir "$TMPDIR/test5" && pushd "$TMPDIR/test5" > /dev/null + +echo "> Run bcftools_norm with regions" +"$meta_executable" \ + --input "../example.vcf.gz" \ + --output "normalized.vcf" \ + --atomize \ + --regions "1:752567-752722" \ + &> /dev/null + +# 
checks +assert_file_exists "normalized.vcf" +assert_file_not_empty "normalized.vcf" +assert_file_contains "normalized.vcf" "norm --atomize -r 1:752567-752722 -o normalized.vcf ../example.vcf.gz" +echo "- test5 succeeded -" + +popd > /dev/null + +# Test 6: Targets +mkdir "$TMPDIR/test6" && pushd "$TMPDIR/test6" > /dev/null + +echo "> Run bcftools_norm with targets" +"$meta_executable" \ + --input "../example.vcf" \ + --output "normalized.vcf" \ + --atomize \ + --targets "1:752567-752722" \ + &> /dev/null + +# checks +assert_file_exists "normalized.vcf" +assert_file_not_empty "normalized.vcf" +assert_file_contains "normalized.vcf" "norm --atomize -t 1:752567-752722 -o normalized.vcf ../example.vcf" +echo "- test6 succeeded -" + +popd > /dev/null + +# Test 7: Regions overlap +mkdir "$TMPDIR/test7" && pushd "$TMPDIR/test7" > /dev/null + +echo "> Run bcftools_norm with regions overlap" +"$meta_executable" \ + --input "../example.vcf" \ + --output "normalized.vcf" \ + --atomize \ + --regions_overlap "pos" \ + &> /dev/null + +# checks +assert_file_exists "normalized.vcf" +assert_file_not_empty "normalized.vcf" +assert_file_contains "normalized.vcf" "norm --atomize --regions-overlap pos -o normalized.vcf ../example.vcf" +echo "- test7 succeeded -" + +popd > /dev/null + +# Test 8: Strict filter and targets overlap +mkdir "$TMPDIR/test8" && pushd "$TMPDIR/test8" > /dev/null + +echo "> Run bcftools_norm with strict filter and targets overlap" +"$meta_executable" \ + --input "../example.vcf" \ + --output "normalized.vcf" \ + --atomize \ + --strict_filter \ + --targets_overlap "1" \ + &> /dev/null + +# checks +assert_file_exists "normalized.vcf" +assert_file_not_empty "normalized.vcf" +assert_file_contains "normalized.vcf" "norm --atomize -s --targets-overlap 1 -o normalized.vcf ../example.vcf" +echo "- test8 succeeded -" + +popd > /dev/null + +# Test 9: Do not normalize +mkdir "$TMPDIR/test9" && pushd "$TMPDIR/test9" > /dev/null + +echo "> Run bcftools_norm with do not 
normalize" +"$meta_executable" \ + --input "../example.vcf" \ + --output "normalized.vcf" \ + --do_not_normalize \ + --atomize \ + &> /dev/null + +# checks +assert_file_exists "normalized.vcf" +assert_file_not_empty "normalized.vcf" +assert_file_contains "normalized.vcf" "norm --atomize -N -o normalized.vcf ../example.vcf" +echo "- test9 succeeded -" + +popd > /dev/null + +echo "---- All tests succeeded! ----" +exit 0 + + From bd8ca889d13784c5a7502bb977c6659fe420d973 Mon Sep 17 00:00:00 2001 From: Theodoro Gasperin Terra Camargo <98555209+tgaspe@users.noreply.github.com> Date: Tue, 10 Sep 2024 16:17:22 +0200 Subject: [PATCH 16/28] Bcftools Concat (#145) * Initial Commint * Create help.txt * Update config.vsh.yaml * Update config.vsh.yaml * Update config.vsh.yaml * Update script.sh * add template for tests * Update test.sh * small changes in config file * adding more tests * adding more test * Update CHANGELOG.md --------- Co-authored-by: Jakub Majercik <57993790+jakubmajercik@users.noreply.github.com> --- CHANGELOG.md | 5 +- src/bcftools/bcftools_concat/config.vsh.yaml | 172 ++++++++++++++ src/bcftools/bcftools_concat/help.txt | 36 +++ src/bcftools/bcftools_concat/script.sh | 54 +++++ src/bcftools/bcftools_concat/test.sh | 227 +++++++++++++++++++ 5 files changed, 492 insertions(+), 2 deletions(-) create mode 100644 src/bcftools/bcftools_concat/config.vsh.yaml create mode 100644 src/bcftools/bcftools_concat/help.txt create mode 100644 src/bcftools/bcftools_concat/script.sh create mode 100644 src/bcftools/bcftools_concat/test.sh diff --git a/CHANGELOG.md b/CHANGELOG.md index 25850193..034e2422 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -42,8 +42,9 @@ * `rsem/rsem_prepare_reference`: Prepare transcript references for RSEM (PR #89). * `bcftools`: - - `bcftools_norm`: Left-align and normalize indels, check if REF alleles match the reference, split multiallelic sites into multiple rows; recover multiallelics from multiple rows (PR #144). 
- - `bcftools_annotate`: Add or remove annotations from a VCF/BCF file (PR #143). + - `bcftools/bcftools_concat`: Concatenate or combine VCF/BCF files (PR #145). + - `bcftools/bcftools_norm`: Left-align and normalize indels, check if REF alleles match the reference, split multiallelic sites into multiple rows; recover multiallelics from multiple rows (PR #144). + - `bcftools/bcftools_annotate`: Add or remove annotations from a VCF/BCF file (PR #143). - `bcftools/bcftools_stats`: Parses VCF or BCF and produces a txt stats file which can be plotted using plot-vcfstats (PR #142). - `bcftools/bcftools_sort`: Sorts BCF/VCF files by position and other criteria (PR #141). diff --git a/src/bcftools/bcftools_concat/config.vsh.yaml b/src/bcftools/bcftools_concat/config.vsh.yaml new file mode 100644 index 00000000..2bb32f1c --- /dev/null +++ b/src/bcftools/bcftools_concat/config.vsh.yaml @@ -0,0 +1,172 @@ +name: bcftools_concat +namespace: bcftools +description: | + Concatenate or combine VCF/BCF files. All source files must have the same sample + columns appearing in the same order. The program can be used, for example, to + concatenate chromosome VCFs into one VCF, or combine a SNP VCF and an indel + VCF into one. The input files must be sorted by chr and position. The files + must be given in the correct order to produce sorted VCF on output unless + the -a, --allow-overlaps option is specified. With the --naive option, the files + are concatenated without being recompressed, which is very fast. 
+keywords: [Concatenate, VCF, BCF] +links: + homepage: https://samtools.github.io/bcftools/ + documentation: https://samtools.github.io/bcftools/bcftools.html#concat + repository: https://github.com/samtools/bcftools + issue_tracker: https://github.com/samtools/bcftools/issues +references: + doi: https://doi.org/10.1093/gigascience/giab008 +license: MIT/Expat, GNU +requirements: + commands: [bcftools] +authors: + - __merge__: /src/_authors/theodoro_gasperin.yaml + roles: [author] + +argument_groups: + - name: Inputs + arguments: + - name: --input + alternatives: -i + type: file + multiple: true + description: Input VCF/BCF files to concatenate. + + - name: --file_list + alternatives: -f + type: file + description: Read the list of VCF/BCF files from a file, one file name per line. + + - name: Outputs + arguments: + - name: --output + alternatives: -o + direction: output + type: file + description: Output concatenated VCF/BCF file. + required: true + + - name: Options + arguments: + + - name: --allow_overlaps + alternatives: -a + type: boolean_true + description: | + First coordinate of the next file can precede last record of the current file. + + - name: --compact_PS + alternatives: -c + type: boolean_true + description: | + Do not output PS tag at each site, only at the start of a new phase set block. + + - name: --remove_duplicates + alternatives: -d + type: string + choices: ['snps', 'indels', 'both', 'all', 'exact', 'none'] + description: | + Output duplicate records present in multiple files only once: . + + - name: --ligate + alternatives: -l + type: boolean_true + description: Ligate phased VCFs by matching phase at overlapping haplotypes. + + - name: --ligate_force + type: boolean_true + description: Ligate even non-overlapping chunks, keep all sites. + + - name: --ligate_warn + type: boolean_true + description: Drop sites in imperfect overlaps. 
+ + - name: --no_version + type: boolean_true + description: Do not append version and command line information to the header. + + - name: --naive + alternatives: -n + type: boolean_true + description: Concatenate files without recompression, a header check compatibility is performed. + + - name: --naive_force + type: boolean_true + description: | + Same as --naive, but header compatibility is not checked. + Dangerous, use with caution. + + - name: --output_type + alternatives: -O + type: string + choices: ['u', 'z', 'b', 'v'] + description: | + Output type: + u: uncompressed BCF + z: compressed VCF + b: compressed BCF + v: uncompressed VCF + + - name: --min_PQ + alternatives: -q + type: integer + description: Break phase set if phasing quality is lower than . + example: 30 + + - name: --regions + alternatives: -r + type: string + description: | + Restrict to comma-separated list of regions. + Following formats are supported: chr|chr:pos|chr:beg-end|chr:beg-[,…​]. + example: '20:1000000-2000000' + + - name: --regions_file + alternatives: -R + type: file + description: | + Restrict to regions listed in a file. + Regions can be specified either on a VCF, BED, or tab-delimited file (the default). + For more information check manual. + + - name: --regions_overlap + type: string + choices: ['pos', 'record', 'variant', '0', '1', '2'] + description: | + This option controls how overlapping records are determined: + set to 'pos' or '0' if the VCF record has to have POS inside a region (this corresponds to the default behavior of -t/-T); + set to 'record' or '1' if also overlapping records with POS outside a region should be included (this is the default behavior of -r/-R, + and includes indels with POS at the end of a region, which are technically outside the region); + or set to 'variant' or '2' to include only true overlapping variation (compare the full VCF representation "TA>T-" vs the true sequence variation "A>-"). 
+ + #PS: Verbose seems to be broken in this version of bcftools + # - name: --verbose + # alternatives: -v + # type: integer + # choices: [0, 1] + # description: Set verbosity level. + +resources: + - type: bash_script + path: script.sh + +test_resources: + - type: bash_script + path: test.sh + +engines: + - type: docker + image: debian:stable-slim + setup: + - type: apt + packages: [bcftools, procps] + - type: docker + run: | + echo "bcftools: \"$(bcftools --version | grep 'bcftools' | sed -n 's/^bcftools //p')\"" > /var/software_versions.txt + test_setup: + - type: apt + packages: [tabix] + +runners: + - type: executable + - type: nextflow \ No newline at end of file diff --git a/src/bcftools/bcftools_concat/help.txt b/src/bcftools/bcftools_concat/help.txt new file mode 100644 index 00000000..fc0f1914 --- /dev/null +++ b/src/bcftools/bcftools_concat/help.txt @@ -0,0 +1,36 @@ +``` +bcftools concat -h +``` + +concat: option requires an argument -- 'h' + +About: Concatenate or combine VCF/BCF files. All source files must have the same sample + columns appearing in the same order. The program can be used, for example, to + concatenate chromosome VCFs into one VCF, or combine a SNP VCF and an indel + VCF into one. The input files must be sorted by chr and position. The files + must be given in the correct order to produce sorted VCF on output unless + the -a, --allow-overlaps option is specified. With the --naive option, the files + are concatenated without being recompressed, which is very fast. +Usage: bcftools concat [options] [ [...]] + +Options: + -a, --allow-overlaps First coordinate of the next file can precede last record of the current file. + -c, --compact-PS Do not output PS tag at each site, only at the start of a new phase set block. + -d, --rm-dups STRING Output duplicate records present in multiple files only once: + -D, --remove-duplicates Alias for -d exact + -f, --file-list FILE Read the list of files from a file. 
+ -l, --ligate Ligate phased VCFs by matching phase at overlapping haplotypes + --ligate-force Ligate even non-overlapping chunks, keep all sites + --ligate-warn Drop sites in imperfect overlaps + --no-version Do not append version and command line to the header + -n, --naive Concatenate files without recompression, a header check compatibility is performed + --naive-force Same as --naive, but header compatibility is not checked. Dangerous, use with caution. + -o, --output FILE Write output to a file [standard output] + -O, --output-type u|b|v|z[0-9] u/b: un/compressed BCF, v/z: un/compressed VCF, 0-9: compression level [v] + -q, --min-PQ INT Break phase set if phasing quality is lower than [30] + -r, --regions REGION Restrict to comma-separated list of regions + -R, --regions-file FILE Restrict to regions listed in a file + --regions-overlap 0|1|2 Include if POS in the region (0), record overlaps (1), variant overlaps (2) [1] + --threads INT Use multithreading with worker threads [0] + -v, --verbose 0|1 Set verbosity level [1] + diff --git a/src/bcftools/bcftools_concat/script.sh b/src/bcftools/bcftools_concat/script.sh new file mode 100644 index 00000000..5614cd1b --- /dev/null +++ b/src/bcftools/bcftools_concat/script.sh @@ -0,0 +1,54 @@ +#!/bin/bash + +## VIASH START +## VIASH END + +# Exit on error +set -eo pipefail + +# Unset parameters +unset_if_false=( + par_allow_overlaps + par_compact_PS + par_ligate + par_ligate_force + par_ligate_warn + par_no_version + par_naive + par_naive_force +) + +for par in ${unset_if_false[@]}; do + test_val="${!par}" + [[ "$test_val" == "false" ]] && unset $par +done + +# Check to see whether the par_input or the par_file_list is set +if [[ -z "${par_input}" && -z "${par_file_list}" ]]; then + echo "Error: One of the parameters '--input' or '--file_list' must be used." 
+ exit 1 +fi + +# Create input array +IFS=";" read -ra input <<< "$par_input" + +# Execute bcftools concat with the provided arguments +bcftools concat \ + ${par_allow_overlaps:+-a} \ + ${par_compact_PS:+-c} \ + ${par_remove_duplicates:+-d "$par_remove_duplicates"} \ + ${par_ligate:+-l} \ + ${par_ligate_force:+--ligate-force} \ + ${par_ligate_warn:+--ligate-warn} \ + ${par_no_version:+--no-version} \ + ${par_naive:+-n} \ + ${par_naive_force:+--naive-force} \ + ${par_output_type:+-O "$par_output_type"} \ + ${par_min_PQ:+-q "$par_min_PQ"} \ + ${par_regions:+-r "$par_regions"} \ + ${par_regions_file:+-R "$par_regions_file"} \ + ${par_regions_overlap:+--regions-overlap "$par_regions_overlap"} \ + ${meta_cpus:+--threads "$meta_cpus"} \ + -o "$par_output" \ + ${par_file_list:+-f "$par_file_list"} \ + "${input[@]}" \ No newline at end of file diff --git a/src/bcftools/bcftools_concat/test.sh b/src/bcftools/bcftools_concat/test.sh new file mode 100644 index 00000000..3c1c7bb6 --- /dev/null +++ b/src/bcftools/bcftools_concat/test.sh @@ -0,0 +1,227 @@ +#!/bin/bash + +## VIASH START +## VIASH END + +# Exit on error +set -eo pipefail + +#test_data="$meta_resources_dir/test_data" + +############################################# +# helper functions +assert_file_exists() { + [ -f "$1" ] || { echo "File '$1' does not exist" && exit 1; } +} +assert_file_not_empty() { + [ -s "$1" ] || { echo "File '$1' is empty but shouldn't be" && exit 1; } +} +assert_file_contains() { + grep -q "$2" "$1" || { echo "File '$1' does not contain '$2'" && exit 1; } +} +assert_identical_content() { + diff -a "$2" "$1" \ + || (echo "Files are not identical!" && exit 1) +} +############################################# + +# Create directories for tests +echo "Creating Test Data..."
+TMPDIR=$(mktemp -d "$meta_temp_dir/XXXXXX") +function clean_up { + [[ -d "$TMPDIR" ]] && rm -r "$TMPDIR" +} +trap clean_up EXIT + +# Create test data +cat <<EOF > "$TMPDIR/example.vcf" +##fileformat=VCFv4.1 +##contig=<ID=1> +#CHROM POS ID REF ALT QUAL FILTER INFO FORMAT SAMPLE1 +1 752567 llama G C,A 15 . . . 1/2 +1 752752 . G A,AAA 20 . . . ./. +EOF + +bgzip -c $TMPDIR/example.vcf > $TMPDIR/example.vcf.gz +tabix -p vcf $TMPDIR/example.vcf.gz + +cat <<EOF > "$TMPDIR/example_2.vcf" +##fileformat=VCFv4.1 +##contig=<ID=1> +#CHROM POS ID REF ALT QUAL FILTER INFO FORMAT SAMPLE1 +1 752569 cat G C,A 15 . . . 1/2 +1 752739 . G A,AAA 20 . . . ./. +EOF + +bgzip -c $TMPDIR/example_2.vcf > $TMPDIR/example_2.vcf.gz +tabix -p vcf $TMPDIR/example_2.vcf.gz + +cat <<EOF > "$TMPDIR/file_list.txt" +$TMPDIR/example.vcf.gz +$TMPDIR/example_2.vcf.gz +EOF + +# Test 1: Default test +mkdir "$TMPDIR/test1" && pushd "$TMPDIR/test1" > /dev/null + +echo "> Run bcftools_concat default test" +"$meta_executable" \ + --input "../example.vcf" \ + --input "../example_2.vcf" \ + --output "concatenated.vcf" \ + &> /dev/null + +# checks +assert_file_exists "concatenated.vcf" +assert_file_not_empty "concatenated.vcf" +assert_file_contains "concatenated.vcf" "concat -o concatenated.vcf ../example.vcf ../example_2.vcf" +echo "- test1 succeeded -" + +popd > /dev/null + +# Test 2: Allow overlaps, compact PS and remove duplicates +mkdir "$TMPDIR/test2" && pushd "$TMPDIR/test2" > /dev/null + +echo "> Run bcftools_concat test with allow overlaps, and remove duplicates" +"$meta_executable" \ + --input "../example.vcf.gz" \ + --input "../example_2.vcf.gz" \ + --output "concatenated.vcf" \ + --allow_overlaps \ + --remove_duplicates 'none' \ + &> /dev/null + +# checks +assert_file_exists "concatenated.vcf" +assert_file_not_empty "concatenated.vcf" +assert_file_contains "concatenated.vcf" "concat -a -d none -o concatenated.vcf ../example.vcf.gz ../example_2.vcf.gz" +echo "- test2 succeeded -" + +popd > /dev/null + + +# Test 3: Ligate, ligate
force and ligate warn +mkdir "$TMPDIR/test3" && pushd "$TMPDIR/test3" > /dev/null + +echo "> Run bcftools_concat test with ligate, ligate force and ligate warn" +"$meta_executable" \ + --input "../example.vcf.gz" \ + --input "../example_2.vcf.gz" \ + --output "concatenated.vcf" \ + --ligate \ + --compact_PS \ + &> /dev/null + + +# checks +assert_file_exists "concatenated.vcf" +assert_file_not_empty "concatenated.vcf" +assert_file_contains "concatenated.vcf" "concat -c -l -o concatenated.vcf ../example.vcf.gz ../example_2.vcf.gz" +echo "- test3 succeeded -" + +popd > /dev/null + +# Test 4: file list with ligate force and ligate warn +mkdir "$TMPDIR/test4" && pushd "$TMPDIR/test4" > /dev/null + +echo "> Run bcftools_concat test with file list, ligate force and ligate warn" +"$meta_executable" \ + --file_list "../file_list.txt" \ + --output "concatenated.vcf" \ + --ligate_force \ + &> /dev/null + +# checks +assert_file_exists "concatenated.vcf" +assert_file_not_empty "concatenated.vcf" +assert_file_contains "concatenated.vcf" "concat --ligate-force -o concatenated.vcf -f ../file_list.txt" +echo "- test4 succeeded -" + +popd > /dev/null + +# Test 5: ligate warn and naive +mkdir "$TMPDIR/test5" && pushd "$TMPDIR/test5" > /dev/null + +echo "> Run bcftools_concat test with ligate warn and naive" +"$meta_executable" \ + --input "../example.vcf.gz" \ + --input "../example_2.vcf.gz" \ + --output "concatenated.vcf.gz" \ + --ligate_warn \ + --naive \ + &> /dev/null + +bgzip -d concatenated.vcf.gz + +# checks +assert_file_exists "concatenated.vcf" +assert_file_not_empty "concatenated.vcf" +assert_file_contains "concatenated.vcf" "##fileformat=VCFv4.1" +echo "- test5 succeeded -" + +popd > /dev/null + +# Test 6: minimal PQ +mkdir "$TMPDIR/test6" && pushd "$TMPDIR/test6" > /dev/null + +echo "> Run bcftools_concat test with minimal PQ" +"$meta_executable" \ + --input "../example.vcf.gz" \ + --input "../example_2.vcf.gz" \ + --output "concatenated.vcf" \ + --min_PQ 20 \ + &> 
/dev/null + +# checks +assert_file_exists "concatenated.vcf" +assert_file_not_empty "concatenated.vcf" +assert_file_contains "concatenated.vcf" "concat -q 20 -o concatenated.vcf ../example.vcf.gz ../example_2.vcf.gz" +echo "- test6 succeeded -" + +popd > /dev/null + +# Test 7: regions +mkdir "$TMPDIR/test7" && pushd "$TMPDIR/test7" > /dev/null + +echo "> Run bcftools_concat test with regions" +"$meta_executable" \ + --input "../example.vcf.gz" \ + --input "../example_2.vcf.gz" \ + --output "concatenated.vcf" \ + --allow_overlaps \ + --regions "1:752569-752739" \ + &> /dev/null + +# checks +assert_file_exists "concatenated.vcf" +assert_file_not_empty "concatenated.vcf" +assert_file_contains "concatenated.vcf" "concat -a -r 1:752569-752739 -o concatenated.vcf ../example.vcf.gz ../example_2.vcf.gz" +echo "- test7 succeeded -" + +popd > /dev/null + +# Test 8: regions overlap +mkdir "$TMPDIR/test8" && pushd "$TMPDIR/test8" > /dev/null + +echo "> Run bcftools_concat test with regions overlap" +"$meta_executable" \ + --input "../example.vcf.gz" \ + --input "../example_2.vcf.gz" \ + --output "concatenated.vcf" \ + --allow_overlaps \ + --regions_overlap 'pos' \ + &> /dev/null + +# checks +assert_file_exists "concatenated.vcf" +assert_file_not_empty "concatenated.vcf" +assert_file_contains "concatenated.vcf" "concat -a --regions-overlap pos -o concatenated.vcf ../example.vcf.gz ../example_2.vcf.gz" +echo "- test8 succeeded -" + +popd > /dev/null + +echo "---- All tests succeeded! 
----" +exit 0 + + + From 3f6a1b52f8aedb15ec3bd6e243de3267a94e4e2e Mon Sep 17 00:00:00 2001 From: Emma Rousseau Date: Fri, 13 Sep 2024 09:08:23 +0200 Subject: [PATCH 17/28] Umitools prepare for rsem (#148) --- CHANGELOG.md | 3 +- .../umi_tools_prepareforrsem/config.vsh.yaml | 107 +++++++ .../umi_tools_prepareforrsem/help.txt | 54 ++++ .../prepare-for-rsem.py | 271 ++++++++++++++++++ .../umi_tools_prepareforrsem/script.sh | 32 +++ .../umi_tools_prepareforrsem/test.sh | 55 ++++ .../test_data/log.log | 103 +++++++ .../test_data/test.bam | Bin 0 -> 11123 bytes .../test_data/test.sam | 119 ++++++++ .../test_data/test_dedup.bam | Bin 0 -> 18822 bytes .../test_data/test_dedup.sam | 201 +++++++++++++ 11 files changed, 944 insertions(+), 1 deletion(-) create mode 100644 src/umi_tools/umi_tools_prepareforrsem/config.vsh.yaml create mode 100644 src/umi_tools/umi_tools_prepareforrsem/help.txt create mode 100644 src/umi_tools/umi_tools_prepareforrsem/prepare-for-rsem.py create mode 100755 src/umi_tools/umi_tools_prepareforrsem/script.sh create mode 100644 src/umi_tools/umi_tools_prepareforrsem/test.sh create mode 100644 src/umi_tools/umi_tools_prepareforrsem/test_data/log.log create mode 100644 src/umi_tools/umi_tools_prepareforrsem/test_data/test.bam create mode 100644 src/umi_tools/umi_tools_prepareforrsem/test_data/test.sam create mode 100644 src/umi_tools/umi_tools_prepareforrsem/test_data/test_dedup.bam create mode 100644 src/umi_tools/umi_tools_prepareforrsem/test_data/test_dedup.sam diff --git a/CHANGELOG.md b/CHANGELOG.md index 034e2422..d88d0996 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -137,7 +137,8 @@ - `samtools/samtools_fastq`: Converts a SAM/BAM/CRAM file to FASTA (PR #53). * `umi_tools`: - -`umi_tools/umi_tools_extract`: Flexible removal of UMI sequences from fastq reads (PR #71). + - `umi_tools/umi_tools_extract`: Flexible removal of UMI sequences from fastq reads (PR #71). 
+ - `umi_tools/umi_tools_prepareforrsem`: Fix paired-end reads in name sorted BAM file to prepare for RSEM (PR #148). * `falco`: A C++ drop-in replacement of FastQC to assess the quality of sequence read data (PR #43). diff --git a/src/umi_tools/umi_tools_prepareforrsem/config.vsh.yaml b/src/umi_tools/umi_tools_prepareforrsem/config.vsh.yaml new file mode 100644 index 00000000..ceac2052 --- /dev/null +++ b/src/umi_tools/umi_tools_prepareforrsem/config.vsh.yaml @@ -0,0 +1,107 @@ +name: "umi_tools_prepareforrsem" +namespace: "umi_tools" +description: Make the output from umi-tools dedup or group compatible with RSEM +keywords: [umi_tools, rsem, bam, sam] +links: + homepage: https://umi-tools.readthedocs.io/en/latest/ + documentation: https://umi-tools.readthedocs.io/en/latest/reference/extract.html + repository: https://github.com/CGATOxford/UMI-tools +references: + doi: 10.1101/gr.209601.116 +license: MIT + +argument_groups: +- name: "Input" + arguments: + - name: "--input" + alternatives: ["-I", "--stdin"] + type: file + required: true + example: $id.transcriptome.bam + +- name: "Output" + arguments: + - name: "--output" + alternatives: ["-S", "--stdout"] + type: file + direction: output + example: $id.transcriptome_sorted.bam + - name: "--log" + alternatives: ["-L"] + type: file + direction: output + description: File with logging information [default = stdout]. + - name: "--error" + alternatives: ["-E"] + type: file + direction: output + description: File with error information [default = stderr]. + - name: "--log2stderr" + type: boolean_true + description: Send logging information to stderr [default = False]. + - name: "--temp_dir" + type: string + description: | + Directory for temporary files. If not set, the bash environmental variable + TMPDIR is used. + - name: "--compresslevel" + type: integer + description: | + Level of Gzip compression to use. Default (6) matchesGNU gzip rather than python + gzip default (which is 9). 
+ +- name: "Options" + arguments: + - name: "--tags" + type: string + description: | + Comma-seperated list of tags to transfer from read1 to read2 (Default: 'UG,BX') + example: "UG,BX" + - name: "--sam" + type: boolean_true + description: Input and output SAM rather than BAM. + - name: "--timeit" + type: string + description: | + Store timeing information in file [none]. + - name: "--timeit_name" + type: string + description: | + Name in timing file for this class of jobs [all]. + - name: "--timeit_header" + type: boolean_true + description: Add header for timing information [none]. + - name: "--verbose" + alternatives: ["-v"] + type: integer + description: | + Loglevel [1]. The higher, the more output. + - name: "--random_seed" + type: integer + description: | + Random seed to initialize number generator with [none]. + + +resources: + - type: bash_script + path: script.sh + # copied from https://github.com/nf-core/rnaseq/blob/3.12.0/bin/prepare-for-rsem.py + - path: prepare-for-rsem.py +test_resources: + - type: bash_script + path: test.sh + - type: file + path: test_data + +engines: + - type: docker + image: quay.io/biocontainers/umi_tools:1.1.5--py38h0020b31_3 + setup: + - type: docker + run: | + umi_tools -v | sed 's/ version//g' > /var/software_versions.txt + + +runners: +- type: executable +- type: nextflow \ No newline at end of file diff --git a/src/umi_tools/umi_tools_prepareforrsem/help.txt b/src/umi_tools/umi_tools_prepareforrsem/help.txt new file mode 100644 index 00000000..efaf4de6 --- /dev/null +++ b/src/umi_tools/umi_tools_prepareforrsem/help.txt @@ -0,0 +1,54 @@ +``` +umi_tools prepare-for-rsem --help +``` + +prepare_for_rsem - make output from dedup or group compatible with RSEM + +Usage: umi_tools prepare_for_rsem [OPTIONS] [--stdin=IN_BAM] [--stdout=OUT_BAM] + + note: If --stdout is ommited, standard out is output. 
To + generate a valid BAM file on standard out, please + redirect log with --log=LOGFILE or --log2stderr + +For full UMI-tools documentation, see https://umi-tools.readthedocs.io/en/latest/ + +Options: + --version show program's version number and exit + + RSEM preparation specific options: + --tags=TAGS Comma-seperated list of tags to transfer from read1 to + read2 + --sam input and output SAM rather than BAM + + input/output options: + -I FILE, --stdin=FILE + file to read stdin from [default = stdin]. + -L FILE, --log=FILE + file with logging information [default = stdout]. + -E FILE, --error=FILE + file with error information [default = stderr]. + -S FILE, --stdout=FILE + file where output is to go [default = stdout]. + --temp-dir=FILE Directory for temporary files. If not set, the bash + environmental variable TMPDIR is used[default = None]. + --log2stderr send logging information to stderr [default = False]. + --compresslevel=COMPRESSLEVEL + Level of Gzip compression to use. Default (6) + matchesGNU gzip rather than python gzip default (which + is 9) + + profiling options: + --timeit=TIMEIT_FILE + store timeing information in file [none]. + --timeit-name=TIMEIT_NAME + name in timing file for this class of jobs [all]. + --timeit-header add header for timing information [none]. + + common options: + -v LOGLEVEL, --verbose=LOGLEVEL + loglevel [1]. The higher, the more output. + -h, --help output short help (command line options only). + --help-extended Output full documentation + --random-seed=RANDOM_SEED + random seed to initialize number generator with + [none]. 
\ No newline at end of file diff --git a/src/umi_tools/umi_tools_prepareforrsem/prepare-for-rsem.py b/src/umi_tools/umi_tools_prepareforrsem/prepare-for-rsem.py new file mode 100644 index 00000000..b53d30ac --- /dev/null +++ b/src/umi_tools/umi_tools_prepareforrsem/prepare-for-rsem.py @@ -0,0 +1,271 @@ +#!/usr/bin/env python3 + +""" +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +Credits +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +This script is a clone of the "prepare-for-rsem.py" script written by +Ian Sudbury, Tom Smith and other contributors to the UMI-tools package: +https://github.com/CGATOxford/UMI-tools + +It has been included here to address problems encountered with +Salmon quant and RSEM as discussed in the issue below: +https://github.com/CGATOxford/UMI-tools/issues/465 + +When the "umi_tools prepare-for-rsem" command becomes available in an official +UMI-tools release this script will be replaced and deprecated. + +Commit: +https://github.com/CGATOxford/UMI-tools/blob/bf8608d6a172c5ca0dcf33c126b4e23429177a72/umi_tools/prepare-for-rsem.py + +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +prepare_for_rsem - make the output from dedup or group compatible with RSEM +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +The SAM format specification states that the mnext and mpos fields should point +to the primary alignment of a read's mate. However, not all aligners adhere to +this standard. In addition, the RSEM software requires that the mate of a read1 +appears directly after it in its input BAM. This requires that there is exactly +one read1 alignment for every read2 and vice versa. + +In general (except in a few edge cases) UMI tools outputs only the read2 to that +corresponds to the read specified in the mnext and mpos positions of a selected +read1, and only outputs this read once, even if multiple read1s point to it. 
+This makes UMI-tools outputs incompatible with RSEM. This script takes the output +from dedup or groups and ensures that each read1 has exactly one read2 (and vice +versa), that read2 always appears directly after read1,and that pairs point to +each other (note this is technically not valid SAM format). Copy any specified +tags from read1 to read2 if they are present (by default, UG and BX, the unique +group and correct UMI tags added by _group_) + +Input must to name sorted. + + +https://raw.githubusercontent.com/CGATOxford/UMI-tools/master/LICENSE + +""" + +from umi_tools import Utilities as U +from collections import defaultdict, Counter +import pysam +import sys + + +usage = """ +prepare_for_rsem - make output from dedup or group compatible with RSEM + +Usage: umi_tools prepare_for_rsem [OPTIONS] [--stdin=IN_BAM] [--stdout=OUT_BAM] + + note: If --stdout is omited, standard out is output. To + generate a valid BAM file on standard out, please + redirect log with --log=LOGFILE or --log2stderr """ + + +def chunk_bam(bamfile): + """Take in a iterator of pysam.AlignmentSegment entries and yield + lists of reads that all share the same name""" + + last_query_name = None + output_buffer = list() + + for read in bamfile: + if last_query_name is not None and last_query_name != read.query_name: + yield (output_buffer) + output_buffer = list() + + last_query_name = read.query_name + output_buffer.append(read) + + yield (output_buffer) + + +def copy_tags(tags, read1, read2): + """Given a list of tags, copies the values of these tags from read1 + to read2, if the tag is set""" + + for tag in tags: + try: + read1_tag = read1.get_tag(tag, with_value_type=True) + read2.set_tag(tag, value=read1_tag[0], value_type=read1_tag[1]) + except KeyError: + pass + + return read2 + + +def pick_mate(read, template_dict, mate_key): + """Find the mate of read in the template dict using key. 
It will retrieve + all reads at that key, and then scan to pick the one that refers to _read_ + as it's mate. If there is no such read, it picks a first one it comes to""" + + mate = None + + # get a list of secondary reads at the correct alignment position + potential_mates = template_dict[not read.is_read1][mate_key] + + # search through one at a time to find a read that points to the current read + # as its mate. + for candidate_mate in potential_mates: + if ( + candidate_mate.next_reference_name == read.reference_name + and candidate_mate.next_reference_start == read.pos + ): + mate = candidate_mate + + # if no such read is found, then pick any old secondary alignment at that position + # note: this happens when UMI-tools outputs the wrong read as something's pair. + if mate is None and len(potential_mates) > 0: + mate = potential_mates[0] + + return mate + + +def main(argv=None): + if argv is None: + argv = sys.argv + + # setup command line parser + parser = U.OptionParser(version="%prog version: $Id$", usage=usage, description=globals()["__doc__"]) + group = U.OptionGroup(parser, "RSEM preparation specific options") + + group.add_option( + "--tags", + dest="tags", + type="string", + default="UG,BX", + help="Comma-separated list of tags to transfer from read1 to read2", + ) + group.add_option( + "--sam", dest="sam", action="store_true", default=False, help="input and output SAM rather than BAM" + ) + + parser.add_option_group(group) + + # add common options (-h/--help, ...) 
and parse command line + (options, args) = U.Start( + parser, argv=argv, add_group_dedup_options=False, add_umi_grouping_options=False, add_sam_options=False + ) + + skipped_stats = Counter() + + if options.stdin != sys.stdin: + in_name = options.stdin.name + options.stdin.close() + else: + in_name = "-" + + if options.sam: + mode = "" + else: + mode = "b" + + inbam = pysam.AlignmentFile(in_name, "r" + mode) + + if options.stdout != sys.stdout: + out_name = options.stdout.name + options.stdout.close() + else: + out_name = "-" + + outbam = pysam.AlignmentFile(out_name, "w" + mode, template=inbam) + + options.tags = options.tags.split(",") + + for template in chunk_bam(inbam): + assert len(set(r.query_name for r in template)) == 1 + current_template = {True: defaultdict(list), False: defaultdict(list)} + + for read in template: + key = (read.reference_name, read.pos, not read.is_secondary) + current_template[read.is_read1][key].append(read) + + output = set() + + for read in template: + mate = None + + # if this read is a non_primary alignment, we first want to check if it has a mate + # with the non-primary alignment flag set. + + mate_key_primary = True + mate_key_secondary = (read.next_reference_name, read.next_reference_start, False) + + # First look for a read that has the same primary/secondary status + # as read (i.e. secondary mate for secondary read, and primary mate + # for primary read) + mate_key = (read.next_reference_name, read.next_reference_start, read.is_secondary) + mate = pick_mate(read, current_template, mate_key) + + # If none was found then look for the opposite (primary mate of secondary + # read or seconadary mate of primary read) + if mate is None: + mate_key = (read.next_reference_name, read.next_reference_start, not read.is_secondary) + mate = pick_mate(read, current_template, mate_key) + + # If we still don't have a mate, then their can't be one? 
+ if mate is None: + skipped_stats["no_mate"] += 1 + U.warn( + "Alignment {} has no mate -- skipped".format( + "\t".join(map(str, [read.query_name, read.flag, read.reference_name, int(read.pos)])) + ) + ) + continue + + # because we might want to make changes to the read, but not have those changes reflected + # if we need the read again,we copy the read. This is only way I can find to do this. + read = pysam.AlignedSegment().from_dict(read.to_dict(), read.header) + mate = pysam.AlignedSegment().from_dict(mate.to_dict(), read.header) + + # Make it so that if our read is secondary, the mate is also secondary. We don't make the + # mate primary if the read is primary because we would otherwise end up with mulitple + # primary alignments. + if read.is_secondary: + mate.is_secondary = True + + # In a situation where there is already one mate for each read, then we will come across + # each pair twice - once when we scan read1 and once when we scan read2. Thus we need + # to make sure we don't output something already output. 
#!/bin/bash

set -eo pipefail

# viash passes boolean_true flags through as the literal string "false";
# unset those variables so the ${var:+...} expansions below omit the
# corresponding command-line options entirely.
# NOTE(review): par_error is a file option, not a boolean — presumably listed
# here defensively; harmless, but confirm against config.vsh.yaml.
unset_if_false=(
  par_sam
  par_error
  par_log2stderr
  par_timeit_header
)

for var in "${unset_if_false[@]}"; do
  test_val="${!var}"
  [[ "$test_val" == "false" ]] && unset "$var"
done

# Bug fix: the previous version was missing the line-continuation backslash
# after the --compresslevel argument, which terminated the command there and
# silently dropped every --timeit* option (and, when one was set, tried to
# run it as a separate bogus command).
umi_tools prepare-for-rsem \
  ${par_log:+--log "${par_log}"} \
  ${par_tags:+--tags "${par_tags}"} \
  ${par_sam:+--sam} \
  --stdin="${par_input}" \
  ${par_output:+--stdout "${par_output}"} \
  ${par_error:+--error "${par_error}"} \
  ${par_temp_dir:+--temp-dir "${par_temp_dir}"} \
  ${par_log2stderr:+--log2stderr} \
  ${par_verbose:+--verbose "${par_verbose}"} \
  ${par_random_seed:+--random-seed "${par_random_seed}"} \
  ${par_compresslevel:+--compresslevel "${par_compresslevel}"} \
  ${par_timeit:+--timeit "${par_timeit}"} \
  ${par_timeit_name:+--timeit-name "${par_timeit_name}"} \
  ${par_timeit_header:+--timeit-header}
-s "$test_dir/test_output.bam" ]] && echo "Output file is empty" && exit 1 + +echo ">>> Check if output is correct" +diff <(samtools view "$test_dir/test_output.bam") <(samtools view "$test_dir/test.bam") || (echo "Output is incorrect" && exit 1) + +################################################################################ +echo ">>> Test 3: with --log:" + +"${meta_executable}" \ + --log "$test_dir/test_log.log" \ + --input "$test_dir/test_dedup.sam" \ + --output "$test_dir/test_output.sam" \ + --sam + +echo ">>> Check if output is present" +[[ ! -f "$test_dir/test_output.sam" ]] && echo "Output file not found" && exit 1 +[[ ! -s "$test_dir/test_output.sam" ]] && echo "Output file is empty" && exit 1 +[[ ! -f "$test_dir/test_log.log" ]] && echo "Log file not found" && exit 1 +[[ ! -s "$test_dir/test_log.log" ]] && echo "Log file is empty" && exit 1 + +echo ">>> Check if log file is correct" +diff <(grep -v '^#' "$test_dir/test_log.log" | sed 's/^[0-9-]* [0-9:]*,[0-9]\{3\} //') <(grep -v '^#' "$test_dir/log.log" | sed 's/^[0-9-]* [0-9:]*,[0-9]\{3\} //') || (echo "Log file is incorrect" && exit 1) + +echo ">>> All test succeeded" +exit 0 \ No newline at end of file diff --git a/src/umi_tools/umi_tools_prepareforrsem/test_data/log.log b/src/umi_tools/umi_tools_prepareforrsem/test_data/log.log new file mode 100644 index 00000000..e4b56e57 --- /dev/null +++ b/src/umi_tools/umi_tools_prepareforrsem/test_data/log.log @@ -0,0 +1,103 @@ +# UMI-tools version: 1.1.5 +# output generated by prepare-for-rsem.py --log test_data/log.log --sam --stdin test_data/test_dedup.sam --stdout jnfgioeurg.sam +# job started at Tue Sep 10 06:43:30 2024 on 4855b4607095 -- 07ae7548-56e8-4772-9b48-7406710fd838 +# pid: 28, system: Linux 6.10.0-linuxkit #1 SMP PREEMPT_DYNAMIC Wed Jul 17 10:54:05 UTC 2024 x86_64 +# compresslevel : 6 +# log2stderr : False +# loglevel : 1 +# random_seed : None +# sam : True +# short_help : None +# stderr : <_io.TextIOWrapper name='' mode='w' encoding='utf-8'> 
+# stdin : <_io.TextIOWrapper name='test_data/test_dedup.sam' mode='r' encoding='UTF-8'> +# stdlog : <_io.TextIOWrapper name='test_data/log.log' mode='a' encoding='UTF-8'> +# stdout : <_io.TextIOWrapper name='jnfgioeurg.sam' mode='w' encoding='UTF-8'> +# tags : UG,BX +# timeit_file : None +# timeit_header : None +# timeit_name : all +# tmpdir : None +2024-09-10 06:43:30,918 WARNING Alignment ERR5069949.114870 99 MT192765.1 642 has no mate -- skipped +2024-09-10 06:43:30,918 WARNING Alignment ERR5069949.147998 163 MT192765.1 673 has no mate -- skipped +2024-09-10 06:43:30,919 WARNING Alignment ERR5069949.114870 147 MT192765.1 747 has no mate -- skipped +2024-09-10 06:43:30,920 WARNING Alignment ERR5069949.147998 83 MT192765.1 918 has no mate -- skipped +2024-09-10 06:43:30,921 WARNING Alignment ERR5069949.184542 99 MT192765.1 1054 has no mate -- skipped +2024-09-10 06:43:30,921 WARNING Alignment ERR5069949.184542 147 MT192765.1 1254 has no mate -- skipped +2024-09-10 06:43:30,922 WARNING Alignment ERR5069949.376959 99 MT192765.1 4104 has no mate -- skipped +2024-09-10 06:43:30,924 WARNING Alignment ERR5069949.376959 147 MT192765.1 4189 has no mate -- skipped +2024-09-10 06:43:30,925 WARNING Alignment ERR5069949.532979 99 MT192765.1 5567 has no mate -- skipped +2024-09-10 06:43:30,926 WARNING Alignment ERR5069949.540529 163 MT192765.1 5569 has no mate -- skipped +2024-09-10 06:43:30,926 WARNING Alignment ERR5069949.532979 147 MT192765.1 5620 has no mate -- skipped +2024-09-10 06:43:30,927 WARNING Alignment ERR5069949.540529 83 MT192765.1 5658 has no mate -- skipped +2024-09-10 06:43:30,930 WARNING Alignment ERR5069949.856527 99 MT192765.1 10117 has no mate -- skipped +2024-09-10 06:43:30,931 WARNING Alignment ERR5069949.870926 99 MT192765.1 10117 has no mate -- skipped +2024-09-10 06:43:30,931 WARNING Alignment ERR5069949.856527 147 MT192765.1 10198 has no mate -- skipped +2024-09-10 06:43:30,931 WARNING Alignment ERR5069949.885966 99 MT192765.1 10229 has no mate -- 
skipped +2024-09-10 06:43:30,932 WARNING Alignment ERR5069949.870926 147 MT192765.1 10244 has no mate -- skipped +2024-09-10 06:43:30,932 WARNING Alignment ERR5069949.885966 147 MT192765.1 10276 has no mate -- skipped +2024-09-10 06:43:30,932 WARNING Alignment ERR5069949.937422 99 MT192765.1 10421 has no mate -- skipped +2024-09-10 06:43:30,933 WARNING Alignment ERR5069949.937422 147 MT192765.1 10590 has no mate -- skipped +2024-09-10 06:43:30,934 WARNING Alignment ERR5069949.1066259 99 MT192765.1 11336 has no mate -- skipped +2024-09-10 06:43:30,935 WARNING Alignment ERR5069949.1062611 163 MT192765.1 11426 has no mate -- skipped +2024-09-10 06:43:30,936 WARNING Alignment ERR5069949.1067032 163 MT192765.1 11433 has no mate -- skipped +2024-09-10 06:43:30,936 WARNING Alignment ERR5069949.1062611 83 MT192765.1 11453 has no mate -- skipped +2024-09-10 06:43:30,936 WARNING Alignment ERR5069949.1066259 147 MT192765.1 11479 has no mate -- skipped +2024-09-10 06:43:30,937 WARNING Alignment ERR5069949.1067032 83 MT192765.1 11480 has no mate -- skipped +2024-09-10 06:43:30,938 WARNING Alignment ERR5069949.1258508 163 MT192765.1 12424 has no mate -- skipped +2024-09-10 06:43:30,939 WARNING Alignment ERR5069949.1261808 99 MT192765.1 12592 has no mate -- skipped +2024-09-10 06:43:30,940 WARNING Alignment ERR5069949.1258508 83 MT192765.1 12637 has no mate -- skipped +2024-09-10 06:43:30,940 WARNING Alignment ERR5069949.1261808 147 MT192765.1 12653 has no mate -- skipped +2024-09-10 06:43:30,941 WARNING Alignment ERR5069949.1372331 163 MT192765.1 13010 has no mate -- skipped +2024-09-10 06:43:30,941 WARNING Alignment ERR5069949.1372331 83 MT192765.1 13131 has no mate -- skipped +2024-09-10 06:43:30,942 WARNING Alignment ERR5069949.1552198 99 MT192765.1 13943 has no mate -- skipped +2024-09-10 06:43:30,943 WARNING Alignment ERR5069949.1561137 163 MT192765.1 13990 has no mate -- skipped +2024-09-10 06:43:30,943 WARNING Alignment ERR5069949.1552198 147 MT192765.1 14026 has no mate 
-- skipped +2024-09-10 06:43:30,944 WARNING Alignment ERR5069949.1561137 83 MT192765.1 14080 has no mate -- skipped +2024-09-10 06:43:30,947 WARNING Alignment ERR5069949.2098070 99 MT192765.1 17114 has no mate -- skipped +2024-09-10 06:43:30,947 WARNING Alignment ERR5069949.2064910 99 MT192765.1 17122 has no mate -- skipped +2024-09-10 06:43:30,947 WARNING Alignment ERR5069949.2125592 99 MT192765.1 17179 has no mate -- skipped +2024-09-10 06:43:30,947 WARNING Alignment ERR5069949.2064910 147 MT192765.1 17179 has no mate -- skipped +2024-09-10 06:43:30,948 WARNING Alignment ERR5069949.2098070 147 MT192765.1 17269 has no mate -- skipped +2024-09-10 06:43:30,948 WARNING Alignment ERR5069949.2125592 147 MT192765.1 17288 has no mate -- skipped +2024-09-10 06:43:30,948 WARNING Alignment ERR5069949.2185111 163 MT192765.1 17405 has no mate -- skipped +2024-09-10 06:43:30,949 WARNING Alignment ERR5069949.2151832 163 MT192765.1 17415 has no mate -- skipped +2024-09-10 06:43:30,949 WARNING Alignment ERR5069949.2176303 99 MT192765.1 17441 has no mate -- skipped +2024-09-10 06:43:30,949 WARNING Alignment ERR5069949.2151832 83 MT192765.1 17452 has no mate -- skipped +2024-09-10 06:43:30,949 WARNING Alignment ERR5069949.2205229 99 MT192765.1 17475 has no mate -- skipped +2024-09-10 06:43:30,950 WARNING Alignment ERR5069949.2216307 163 MT192765.1 17503 has no mate -- skipped +2024-09-10 06:43:30,950 WARNING Alignment ERR5069949.2176303 147 MT192765.1 17518 has no mate -- skipped +2024-09-10 06:43:30,950 WARNING Alignment ERR5069949.2185111 83 MT192765.1 17536 has no mate -- skipped +2024-09-10 06:43:30,951 WARNING Alignment ERR5069949.2205229 147 MT192765.1 17584 has no mate -- skipped +2024-09-10 06:43:30,951 WARNING Alignment ERR5069949.2216307 83 MT192765.1 17600 has no mate -- skipped +2024-09-10 06:43:30,952 WARNING Alignment ERR5069949.2270078 163 MT192765.1 17969 has no mate -- skipped +2024-09-10 06:43:30,953 WARNING Alignment ERR5069949.2270078 83 MT192765.1 18102 has no 
mate -- skipped +2024-09-10 06:43:30,953 WARNING Alignment ERR5069949.2328704 163 MT192765.1 18285 has no mate -- skipped +2024-09-10 06:43:30,954 WARNING Alignment ERR5069949.2342766 99 MT192765.1 18396 has no mate -- skipped +2024-09-10 06:43:30,954 WARNING Alignment ERR5069949.2328704 83 MT192765.1 18411 has no mate -- skipped +2024-09-10 06:43:30,954 WARNING Alignment ERR5069949.2361683 99 MT192765.1 18425 has no mate -- skipped +2024-09-10 06:43:30,954 WARNING Alignment ERR5069949.2342766 147 MT192765.1 18468 has no mate -- skipped +2024-09-10 06:43:30,955 WARNING Alignment ERR5069949.2361683 147 MT192765.1 18512 has no mate -- skipped +2024-09-10 06:43:30,955 WARNING Alignment ERR5069949.2415814 99 MT192765.1 18597 has no mate -- skipped +2024-09-10 06:43:30,955 WARNING Alignment ERR5069949.2385514 99 MT192765.1 18602 has no mate -- skipped +2024-09-10 06:43:30,956 WARNING Alignment ERR5069949.2417063 99 MT192765.1 18648 has no mate -- skipped +2024-09-10 06:43:30,956 WARNING Alignment ERR5069949.2388984 99 MT192765.1 18653 has no mate -- skipped +2024-09-10 06:43:30,956 WARNING Alignment ERR5069949.2385514 147 MT192765.1 18684 has no mate -- skipped +2024-09-10 06:43:30,956 WARNING Alignment ERR5069949.2388984 147 MT192765.1 18693 has no mate -- skipped +2024-09-10 06:43:30,957 WARNING Alignment ERR5069949.2431709 99 MT192765.1 18748 has no mate -- skipped +2024-09-10 06:43:30,957 WARNING Alignment ERR5069949.2415814 147 MT192765.1 18764 has no mate -- skipped +2024-09-10 06:43:30,957 WARNING Alignment ERR5069949.2417063 147 MT192765.1 18765 has no mate -- skipped +2024-09-10 06:43:30,958 WARNING Alignment ERR5069949.2431709 147 MT192765.1 18776 has no mate -- skipped +2024-09-10 06:43:30,959 WARNING Alignment ERR5069949.2668880 99 MT192765.1 23124 has no mate -- skipped +2024-09-10 06:43:30,960 WARNING Alignment ERR5069949.2674295 163 MT192765.1 23133 has no mate -- skipped +2024-09-10 06:43:30,960 WARNING Alignment ERR5069949.2668880 147 MT192765.1 23145 
has no mate -- skipped +2024-09-10 06:43:30,960 WARNING Alignment ERR5069949.2674295 83 MT192765.1 23203 has no mate -- skipped +2024-09-10 06:43:30,963 WARNING Alignment ERR5069949.2953930 99 MT192765.1 25344 has no mate -- skipped +2024-09-10 06:43:30,963 WARNING Alignment ERR5069949.2972968 163 MT192765.1 25425 has no mate -- skipped +2024-09-10 06:43:30,963 WARNING Alignment ERR5069949.2953930 147 MT192765.1 25464 has no mate -- skipped +2024-09-10 06:43:30,964 WARNING Alignment ERR5069949.2972968 83 MT192765.1 25518 has no mate -- skipped +2024-09-10 06:43:30,966 WARNING Alignment ERR5069949.3273002 163 MT192765.1 28442 has no mate -- skipped +2024-09-10 06:43:30,966 WARNING Alignment ERR5069949.3277445 99 MT192765.1 28508 has no mate -- skipped +2024-09-10 06:43:30,966 WARNING Alignment ERR5069949.3273002 83 MT192765.1 28543 has no mate -- skipped +2024-09-10 06:43:30,966 WARNING Alignment ERR5069949.3277445 147 MT192765.1 28573 has no mate -- skipped +2024-09-10 06:43:30,968 INFO Total pairs output: 56, Pairs skipped - no mates: 82, Pairs skipped - not read1 or 2: 0 +# job finished in 0 seconds at Tue Sep 10 06:43:30 2024 -- 4.44 0.25 0.00 0.00 -- 07ae7548-56e8-4772-9b48-7406710fd838 diff --git a/src/umi_tools/umi_tools_prepareforrsem/test_data/test.bam b/src/umi_tools/umi_tools_prepareforrsem/test_data/test.bam new file mode 100644 index 0000000000000000000000000000000000000000..7793c7e3635e9c6a428647be7d067194f1d9d522 GIT binary patch literal 11123 zcmV-(D~!}1iwFb&00000{{{d;LjnM70fmy^PQox4#fx{vm)Hx?@l^({R~wKZ$u_4G zZkO&4xP+~^Ry5xGYCe}?$QGBOcc=Zn{!Y$Gr?%VsxVY=k@0u*NBTVWES6tt4skknADPwV< z`eJ5>LjYrB7$bu~Xb0}k=>Xw2EkvHhWK-}q;zdu21``6QF3I-epG8_Po&!mqD520^6NV7@DeX#>$<_dF}pM*P-JgGE!-FF(?Tr{A67(_wKp=z#&5V}SPHNO>oDuliP znC$J!Zged{rgwP){h&b`-j?>%RqHM93U>o?!?_kC;4Hx~A| zb!)L;cfBm$THL$%Q8)JY@NfCV6HnZI&pij*hbJfdC(qwKIXpaG{KAckMWNe5G&*J? 
zVR4?9T^@IdR)x|!PTH=nN+n8Js#>e+Vj1CBMrBl*ziCQ8mZcev(k#nT{8?r=8D-@% zi^>cq{XcxgXjZ1~=g7V}!)W&4&ZU3%;O?cxJum$mJBuHD<)!PZdoR83Z~xejUw_N3 zFMjJ!U0S^HO_y$Z^P4X%-u9O2Rd0Im()-%2VQO}|zqEM&2!e0m`^t1*f6HPaC6}BD zreoE$Do(1rOk~`(f{RY&ji`0p=uQ`HtMamtN$ceA)md&b`cG`S|sVMeIb!WBECZy?81*I|m0R z`}+n=#kgd;$yvp;ieu3+)l|GK7}rYbF4l~9x#W2gOP$Ag(&&mMj3q+xMrhu&3DYGP z0KX`+6fmS2&5{6HM$088l$8!CI$;i)Iej(28gPs#Ib;nwB}C8u5V^Y&c`2mNUM`1Q zm1dJWw;b-J`Og66e?AS&TW7)iTo2|Tm;~^Dp9p;Yz~=Uz_E3~Cy=@yvg>7t%1K zk>E`+n5HRRK&S8`HJ@j$OO4!GMK70WMAw?D(lP{d0WhCOU=lVCOf~}Zd0XFfz`g7S z59S*euNjTKCIIvB9f^|)g$Vn4s z{hDCbmp(q7^ixKb?ct2*Je>NsPuRxmv+ z7ZsO6=0&WUTvu9FDp6GnI~4N^B(5o$-^^Oo%*bqJEeIqRqa?{B;@+zFC`DtqjDn@^ zl~u9jnF00C{e3{UJyw$210~rSDaq|yFWee{@)qa4iviI02CmS8_ep?P3mzxBin~VU zRU(;6bXh2++a|A+XmiP!&^0R(*(o?o9mlz7YOc6W0CbZ`Rm)tSS$VQ+bj`>(h3|9d zO-u~mU}E^zD;#=%_o;)4`G7<3wx=x?gzgBvz3r3zo$bY2V6Ur$Rk|T46i+&qGtsIx zk6BgLi9%e#VZUQZ3K=uqD#TK(=$K9rJuni)M7JF|#wt-442F@Bo@{BRW8}?82)$n# zO?L!=d)2G%;52&=N1;R1UvJ%IWJ41p`V@Ci+(K`)UXuPjxN~p372^5)0MFY;cs_sY z<^0v{fc2cjXuGe4EfrH$_h&1@CTxo9GuL=!hUH1ECuG+$&5;{%If?EO>2 zxOaGXa&)k`eA8k{X9O1^dGxPc7#%7m4JBEMg=+1!9c0N;F5* zu3|=4=0LoI^dVNM8rXgY%O&$T024-qLXj++2I*_S>;sXqoDKS~`pXbPq{;1Mx`x1`p(YGYZD&{dsCv=+z}+nBd`4ad;M zF}wpD1CNWC!5uVOLYQ#|v0Wv-O?Dz3)TUf&5oW0v~_RtX!29heEKY! 
zgj|?Q62d!yt`BIo;B)de!5597BhSkm(MCh~0b~hwxZgZ`9`()%s!;y~thx>U-saQ40($Q@>E1Jm!%}Wl==Pw7@{HG4hKLDC{PJQXY z_RjwC;UXi>s6k|$w3R~ehF}gq#JUBc!Rwqt;SMn}t6HI0soEB}31twb_zw{ZXrqMH z3Q)!_2%%{#M^jJT%!au~=>Wi4j%@bjDerN9_%k7l<<{Et6z4p_@dwGOP!hCXn0jId zqY3chBmoB`8U;86QVXSW0Y8)!QnXAaadKv$uZd93e84Fm>&T|mhTkTfwBfdW-yE2s zOvV$L^k81ReemWU%nxAf{{~Yo5f75|rt(q!W3?@x3i1C}G~} z4scZrh7(>1i~?+B4x=fF0OfGwttJZM0+C^9KPgWyVl*9DJD-h=l8(*jT)9j4TK?zY?by zP0*lVkmmQ2Fo^!(TF&Is2Pd<0Hf%h*SB7eL-r0GT>VNZWNHCYf(2z=5LYv6^G!5om zo?&*kj%QamJ$ThKJcD@e;%64a*gu@!yThHGo!z~~XTN8$;1U82z9(mrCmJypOg5_; z-5?T6I6^eoZK)*kfljta)wzV`U>Hf1NN`W5Vw$DH#KJkPvlH2sNV=EtU7+;#F-qT_ zqIB~JrEgDAaxySR>1U=gaP3eUeAm?IZkl(xerT=ji2RrfeT~5ZN^cKkDL^S2qf~hZ z^=ymMb&#c>b141xUqTof$P&2dz2ilLvzKqiJHQ)Oh|UX+yb?O2G-^Lh2PY1ISb`*p z4o>3&cp*crYE7qL;)uXUI22RV2veWsaIAPZR_)0G?m4VrMhT zsAmX!bb5)I@L;Yi8M(Ja7En)dd+IywFE=D2-JH|~?Z(Oz--rP!4llE19Otmi z0!Hk@2+ORGm@a?cYKD0S#=betFb|Ipw@)m;h%x{O#Y7-(qH3B_9myPrhIUeb{6M9* z?iwmmD%1zSfJ?+pEa5GFq^lTEhu7~^%zJ?dH7_%0jB}VQu<`4TYy$szZJ-y-@1904 z-Zr5Z|Jv37oEJ<_CK13FrU|A4(~#d?f=s+^!71s6M6LwUQ72$<6@(2%bY$f*93szS ztyw8S*PrQIa4s<6#(1GbjM&40gq*_Lp zRwxv=v64+**02W^!yQ3d)wU0r77NRITlG%;jNFus+Gb-3&Q6!QE4x8G%uCAHFI-_Q zfv^ixO~Uv`rsC=FBwepLj3M~9R_jEeKnTnL9^lR4d=Rq(x>%XYAkS;vwwio4v>bFW ztSWfY4sWURHV(>L+1Vi2z-IcNQ~z=r?)CY?GIPAA{pwAv|KwRTH}fA}*>8C6s&sx; zEEX@Fy7H5qlf$DOBb_|qQpGTsorY~hHV#-6%BkRwnndUDP9T+uweEc-wyk#Ps-VqUW zp>Pt!pO`vz2PX+9I3d+`yoFzfWP-Z9a(vIggJgZ=Gm+x(bvFGnbi!m~k+r9M=ZC#_ z?sv$t-h$478OkP~$mW-QNO|XwZvK!P`?nbTg+Mls_D=S;7vH273EhFss1ph*!Li3p zOIj|dN+@!$M$qycpj}K&0W=yaosoHku+n#5+4Fv~Ygmktul=!row)R~nBgB)pSf!$ z;xCzL(-z*V_OPU>Mm(9EnP4&YSc=FB69);h`JgOro2#8GYEF!bEND_?kri1zj4JAT$!4!~__*YZ^i{?roh;xOj@lB5W9A=KxmB@!l z2iis=WSXLuro3(?%(n2E>r=|-aS6cr%^)mJ5Mfn{0J|m7 z2MZ1v7v8j7(wsZ66bf`d;;{})AgDreiUKqhtOsxrLP2U|jk3GsfKEzPfJM{|(u@*# zRXO<}@1s&X_sn&D**+PYU`0M~wPmZJWv>S|>YEBiC9UBu_*c<1$XGR3Uh{tQG~;sn zY#HT$0?gY2l9k5ZA9Rs{u|7Q6vn;FNycRV=Rs^{@&th5@Jcgi1g) zA$NI~h?X)i$qNFphFMY=w=Anu$farA^qqwC2ND(i2Mm(3h#>Tw1pf(-2X5t?G_*$&Om 
z5}MBOy>l^){i6`g-R+~Jqs3*I!K(t(N`qx7qs+WAseRhF#HKDq$eiDgvME_GZciE= z%Lzgb(xn~t7`+)7J$LLU{&+oC`^OW6oaY=P^v+OwuMtA$D=}r~JUy~>VI!AEXt4pI z?;RuLoyCtlf3Rx(0PDYaB~>imH1{~g?DRQAY{|9}NdEw2>W z#`)ea8xv*=_-X%Edzoo^H{R9JROY`mz8ItAjg7bh%8v!h3hsR>DdeXf{qS9@SmK;F zZ|~{B9LyU5{K8NnEts~2VKZon1x-276fj5ds`93-i7l2aPf(|;62@z9kW3)qx)!og zuX>u5V?Ua@ws%`i$yUT}o@&pc-p#DYXPCiy)&1CM&LWz17I(hDqdBm3ImYe}(A?SE z-8-<(f)yyS)Kb&{86> zfx0RXE9W&-IBp?xNXU32st!`C;)3xO75Jox3GfGsU_Ws-r({mU4B)HdKQmw1 z=<^ecLpoZIF>QO>L=_K%X*o*&C+k0gtE15edinh7X?U_(JYPE_pM>$Bh4MMzc`gZT zU}@Lm|Ik{`y~4$%HcpjhZ9_%hM>Uu>LEtfEbvQ&ziL{LGBHB!L5CMSiO5zE1mB(SGEk#JmB*-_&& zxDBT<|NFwQ=CSn)VIG^ze&2fH@$5eoYUfqV{t8U`*$=NxH2ZYJ?5`Wo{-sWrj(3K8 z|2{YSdocDNf_uNae|UhR;YI3Zp$0$Rq2#P(&6}bF`5|NYP1e*HrgR9F)e)Uk!SwjUfgs5ddP0Wtm5_U%hB&nD!H0@p@0nF$xczU_N@9S-NS~%>T39 zRkvATEDrG8+dbYnwq~9+9APdblP5_F#T2SVBm&stM3x;zCynIDM5+X2RKYX~g;Jw} zEn13E5G5<*bBOz5lnqU|i_|_fHrLiuOe!Bap-Qu58VwQiw2Cu0#8En%&7oADGB`SY zR=0WEES}Zws@o)t|9_zBBRty}#cP>^6sh98fJFnnhaW)*OKR-{Pt01bd5rQlODfG` zh7+=CBuX+G^#{mioukaviul4<**$FQm^k;-Y-BNjF{FIi$UkJG5mTge(YUK6q^aX{ z?C)>|c)q|d!R_&U<3cTlF%{yuw{v`avbcCN@qIBBQ5hl%1RJ#=jb{SXL5F;e2vtE& zLjhQ7qO7Q)MY3D-$ajX2-MpbT zc=Hr8N9v6sdsnE^*A}uVAlYOxn~rF(nNfKbi#ey@UTdET$lkdLvg;?1d9D50j1~Od z-PP1~Z-4v1t;9K#eFK0J+*Q+5d5kg-oO#PAth_fVs+Xu71n;1@HP{wD**ICalf zU-dNSsbH-ut)4RK@%$&}^C~{z`Q4Ef^Z^fH945C8PYQU>>@!ZqbaTKHZJc(w@fgK2 z6P6(jE*q_MaK{{!n**L7a8P!8gJ?eDpu8VruLwkQe`oi2?_{xUdi3KGL={5_iDZmu zzP14^ib8d#kf4#*4ex_at_^IKa1w()nVRbUf3TTvp1P{W9+v=G^y{}20Dng z3(+><>vJOb5VHWZvQuRXIshO^1x<$@bCB3&$%iR= zt1!^XHeEJ0rqRd(N1j`a9RHp636GQgNty#ZfAutznarBZyL&tz=}jhKd=%=r#gj~? 
z0$b8(&?Sh!Td2Ay>MAZkN9Ao^qgoQP4s~J{1M#|1G6Bs}w6toyt(zPIEl_Yr%G2`L z&LYi9ANfRX;!9f%h^Cb_N4KtN^pD&Xwt{!=_64(>(9($JubyTyZ=IM-ujt=8@Bz?3 zRR2E4?hE(8**iXQI|>nrCleIcm02|hIP)APv0}(>x|o~A|2U;kmhTv3w}DCEq}~aG zoK{lmxCV|j3}+^I1}mZIXGUB`Q371e5SoilQcommp@Kp0iSfW6 z>N>5Ewc7-7Y3DQNV0x7v*1g!FrM)=n#k61W7>$K4Ez_(gLu@_I7WDf8+n;5_T{1WH zfe-H7yWBc$ZPRkHw#hd_K6lSa)m9uu+pHkoy`%X6;zey)S_IWHVq802CYd-oF|oF{vSM~e%{ z9UJo6njtwXSc@nV7#666GB|FOlnbt**s5t+BNC!8px!yJ;Oi3#P7w(_WVb%++Y-!< zb8fNAxZfsnJ<+xgsn^%Y>8tfo&K!hI4l74CilQ>Np61SRW7oHUR{4P^UNqCq@U010 z&N7a{l8aE2uO(P3$<3-7pB87%8^>(&Eg0>OaPiR$Sl>Ga>(xGwxyw4`>qjZ}yD;{K zP_mAW!AzJX+cB-;q79G)io{4q5>(q|46Ovk*ybg|aN0#C)<_g%su0AC>AHlj^0uhK zo9A&%D5^wtAlTq^5`MAwi&nJKY%LC}KDPI~lX2&j*CsTl&>y`B9s$*dH|=>b4t9n; zF9^extOjEr3Wd<(Y8KHH@wI|r!Xf0rigpm zIu^8a2I+T_pl;ks+Rm&IQWn4!0nkG$xcFIR&S&B}76r3BH+Tn*=#DwT9HRK07XxmX>aV38Jk+Xx)<3 zRJ4gXNR1p6X2B`b0f!-y%}mghBC_tsNlD+g)29x@)UAt-%zC*`U0J*4Z`Wi)Up`4} z{!6`**z9ia?Hz90ekPRUbyLV1>MG(wi@K|!^e8vf8e_aJA-+&}sZ_!mip`}~1vDGj zO4`|z$D&5y3G#^0Ox^Nsn|!Bcr>vz*RHC7?*m&iM-Al7RtsY17Zod!+mfe z&Y75T@JpjMe`>qSB4-K_+T%ht_#t$CSsCzhdm6J*9y#~%U>N&@rb7Qh?$lj`)HtjZi?ACrdXKYF||{4Q%l_n zKu-olo+McMZ2;ZY!S=3}k?kN{8jiu}#mX!{yng%65Br4b*X|xXptmvG82j?$(^>BB z?``kecmiVHao)j$#i*CFyv9onON=UUr6s(fp#4me9EQEgrO-8JV4_hyjs^9DC)z|7 zNvA93oA}H&o@La#FSm|v6s|cSERC7_|{1N$G3RB>)$tCjQV>gcRB6 zYq>@^=gzuqFeenX?a%IYfVBOw)9{REBeb9H@f-*g!uSP&RdaZnu3qY;sRX#Cgu|Vo zaD*~k)5^RP!_62NCBb`wE+0Tev+B@R%2Aw??rjd zqq_csT}<)VY3lxz*$C~uuNb7$qxpvz`%buj@9vK8krpI}u{I6T)O?P_3~?>^Jk~(u zbY-?7YZMr$9DE)}Fd<{mc@Rr#r&NHMQ$*2flZ{yy;J2r7&cybly1sie&E=RyiL4)q zCVRZtGuD^(Xs3NYL*#Z)Ht%s;+b>uUM5$LiomTE9n7ENMfryq^*FZfyAM74BAeaZSh#HV}bm(rS@}$+FdP^GN)Z? zi-a6h3u8cO7}rhPHnG|9!tNAnw~}@%0;yXG(S%tYv2@r_s!EX+MH!i22H|!OO6|{? 
zi@v-s&s>;dv9gR4&(yE zMb{=18$KbBQ@0R*Za;*yHzQ_Kugu(!Xufm5+`thGjLVl_y(*Q2?frpNIvhzvQbE%OhfIm@9oKyQ8Z94>p8Ku!e5(8)i7fEdmUxoAmM1C$gE)uBJ>n0oO?2|mH zuacC#>W}frR{hT!XwQ3$*Fik{>{*yc^H=U$jW@gdR_dO+hX>ooN46>tr^0vz00k}b zq9RkmJ5DB(bQqR0Cq}jI@}`FGso>X94W(y>;Ei#PY7>lSBWh6obvep^X884iY6_3> z(%1lx@@fu`<|j|{KTn(Gue$qI!%af?w?ZAaXqxs|M3hAhzXEYX4T%=fRj3{n5KLlN zxByj%*Q%o>4?5QhK+4Z zMn@MW(a}pBo?ri)1LNYi%%ZWk2GV)7x4n0~yZDOPFAKqU+fuBPgPzJ8DOD+G2jNQK zFVg~vU)dnIM@lG?j?t1RMSK+kcG%b&wNK=WhB5MFi*Z1iFLtK3%;9u?9>9>|Pd!Fw z)c>si950dEK=9|s2tI2=`S@8=1f4{V5&XG8BA)~V*9hYoRy<{+B{yq&nrwh-?u6Or z*`=noJ!-k^H+!+=K@*|>d?sq!Fib_u@;9V!mx&faDstV&dFQE1{anh*96VeEPI%G>a?&$eS0-#>7)w#%% znC0ME&Dt5O!`T!SUxD+jou0%PKWS;Y{CdM44xa09SNkVTy>Prndyc1i%7mx#Z8qQO z#qnSCZ8ik(`&Pu!fJtpOS0#i>NFgXhZB*Kq@*_SHoLi5KFLG|OM1i?g`;zu2Z#-FDf$&ne?GE}Zy1ezXjLwckM_)( zo^vBvp6$p^8(*_{)>^h_gpaui*7V#kx1#CKM|N#I`!}o#(iPACNvU5FEL=60cF17}&RAaLh-?w##)#HfPV@|m=d}g1Z6XD@AR}4PK&SEP%&IxZt9L89XKbNS@3;wEN9>0Wa>9g zYYrO6Z&@D=FJ8M^FC(nK6iSuF)$#`=_${gGg!atIb4q7vhiF8;9JHZgU>6v`Em%cY zp!P@)v*8+pc@fj)L?Y5I3j9p+cJ-S~&&15mta&>&`7P$q+`J{?O%Bbc|E(e8^$!}M zu?IplkB<%x4$YHBY9)x5E@@wLB3NYH6gsA5{#1+8tFoS{-KeR_h#uFYA;NW4GP*Q-QgQpPhgG`&&w=W_gQ<@I@R zusU`c5?f;={1dkBd)*)t{_$iq_HZB*$9ubnM~CM5WNjYjjM{0TRV*)sqBU3o#lM_P z6}aUNSrbkv&I{PAR;ZY$xkgkdL^IGMjjp<(^KLSg(=bTOsL$}%r0}Db7B}D?7d2u) zJT-5PAh1)R&%b>GTRc3U%%Hbz8$9TVX2I-tUU+lsZHqJI6=>6=gIx6-6g;B`o-eb5WQn0=lk}7KeYY;)CN?r zp?xhCD`31qWYP-3m^yb_(ZJS&1V&0{e%$sn zVbXHcw%)gLI4)w^PBWSKCJ?icmXXzZ{f1|hjr##U@7JF_IFbs#{*1;R38i9pcXw~c ztUudu?;#~I?d~YR-Il7Qr@;V*TqFgj;Gz^Aim)v%QUWpUxvQ<`Vq@0@VAKC`{2S1V zx!%G%HnghWu{A-`$l-bj>%WJa`uCQ;w?=rLG*(I9?R zbxynUCZKPW^s64%Ky$T+iRE7!+NO`Jf7r!GCbQq8**`Lw{l5w2=;~%)K52))`!xG} z!|bmc&;AatMSo{to!-3|#$+)2o!#A&qwU2v=~*9nQo+yDGn6_cFtyA(=v9rP9H)NW zrmA(B*PvhWMu8}8k=@ihPC8uJa8+MG@pkEYEr!j4GT)IkYIku_r(dyPwsPx3_Hn3h z^Icb-3$lW2Bx7zl$EAHB^^P%8kFV!w9xum8x#;L0jF5VFpd}8euUtJ+PGOd#nUpiH zM`Ukbo$Wy_%kicOhPJdHOJ(=*Q{T5TKJN(i>AP#Uy1MjTA4UGJ2UaVP zAH`S|O4RZGp{wVj_?6J=^Csb_EV7d23T!WIFwAisqnMkE5)4%fykr+kP%ka&YNkg# 
zq32YvgcZn~HL7Md6r#k2C|T+WSlTC&xeXYp`5^5V45niU({kud7_Es3V$w77UO|_^ zCr*nZZ=S8^e)NIWn4A#)Z=uu;XaXad04Az-OD44dDMgP-sTnRTrkA}>Hs#Is@oKLvYae(jk6<|{PWPAiLx6PJ8`)`*rM~&yD-0Z z{MYY$@l0^}uI1J+mD}#H!t%D!l)oMbi<{m>W1>{t(q0BygDK9tinI@ZOiL-m1+zwe zRubb-l*p4CaGO;jib}igC^r#vV@#2RnuwRS&ScKpCkokHow>&gndc0-&B4tpb9HK; zSQOb!Ps@Hgdfzz}Ekj^#tnR(^F#vP&qAxbJ(YnI72H z^#yD(5NW{4bsZ4lNd8ewP)Sa!BRVGX28LbasPfS>S?Tfcw3joZC%n`s{{dC-lgVcYWLvu4z@E19ArW&A#001A02m}BC000301^_}s0stET0{{R300000 F000^eh;IM@ literal 0 HcmV?d00001 diff --git a/src/umi_tools/umi_tools_prepareforrsem/test_data/test.sam b/src/umi_tools/umi_tools_prepareforrsem/test_data/test.sam new file mode 100644 index 00000000..6465827d --- /dev/null +++ b/src/umi_tools/umi_tools_prepareforrsem/test_data/test.sam @@ -0,0 +1,119 @@ +@HD VN:1.6 SO:coordinate +@SQ SN:MT192765.1 LN:29829 +@RG ID:1 LB:lib1 PL:ILLUMINA SM:test PU:barcode1 +@PG ID:minimap2 PN:minimap2 VN:2.17-r941 CL:minimap2 -ax sr tests/data/fasta/sarscov2/GCA_011545545.1_ASM1154554v1_genomic.fna tests/data/fastq/dna/sarscov2_1.fastq.gz tests/data/fastq/dna/sarscov2_2.fastq.gz +@PG ID:samtools PN:samtools PP:minimap2 VN:1.11 CL:samtools view -Sb sarscov2_aln.sam +@PG ID:samtools.1 PN:samtools PP:samtools VN:1.11 CL:samtools sort -o sarscov2_paired_aln.sorted.bam sarscov2_paired_aln.bam +@PG ID:samtools.2 PN:samtools PP:samtools.1 VN:1.20 CL:samtools view -h test_data/test_dedup.bam +ERR5069949.29668 83 MT192765.1 267 60 89M = 121 -235 CCTTGTCCCTGGTTACAACTAGAAACCACACGTCCAACTCAGTTTGCCTGTTTTACAGGTTCGCGACGTGCTCGTACGTGGCTTTGGAG E////6/E/EE/EE/<&k`Z3G%vZz z`gZt$L!Wt1VA1i&B!FF?QR|jkefx2a7c&Ofov>{l0}foaaNVW1@Esg_Ol4BxV98=$ zB_hqBZwK{*A}{lT?StqS(p4A@nWQ9i-~~Lmt^zt&mwIWPk|&aln`5a z2-ND}S2!S0=Q_=`eV)B(dKhaBm~TrWd2vo9aL#RX7gvf(X}V~!JDyirdZdd=Whsey z(K(Jg%;_a_0!Mo~=!o@PcNojD4(ABzYC000000RIL6LPG)o{71EY3$!d*c~+l!F*6L6TQ$|ghC)(RR>sZ9 z8qWIx1Kz4z14V>5nZYiIG1Uf21QWVN7_FN z5?aSA34KY^LSGjI^h8}6(8c;Wy+q6O6I~zRvIH+DiTX`+b&9pL2$t$QJ)^%Xp|5z3 
zC3-tn@C4VsY3I^Ee$(!y#l0_h>CWOCUVLe*yzjnz>(9LQTYuv1zxJH({Ps(W*S!AH zliu)#ON;M%V|mZ(--Op5b8ypG-naPX#enbfD_{A_1j~0`zgXk|h1W6)c^pPrR;#I_}cQmOYg@0UVGEc&o%dZ!*IWs-u*%&I_))!MWk-`HNEIOhPAJZMQ3Mc z|M+NI$xaxTRoB!xiy@>=b0osgpI*+>B!DAxBj>R^L;&-4=#Shfcc(l7mNG0#$fIrotzvm-Uu)m595}} zl8ZWz%1YK5FIz6Uu#_^3!;sfu#>0rkSyOj1%9}E3S=Hdmuqf*~6B#c!YYG--7gF^o zmIgp2Wa&~rR}!UEO{=Q`-Rf0+v`NjB7Ip}Q8faV zdN8Dgz41?aH-70uH5o_#wwIN9GveAq(c?jtp5a3E<~? z3ORt;g$%&d85{vz1avIy>XJ1%K2;6_-@C99h_lm+7m zOmf@+8JLtJ0176S!4k$sn`|XGctKmh-WO=K(gGQnPeK_pKw{9e_y-nv`rXUr@S!AQ z)m#prl9kJ!*%WBpJPI_dvv~S-gXgu*;E6<#gx@g7a^=a9#sApQ@e3{dXBSZ*ky!>elBBbT4$^e3`TMGEdPDPY!nW z77xL`dCFRbVI|=cr08lXG--2m9tdjv+FaMv&MjoZ(zHf}#Vy60`} zAl-ZZF;5@Qd(W%?*;j;wRh$&}{IWIoe64%W|H)Ai8x&NFL6v^j^J7|VQo6dS5KT2A ztTNseWUnd8DkR2G?$;7FJIlW5P6jR4;-v{je zqVJB_8h6CDPPPW5q8;sy-o6+R{Vp$6)9C#a(94B@aEH8TqBv||4w0X6B)MG`v#M)Z z7)G*#A8HscncyPJqpr?mDMg7ev20*hBX)uO4`m!hs?*O@xz<#>lSCQ7Brt}~q`y5# zfs2ANl!Zu7v~a=^Q7>qQ2<@3>{^er|ozV)5*;``Oz*+27e# z?x&2AmDD6h9gEvA3whHu@H=IJ)D9E^L!cB1vMy05n+~~en75&fK@@}pWh|SPTvZu& zI#)3X$x$e8%{A7hN{`X_MWbmZeNHPg!6&%PlEJB^{P*hvAGAAxo$k>ONG!x40fq7pn*4jTV{2pkXWD_i#SAc->-Hdz}a8;o-^A!Qz1% z77G$UqIohNA%?F+RU%U85M4G^7R69-#=;!#1$RfJLXpEj`C8;|HL4r5`bU zVvNL1j>G-L2#Dusm*E2Op0UfgQXo!yPs`b>lC(%qfrWJdn}E2^1;VCsFTXG~SEO`H z4+yPbid#AD&3B@#VjnhH7 z2B*z0SSTOelox-^C@;3Y^qZoowR!|=PaXTx{lkO(1D(o6oT4?6RSgTfMzo03cq<`m zSq|%IgtbULFn9`n>LO^W( zZv6==H?Ca@tpD{}XCwZsFMTAM3MXOwTVr2p@Fd~v(J0{?83`R@VEkgia*;ZXfAx-tXeEuu8cBXelg^`)2tTjO2I)c}OX2^ko!K2L^#5l7l|#D@G7Ze>(FAveF${+1!lglV=&l--)NPdFOJm zcz7I`9BhN*Kh#lk6vyBR5zB>D$C2!I9pbiD#<}24DByKOjiflHI1s8GQHC7J1_T+a zGRlHmq{m^|@GiV6vN`h$HvC#cNG3X`Q8D_!bF0wimo|YJjljILH2`Nt^RF)kfbSUl zbOUBeb>P!NRiJhUmFAsNeoFL^iXg;}@gYFVBa>~=?JXUxOza)^Y8lw2v%`DpkIH)U z35>UB4H3r6i)5G|h#osV6ejaPT=*i^X?#a=~;vT@5K{+!HH{FH64XheG3A7cJ zVd#7(t;9`POY=dI z1<|RY$?Y_+FDgJy9=DYeZlZ18O({C-96lNA3%9&9J$}eOL z(GYaEY^a*mh>ko?gm9CC?J7}-sTe3lDWKvUbPj^2S}>8v_zf9nVRS(e{Rw%QK$T(= zgNe>o0v*q5^-j!<2YNk8hakQWiUJ2=k5JaKzH}ndw3rirtLGotloY#qerK=eB!|{q 
zT0K|t>cOdHll1(&*da?obs56UQ7h~+Q5F|dTP@L@2e!>+X{vO_bgJdTEUB`el{70^ z=}f`Y0TRF1?XOrm*O*=zTLm*%etMIhUzpYNi?>bNNPYxs|HISsqrIbT?Z=zwRKPJ0 zxTeC@5a%Xu;}G8|DN_ITWpTA0!TjgZ1w`bJ;;KbXOU35fiJv!%iURE$4-Xxe$8VTkj2DiW8WBK6X#m?G` zeZf53J3d}~4CSKJfq2;pon@dM)=tY}|MRl2mnbGW&`^$0!TmkYXK{XXe0xq2`O=S$ z?rv9`F!%2Mxi2Yay*t~SmyAA%Wd0NCzmkO+2P#Alq^G#ME%)wnqX+jqW%%s=(qdTq zOXJV(aA#*{cW?2rCoC4AaX~rdaTW>|OJq{W7C;5d8u?btkcc5=5a6dWh5!Q8I}^|x zEQ846vH47$kI?&+W*^DwrIlLIyET)#DOCbWZ}w68(io+{LFr2)l(w}DxG4RVu?$>2 zl)RfslR>y=?snFirsuH-))MRjr8loZ=|&eNYg2#m4-K-^+tjBo7LR+fghKh=v9hV) z+Q7Xb&Sa7aq)1`e6>*IG2uv`c7DpKiK`-$#k3rputZh&i>wqOLMc$q>@ORL&j+vld zJK~@(RvFK&fpDrSh0T@%!>6gY=+aDT>xH(CnDm#cdKQ`J#4sL8g;UjfL4T5Ex(rNX z5!rPh`RhJCmwLq+*V(6A<<@{3VR;QC?uQlwzSnpXr?IT@>H|089j%9pLNXoDfs2G+=*|QLwQiG~C zs*5p4&;@GVO*7A^h?6j=UkK(dR9{_MZbDEw*Abjw-W1FyqhQ_!pbvcQl$IliKlk`p zCN-QY(HtQXc3n$i$6)YUfog?BMp?BrwWM{jL=Fmr7(y0VgCFTC)VMPQQEeA7KRF!T zeNwojWaaAc|B~h9s9|IUsLV_tpEu_U;s;r5;-jwuo zH@~K0;iOEded7J{RM#;~Z9{*x_SODoaGV#;$Qaw|5Fuwp(`q^){13;XY0xB14?)0J zc?bBAYa+=6$zD<6tVM}~qeKKc1PnU>j$=YPk`Pc1h&xedMFkgChAdVoEP=lWvyeFo3J`bRXL*E-dK5WB%Pwl*$2sa4W!80!R}tqW{7@ysoQG-$gW#DgMBEC7 zx`yN+m8cdBdz)Q254q%au^7Xcx*So^zoI#nK(}m=BU8@{<=IpZnrcZ`n_-kwSJVMH zDk%Lg%zE_|MYpuKpcYn__PmZS7PpO+{`hG7WLIaMXNFToEo+;w;bF_DT8G@fri{O_ zk+WLB{IGy^KEG z%jiJW5x`F#he$?77h76J-$-Fn4hDqwf2_i)g%@fXA!|fTq64wlbqMEU{EzrEit`XA zqJ)iz#bt_>s9SoO^o=@+3J?)KB|0pm=5UQ(YJ;6rg-VHwJ>FpGyg+#`-PlI(>&x5^ zkHRgVI06;tmIN=kf=&&p=ySujxiFFz9gV~993Qs=>K$XVuIyFZ`2zy z-#~}lnknwP#-X z@eCwqV{?77ceKB|ySV3iq-WPGR6n#t;@VUKwdfp*p&B2c6o`J2oWXK8$YEEG^5BoXDl>O|cz?aR|s|Ywh@Fjp>FjhVTrrHj9 zZtUVLL%s%ESW{;O6WK)vGILAIaS)K2=pilE zn1CtKDbg~vk0kB4Vks=u7D*FqjigdPg1~C{5b6!mPiA1b21qNgtNNnW`-Z^a`QvvC zy8cFk=P%x&t-p)s(ca14_G0k_(shKnsQ-Z`K&2Tjlgm7$e3$lJWei+_$Y{$z1KDjU zwJmuITNzKH+w(nw73=8`tm(f6GWJ@ZN`7=@wdB&HpK|WrmJ@7vMJT<~Q&^1%g;$Ym zQH!f$qVb}zjp~MSeoj5>JVOB=vXenmFD8MBnQY&^39e`-joAwy1kPs2U?m7viC()Q 
zymBiFKkyX0fwBL{U$c0NxxsJwql?A=^FH$9!~K0&Bz)xK`vC3Jk=Wv5oJ=jJjc@xY5`lrq+ozfhz`Z`SW>rs0ldmAT@L-eiP;=&wzisX6qS_(eT#}} z%N%xp_P_@HhK|u_fx!mpXWrC-sW-K9_8&L-zT0No1phO@ywxj+)7pEy?TFhu`-dlc zx+2aQt9S*j7VK^XWWg+=D8ulj6aWh-!(P+YkdnjpK<=_O<`k|)LQP!ADkx!xbzKoR z7KZ{+o3H4USMLcnNsVP00|%6JB&hZmnVB1&^qS!Sf$H2&^w18ouO~4+kMx|DfTlcK zXWIzP?XhUm+UvZHjt-jjRYcPaAW?Uy_%z&_y4zH3Z&i*gHP|i#BVV>~le&GX%Otf@ z?l``_qTnj~*)r~kM*+o;Np80!`k=wvbno`bab?M z0A}!tz&u-$ke=~nN00q2VuaBfd`I~Cl^y+;K0iXpI6@bp@Ac*FDj}r)u_~BaQYJ~y zY}#WwFcL%2+E%I{9oG-t&2!FU>AqxzZmE~86cIwtSc8!3Nk9JrgHZF#!M?rAAoNMB z-M;0?(Ba|k!O3D8U!}dY$^bDElry!B%;tp0zMELT*ip0a)y#&Q&+8I zVM|2?nf8u#UKnOmBMXELF~?hPj28(~*CC{VQO=|%Y5T9KTNTth2d8}Y?Q7}-xA$@V z-+ShujyA6UFxLK=ua1YiDB`QYejPeI+E_^)9=6_puO0`WBG4E$myB77<$8{k|n zmu7EoFa#b0*=17Bz)X?R0+$Xln)G64MlR@n07KQ961NJW6Fpo(H|v1wP0YUm*U^M2 zs*3~7AK&D9FU(f2KHQ@@B=ID%-{70VK{~G(L*B+6-VSSex#uFRqKz%SBQP`hdFox3 zx{WRF5k#xRzYcP_xdRaUf(?=fn@l=LWQM^|Kp~O|Kj4Nn-ClpcU z)}WM(E$s&OH0_HbCcaS&o7>r=1vXe?V@DrAy85Lto~I}W#oQZ(; z>V8%m8L}uMik#8TVM$kUQ3GxeCrWj-vsUxmw5%dPRFtqCX=f29jkHa-dg@kc!gDi` zCLJ{krhRxG1jC>J^D&G){m@=y#Pi2DmHuxSmHurK_^0k1{JO0IejnEUy>C2^kB^QI zbQ!RTTN$-&jPy688bmBgmZ8i-rIi|~a)X?inobcCwU9(=qM&Wqh_bto_PHaq29E&F z*<{RmvY)Q2B*rtF9_U^WgQ9m@lotZLL^t>ZW=xI!qf!%%WzxS@U#42wRn~0!tJ)rUp}Q7F!QmtU>DxXpr2B zoE9ot6?N>X4HBsT$Dx{FrqMCQJS7^=IHM$WOsTqMjaf2M&-IjyF&6VPVf?NPE1DnN zwZH3+D%{)MJJ{2nD2nSW;sT(sIEQ3{!fA6vOrVvsis%|%wKQ%4 z)ew{in~non>ZPDr6%w*3fn>v};b+)pyPBI?|CkN368BZ_3N@9m=c&Ed%%(rW^WB>a z=W^C?-tnxd;UtV-<&W3Xc&a_%ISghcM8&hRk+IBq3q@xYWUk{;$Fmq^GFDb7NCI1i zRH*Gju%T)ncn6?cNN?9wB5Wt}l8iif9~7lyACh|CXI?t(!(*7ObiR9&bY4FzoiCq} z&JRCpLM`s>9PAwJ>C)|))&Uqtg36&Z1q3D`fyk>8+T`a zZ(?4hpAmQ7vN7(wbp+^_40ylL1Nxf9tDUu1`wD-2vU{+*NNJV`3$skX7K)k=aLzagb(2ZeGSu798F2Kb)vgZ*4+Iy?%mrcd3$FXCBGhP@Ap1?q{PSDdOHDXgfgbt38<3s5Or5p zL#tRPA~?^mt}?hv-ee(u4h3AOj9d5v1bJmvf*V74m)FQzqA*gn-Xs?)Xw`$7EN8E& zgym8%1$w+zpu=M`2TNT*om{3*KClBE&5&T#XsBlVs{NjcsmdtbQT&1$kCz(GKe3Y= zzZXzGYh9Fl{U}PdRN}D%=DQ7;_hRjz`e5!K9_$@2F4BHC0+OurSM`OY*5AxHY|$2~j;rzsmaN-+I(9!g%J 
z9Lhvif;a#~O^&EI5=9iFI^1=w>x(+v2u!%|6pSo~ zV4Cafji$30tiz``#XCm7)sQu}oRO@t^S55_25-ka+nuW!_@?O?*8}s{`2E2dDTlFx0+h+ zPQ$UkxjPw*fZXeN=U9igxJMMuGt^JYD9hm}QC`Q7S(Xdh1|7yowU}fP2&ynHWf+Bc zL4eglY$j_EfibVLtce;Pjk?!ukm-f4zNXGbtbwq@V9>nrz zrwVUxolO`1=I&He3FF8UEC|3Vb1Mr6eOH1Umm3D{L8T1i_e!4Obv zhZGkYoP{NACgpKg*N{`hL;^9OifUHf>N?}5K~CJ}jSNxpYBl|b&lgFPR{MC8_Ivwtt){CPnPlH&NFBG-Rf@VM{?FutWW%tQtY+YxTQ2LasaSzaIj+hZIB^!E2!(7L#ffpnN1MRyN`*J zTK}+-^w!q-*!x%QO-*NuwSVF1`u@)D@!p9&+2o2)M7^^FlOKauO;0^>$0D*^d&Ssx zn7&UV)2S&sr-f}_w1lXh+ifj`63_RaZM4?&w+7^_c-nR*Lik0Vx*IgN)gZ#CKshc1 z6#)vLikVFf@)O15wgatSNCey9nZar?3ecJCx(JkhEkIttnTs509`b3yF6s=~rf`Xx z;;oC2w%M!b2B8il&-tXHK~DPhXY&l3`|`*>8T#`X^jXT;Q67mWz5#6-`(T+uHL5;#z($n zYCOrOcc^;Q5@iqMGYy23=T#ZyC~IU*R&}h3!nTpf4}o}9cOpi)ByVU!e^b>NOaqSv zux%R_>AOf&DaVMPzVW5bgM)D=t0~*97!wqjD|E-~nfF7elkS#EG(WV-r{6s5(|_w= znsa{g4#PiIn-=UR9nQ* z&mZ-BMQ-D~2(uAeR+Pr`|!S=Gz5kw^6dR z^Qg%^1xr(>>oqw8GIO|w1jZ#kUq z82BjGerIx0+3wDXMw2m76hagSOvdV* zvbII7IYh+SF~O&`s}9mJfv zLPNfXCX#TJGhr_2oCskxi+5fddHwZ&-bkI+)t)_^X9FV7*6AlT|yf|x|cewJ;q z)9aGiIDY=u4TXCrMsS;abh9!QaVE2h=B63AVBXOi&W|2W%L*i}w@uD{Q=+Lll+Otr zY7s(a>c|rg@s4;G=@U(qf~!r4`lQsQZaYv!u9`2D^h}7^d78F7{6fQ}xpnu2x zFpR4I<|d#1HM6lh38LS5Ljrrj$5RWZirvo$T2{l*(tx8T%R1_;N92z5GnWz5yEn}l zm3j#d-?A=1)9IUN!6b;YdCK_nHFO|fVD%9L^|5uKRDKtOG<={ zsA*Apo;9GwBNSaw$ddFkHH3zF0p}!9geiF|XmAm48Xh8Gq9Pgoqv1Ox=wr>KFAV@Q zcC(L#hEb7vX{j#`dVkap%@}QhPt>`NCXUn%in=qmU(lrt%@(K%hMruq+&H;pIns4Y z$$j8t+OtRqf7DCV4VucSXEhX?cuJJ=K~k0+m5PS?zoQJO!U@@gr_J&6f72ZxGGYr#}3h z^iFi&|8jef(ejLkFFeD*`E}18{E%1-YhvLG=gHpT(c%I~;hNGN3FiZnw?RS)40Az! 
z1K>TS%$Y=B)YVN`^O$NqFbo+hktD^Gha!cV^FC9_+~niekQ`G#?Oo-MvZNiTUj$mk zk9_4hGuy7dI0DOTyK=#L+gE(qyP9APW>1BbL1Bk-+L~%=yol+Tb?OKDy>`4^VnTR- zW7aH+sblyEtUK0#wdI0k&E8w@njwI&_FiAIj*d}s&?E2ih-#!zH5yb&vnpf*HoeIT zkU-=;Xs`#5I&#w?9q9`-Wldg%bS_Fn0CsWLq6k^eot<#DjLxHb3U`;JsTB5mKN9=h zDOgy8H+U?)@%`;*)6=-K(F!m z3~~7BA1jUDs0Y-JQArGji5Z`)XSDtN+4e!(K>SyFGzTdqgbzI_HE62MLk*o9SW>$k zox(_`SR*Wlmj)k+WRQ~X7-EVnys08iqe8)3#&O0Wy>td<&4tJzsz}E#Pj0%eUOD}i z>0sW>rS9G>AHcKjzVaScI)8psAb!gz5Vu+4zpMu7w4viKRue{OcYAN|a9i(oA^JQ- zR87s>vd%>X<>gT>dDT`h0YYSz#2Tv#AP@ByT^ENnksLyHIrJPtE9hiqI~I6!|6?$c zx##{{hTc8%8-{lcy6=n#q0X2zFrQbD^aosqB-#BZ@#oxY;Sp@N#{&$nuNR zBJn3}ooz!wKibOy)arTyK!4KL^~3#>oo%J-rv@`y3np?(I}64V5GKdd16+#n220(q zX0_C)xRy_~^-{I#((8d9+Nd>KouW=+Z(vLZLWG{D`rH;Z3RX{l#2TdY2RDrXcO6mM z>pB5v!L+(AvGxTI%$?nXgA?u6SyPrYz&d11f~Eio$(op4jL5sJ=s4KRk_H1~y`^3@ zI)+M=a86xI8!bEFo8gURSwv@a>%*nV#w$iY_7du@j$NPL#A%82|9LjD^2sim%5uJI z@#e)KuJ`(u(}?C16q?N$(2NE&FRL^816|Qc^ia#xZerCBC0V+>l@23cGzV8K)yTud zUY+Q4eQ70CpWCC4II|{Ik2$;@;QXLFvFTx8^Wx<@=EfX8Yz6Jjy`T+ep^$0(*yA2F z+6=10P}a&$OjU#i>nJmXC^B>icS6bw5r-pmMyWIY^Si z+19Hz;s?5*qFqa(wN@$n(&W)5anUCy4Kk#A^3y?_T$4NVu$y~lY~cE_vv#z5n6S#}{PNml0+jf0(p%yqytkC}iF zhRRHeOz*bUo0!e`P?~b0Bay)DQFR{4#u~S;up6y`^~Mtoy|6|@FZBHZf6xQ>VBa5b zxn3-u;(6npnhZ@t2UJ68L}dSD2hnLbkJEGW^pJ4UdtLdj0l1l zMncpr2(wP+Z5?tmWYY{AwT^Vy0Z!kRW(=4*03y)yifB}?9$chnLj-!}iAtFhJwPlm zN~&l3nUOQL8(PhcNc8rP)c9lgW&O7T$uIhc8%2cV_ka8qcg^hLje5`VgnF6~J`-#I z%J&=xJI8z0b8G@J2Z(Ylh;3cSCigc~1slcY4A)vg8@<)}+su*2iF1hxbyPz5Gw$A* zGwg}3r%~b?u=X21nn#Cw+j>v*l>x|MG@CApI+)85)3}NmOuc3Ig86Ny&UJ|`Wd-(P zOUb1}4|U?aMPCFw|I7J?Nj$U1bI4&xV(<11xi&ssk0W=Q7GlMkcY1Vfob2 zCL!E&lgANjT3(cqY>Em-l2{Ff zoUJlrXB1yDXkP0?i9dz4Ydx#EyM3^`d$M@vjha;Z7zy^dg;Lhw)L5t{qSFB#r7NU? 
zi-dW$l7jFzh>yN43p@3C^6s9tOJ|F_dTs7>j;F6_yLkH6uNwqk=;-`C&f2}c&L2{D z>SA#{?b|;clqyTGai0!MObC+(&XA_1)$E|3-mM3ErutiwnBm~K$-DjCF%PWFelI=n zY4;Y7bewXpv-Xp|8tPk2&v1*m`Zp$K{Exo1#2k@f`$0@vR@mJ_hV?g2lWt56%UG&5 z6Qv-NyZo`0?eCX<%%0c#EsiVvVRM%cVeRwY2Z`I<+eW!VZOPuK2<40-AP{3=j2JD0 z2gNJl1+{spKVq{FD%3<)l_#2gqCsEsP-y}+*d%@Ss&YwHp;JK!#T|h{ELFOpd&cy@ z8#T#&sUS9e@2Sc{=AnR@R8+sUriY58QKjaIG_JL}S z#i>oQU^Mt$8xmDjq9Fmux*93qJ3Gmop>TtlG4qfULxWH{KOXQg>n-QZkTFl`G@MxV z9&5Az%lWG5muEa6MZceY+(T1Gx{9zr6^LRDhzLhI{FJq+CFc~Ap;E123Hb=z+tYKD zOul6}qQQ#mV^WMRCuXRynhmdbkW>qD-wu{ow~!>xhtgBCOM~VYH;q$rr>Esx-?%k+ zJM967^p(8gA9!dQ5BQKd2;`l}@I4XqH5jE2$><1f(MA3tTrTqxrbi6!V8u(FJF`8bJx*|GpfKOX< zJk`_sWb7eFL~Sxgw)7TH;f>v-WNfn@_afM3Otdkv0-`R6>o4hSgscz|>}z+?_QDc{ zw$u?CrA*6oX*cSs9)&<{Y*f;f=$_bg*k`Iqxn5v(il#47BAsZjyHrbV1E}7)ujSRq zxJL4kStP&zWx4pY(M^BCsDa#P`PL6^Ss&x$h z7XdJ8@)f5IYqBn>38W1pb%qcZPs_(I?!ABfP48nL-I*PiG0(pWE+p$1j{PH>)cuLG>i)s!Pum;F1boJ~o{qYo9<4Y+h{Rb5@uctxq(AIL z*fk>OArdKZc>pgBoU}+iOY}f{Lr7_t?ryzyXq(kU;ig`jRd}94-Jd+`j$Z$QDcSlM z)?&|o?(H5QZSO1b%(_z6ye+%7tZR6tN^r!HRVQna!w%%o`$_=8R%8g9!W^{}-au$+ zn-(39l_Sy;@LaOJ;@LVbvFBhiT}uaPAAFoo_1I4oaNf9ezV@5{`huzTESkmQzj@Yk zZ-4jbQ15NaY6%z4=-4&EqL@huUoMfI!`tT-uVa=m8IlZ(Ji?_>2;0yiszN!hD}*wu zi02y-cjq~BGEL8^?c)snhvnUb_n03aqx@xOKigFTeMKX@|o-5YNYa@jTo)Jl@ld@K*uHw(TN&c)sZ} z7(yA-o1P3%xv9Lnsd6|+UblabE^}koL>H&*$EDIQ^ZS?j@9t0gBH8?nv-QDS&rW*n zdv?4*Ht&u>GT*L3;qYLh?juA5E8{`hT;LMQBQ>%$KAKuIX&39caEqxI6j5nhqEKCAv^|D1 zBgn|IIEj!SglKLI6_ApHQ7zTX|y9^BX zVxjzvO}kjb(JoeNH2;1xZM}KX3m1#$c|LuA=iqop8%+>NF#@7is0mRWN>ZYZcv&IX z1Y?J2iNz7MeS%kNGhWF|9mxUk>a1n0kTpZ{9sx-h&2#vmm9HclRy|kmD}iQQ_3**_ zuGwfbm$Q85KbnE_%UJtu-)J5m92hv)3R_bmE~mi>60kuE?3UV(US<2C4*4#NkzRkxID2d36eH>E_Z90iK7i&I&bBGRqWP&AG_MDmAD+nO@yW54%`>WVK_w#ao^6=1 z8c{vMc!cy>ScN$otNPjjKEqPF{`Da@oenq)oS)K>rmHW?P$wjt@7siCIE&_&XFNJ- zKl6Ripy}t9#0X?EZ&=rcO|JIFMW)9t>0_b57KD<-2(6Q_rw*8X2c8RV>aV2U_x`bp zOjXgEujzA@jhjZ54I6Bd9QuuUB(IkyXe!xM2SuDu4$HO^w5g`e)TFcw-124RMVE%i 
z>8X#vOw6~`*$5A{OEBs0-M*&I*s^EVt&WOL86LfFQ#n;Bkt*>7dawo({+lT{;ii@nn+0>7_m{25zup*tAaXF)+!s|%Li&nbEj;6;eJ%4gOF`cJ%rz z*mF%dRWl-~4mF_zcq$Y!5tgEyUc+a>yq0v546>&vYY+ltG89qU;Z1aWMA-pSScOnF zQf-XYy+KQJ*TYtiz;x4_X6L05UIT?{FbayopI#LNLFzU`n<=M(lKVdXSC`?hU7AlW zvHMC?m71caf8wHf9nidv(EP&BJ#yE~2F~kxH2?RDraDguU++n&gC^=3jcg-2@`ZsT zE;~`Ojy7_NkPeAJ(kaM@(gs9XS1_5>tsmwwl{qP&qP>Pp)=g7K)aQ`|x)~`k#Juj^ z&?R=-^|CbBs4YlA+IQ%yu5%S7=ps7(F0=iuFu7>^24hLSM}B<=6yi2lnmZUr6T5nrXvuq!hH8;QC7l;-d2C;q4tf>ju- zjz^n2ZKqh184aH3Q!klHC1Lw2Pbv+Lik)kbSwgELS<4({4Uu(VF6)Sf((tgXX|GHU zLn&xzfD&A!_Z20hbOcYBfp;!zj+j-}Mra0VxUbfxBOeot@ko7dv=Z90K3t3W^waM( zv~`X4{%M*0r|+N4?Atv&*giheXA&}oVlX8{00`wYI?}d;r^x{&ZOaBrS4ljlbQENL zCOg4IS<}=nqXH%uyd>?0vy!~xun?%n)md3) z062;+UlpTy`d^?22Iw#>8Siqi!$n9g-+Q)`hOJz_{H4=!@Pk-;#7FXIfA>UN&74kd z;}Q8|nkok-z5zfPVkL$;JM}BX4SA@J+zcI9)^<5uRRq+lIHsKu2%sXG-QZKlBT$xW zK^grtQ)y4ewl%dYJ-umm#rChK10@PHfB9_XHjC!By>#k>NLK%(ujN`Ub+6Y|z_I99 z9>w-BerwbcJCD~6OBsRZ+dP*yy7HJn|01L1@7;vv4YL)Z2YWQ{>nlVguebYXYS~m3 zqB8*o7OyNNr*U*y&hS>6cHCs}Tq5!jvJG?kt4%C*jx=--CZlgx5l{=|FK?<41)~bl z^9+=q{nLY*|IlJs`yo%wkM_3rj&~Q2(zctHG5Bw4n#q6*vP;nwpgCApa{M1kp0{;T zXN72p{%Auw_pzfHE3C;7lvZ&_J1cm@*v~W7vyzedI489z z8XpM$H6OvJtx@f#jS)1XEL;S?(-X$80)mO2qhR>iB(;SabZ6tu-JyB={ zCfUX-62cBq5hsd6$VwH}TmwaUL4p$o3j%>7z&$-IdUn{{n%uhb3K+WxxW#oy206M|63q z;ta4dr;>^*pS(PiyczYg`ZoGkoTdPH>!<)|>FY}dO?P4*A^df|ENL_sW@6qOX<{C7 zS>Tg_SOLWnA-P2Yh6ENsW;qh!801w$UAmHJL`M4`$m)kt7H3F@!EwnlW_12}IHX6@ z^RsZ|`R13Vp;QHiO4C(Ic7Q@=>d?`IXIyg97JOCZ7 zI!@a`3NWgQkrq*%BQgfIG(eAM;0Hi1$O^n_#@i0`tf=a9S*f%0E_!R{H~8Z~KME+` ze9cp5xaUhB9U-x+6D${r_xKWag-E!9qrKA5fH=b$ozJvesn~?XmW#x*ZK>#62AO#K zVpw~-Clkke$d3*eKSf82p)wTGSr4?$Eh{)mKQsUk3=%0QKgZ$Cg%GVAuIR|F$)4+qZ3v z&@Q#W|%&e%!=MAPIIz*)V~hsb?eZZ zFQ@(VE2myFe$`^}yT0rlgUi%yGXY2mxAtvLo-hnc&>LCQ@w@}4j`Ad{!>~dgj)=Ys zQ8-6EvJF|S4!(kWji_lB{5mTmC~wOIk7kM%=jpRPg~}%W8xVsoQ$03*h~std4x#zk zvmMc9sdhrn-{L|D-|WkxMpK=GMTa)gv9eiM!y}3=q#eGfqtW4?5e=MeIL*~Z4H7In z5CqRjvt_BJ21YN7cnR*EXEKW~rfD&?`i}gisqZ9D*Y~yNvEgcK7e9M8s@?W@z#vYx 
zWc&rJ-QqPIo*e9*?CA|{4Xl4$sxyv31!W>EpxM;bMYDLr79z{KGJ-gwWLZH>fpuJV zJZgDMfr%^%@Jd-!ioCpXdN`e5oBy45rVNOn(-cIi-;Z51S!h|5s9ZNr-O3+0mbXYjVd^C~P@g#W{eXFTVk zD5<#Ki5hWrB&jn&;_3=@wlEwWOPdBR8{}YRb8-}Zm&wwUZ}kR#wJjhW-m)|_K%fsv zhb}MmTQzDys$S4ZX%H+^wN3*rkbdHq+ZyHQxS{LKc<%RHdg60u2G@PD*&2ktH4SyaDhc*8cohsrR;bj*fSaRf`crq@0&UM29(4O$Bs93Wrd^ZK6``aA{fF$f88N z9XC+}&m-;F*ky6pLY}%!VZB{||ZKFY(RS-UZV=PP-3(6{R z(-Dv;BEjPVWrmJsnb7%ju~5E4z_}Js(-39DS`;!AfFpH>2`BCthjgf6Db9O5i+HSb z8$hyhL9yK{T_irSdj8NSY+?0H-#RU<(%QRxso34!-P1!2*L?Pn5*T#YeUz|^u3=C+ zz>x7cXOx1$T9t@Ya8U=RAkOSL(^IVUkCk&?)A3K^{Zn(N2i(496UpscBP5MXxk$d= zm#MD?lFra6ZiPHH4n zhDC9U3v1R@MhvVZuVHy?HN!i+uJ4$&+Xc;eTJ&pGTk>-JI}?PUYAzV!2*T+tzLb1< zMmoo}n;Cto*hYa@h9U+xAp85|oObY91- zW0YxU9V(3QXmD;-)Uhz*NO)ue`4a#gWt66AggUJx790dy9OhtaC9gvrSQ42=Wk8C_ zm#GS>)U>!jWpzn^;%DMwsw42!KfcN1l}=q9@LGX4qIu&8&80!}gxP_bcZOhI%~>Y{rr3erT1+=v-Vu9GFZ+ohXp#oG39b+Hs(&V?9^` znpgRm^9O;<1K+>>+?lZSgTLm0aO>A;F?;c10P889m>FO`cHLr8FzTtQ+ZqVek~%O# zP!A1tOGgC<^N>Zz_AaP@q8f4+=yv&`+umehW`e2nLF>N#-X^Jc*-w6Sd)JotU3x2S z?=AoOYv*q7tyYVk`W;i7^fs*hp8vTY?QL)C5yM>t=C90U$lAOuYusresZNGsnW*9x zMU9AzI5M>y`Ud+`79mBBu;sNty@L*W#bwBN#fq@oo6zR~vm7afqa%s^qCS0lTtK1m zA(O6tkpf#Qp+1N+)e5ZOB$ZPqNdwEr&bCF?@~JQTjw%25cd+*U37SXy`v-c0-KoKn zbe;;2K?c@2&A=&nn+wqYbdqL7n;0*ogE#c?oB-y%Vni(MGdXM6gYkQNYn9rKBjy|t zvtKS9{T=#5I{!@PIr`7F@xyWBZ~KmE8y-pQ6MW^<(n+*c$!dVtltn{RmFSq)26h~I zizJL`JZ)WNWz1=g2%Y0a-L_>>fr#m9Vq8i@AG`}imZybbTg>!Xb9xqyS1gu_jY||S zuU=D4Q2H!!HSf+0ATCTBi8;E$%LV@MnqV)T(Pbh3idKgD z4^hdeza23s_+wtt-hqyq+j$QAjvJ=#ud<3ej(P^-)QX2lpsN`5qAaeUvXu~sGNddJ zniW?}&U^CUeP8()@AlOH3LS*eS77&p+Zfk2ZQ`X zWh0BiHtXVyPRwgVp4CVScm|+zl;3K&lA0rXj2hX}BnEJxB_M53{kS}`V#(k!oSF5T z(Oz*ynih%wdOF<;H+fe&kLR6hDw}t1{eRFwe8D&I001A02m}BC000301^_}s0stET N0{{R300000007o0qmlps literal 0 HcmV?d00001 diff --git a/src/umi_tools/umi_tools_prepareforrsem/test_data/test_dedup.sam b/src/umi_tools/umi_tools_prepareforrsem/test_data/test_dedup.sam new file mode 100644 index 00000000..e9487b8b --- /dev/null +++ 
b/src/umi_tools/umi_tools_prepareforrsem/test_data/test_dedup.sam @@ -0,0 +1,201 @@ +@HD VN:1.6 SO:coordinate +@SQ SN:MT192765.1 LN:29829 +@RG ID:1 LB:lib1 PL:ILLUMINA SM:test PU:barcode1 +@PG ID:minimap2 PN:minimap2 VN:2.17-r941 CL:minimap2 -ax sr tests/data/fasta/sarscov2/GCA_011545545.1_ASM1154554v1_genomic.fna tests/data/fastq/dna/sarscov2_1.fastq.gz tests/data/fastq/dna/sarscov2_2.fastq.gz +@PG ID:samtools PN:samtools PP:minimap2 VN:1.11 CL:samtools view -Sb sarscov2_aln.sam +@PG ID:samtools.1 PN:samtools PP:samtools VN:1.11 CL:samtools sort -o sarscov2_paired_aln.sorted.bam sarscov2_paired_aln.bam +@PG ID:samtools.2 PN:samtools PP:samtools.1 VN:1.20 CL:samtools view -h test_data/test_dedup.bam +ERR5069949.29668 163 MT192765.1 121 60 150M = 267 235 TATAATTAATAACTAATTACTGTCGTTGACAGGACACGAGTAACTCGTCTATCTTCTGCAGGCTGCTTACGGTTTCTTCCGTGTTGCAGCCGATCATCAGCACATCTAGGTTTTGTCCGGGTGTGACCGAAAGGTAAGATGGAGAGCCTT AAA/E/EEEEEEEEEAEEEEEEEEE/ Date: Fri, 13 Sep 2024 09:10:30 +0200 Subject: [PATCH 18/28] Add Kallisto index (#149) --- CHANGELOG.md | 6 +- src/kallisto/kallisto_index/Kallisto | Bin 0 -> 2439 bytes src/kallisto/kallisto_index/config.vsh.yaml | 94 ++++++++++++++++++ src/kallisto/kallisto_index/help.txt | 21 ++++ src/kallisto/kallisto_index/script.sh | 34 +++++++ src/kallisto/kallisto_index/test.sh | 35 +++++++ .../kallisto_index/test_data/d_list.fasta | 5 + .../test_data/transcriptome.fasta | 23 +++++ 8 files changed, 217 insertions(+), 1 deletion(-) create mode 100644 src/kallisto/kallisto_index/Kallisto create mode 100644 src/kallisto/kallisto_index/config.vsh.yaml create mode 100644 src/kallisto/kallisto_index/help.txt create mode 100644 src/kallisto/kallisto_index/script.sh create mode 100644 src/kallisto/kallisto_index/test.sh create mode 100644 src/kallisto/kallisto_index/test_data/d_list.fasta create mode 100644 src/kallisto/kallisto_index/test_data/transcriptome.fasta diff --git a/CHANGELOG.md b/CHANGELOG.md index d88d0996..846007d8 100644 --- a/CHANGELOG.md +++ 
b/CHANGELOG.md @@ -149,7 +149,11 @@ * `sortmerna`: Local sequence alignment tool for mapping, clustering, and filtering rRNA from metatranscriptomic data. (PR #146) -* `fq_subsample`: Sample a subset of records from single or paired FASTQ files (PR #147). +* `fq_subsample`: Sample a subset of records from single or paired FASTQ files (PR #147). + +* `kallisto`: + - `kallisto_index`: Create a kallisto index (PR #149). + ## MINOR CHANGES diff --git a/src/kallisto/kallisto_index/Kallisto b/src/kallisto/kallisto_index/Kallisto new file mode 100644 index 0000000000000000000000000000000000000000..3c7b5b2bff962965d99ca3f9a4a6b6af6da1f3f0 GIT binary patch literal 2439 zcmeHJTTBx{6rFCj(iW7(R4PH@f{iwcTCEBOksY_VO)N&jMkQb@MkU0z5b%pZJItq>6y3i5QJC02jb;7mKSNlWm{Po|xmnK|d)x%cj5cE^Hf zo5Ms=gP>q-=Dx`Y&8Tam%b*(*sAYc)aw*sH`%Qmb#irI!wJd#g^%N{jJ942Y*B;7v zR8myiWp1c$UHLL&=A`AC^o$*g(yPKxE9&k2TL(+#tPc&Aci!%O@N%%KUe}u^mgT!# zi%Y`WO!oXjbAkQw#f0xZP+kZo&F8eXH{r?boldZcY~7r&DBNCpW#D;g<%>%V>88S* zn(fP&_poKg&FeY4ul8ovgjcuEYhCLuEry9tsbgo; zfcbNEMVB@;wf0@jF3-`vqIs3Vln>G5z~*-Q*|O7R4?+L_8<&^;{;|Iv=yw^0x?1Fq zKKH{t-pk+KZ7B%++|bB;H`mVIyPQk@FmSG}@6usnuL2HvP5$1gzh1RfdlA=oM8O*p9pU%UxMoXtZa4MiI;e6lN9)Zj9SsczoL4F-nxW7gjS;lL#ITXLZTe7Tm{6y<^@7*BFcmyOu`fe-N>GMM_MNm8n3s15~-A-du zs>S(M$mE>7gVQsBMnsW@M&}f>b(D#qkcNO}MG=t0Wgt?+7*11VW2Q|{QbnyveDt-4O(8?_>}HQ+|nsHK}JD> zEI}_&#snCR!wU`=ImWmY2!IuUp@YzBuGIbjA?Q;}NCbS6Ej{38{Q(Ed(9~7CHlh~@ z(zu?LfHTHk>o~Hk>ib8~11>`F@%qmr>8X$)4UE=ZAnP=qIJp|ns6M_j(fMdSqjeZP zKmX@E#Gf*HzUVyzWeLinBtr;M7o$H(mPA>GqA1d9hM{`wuLpH{uWG16io*!3&Oq!~ zY>JwOK3Z%+$HT0KEnpYTsK*f41@5@T5O@KlCdndB3}+_eA<9%le@sX@NTS*o#e2qq z{mZk6y~(I%5^AViqJ%~m(IWP&+TTT!n9y(~NA!$eA9;u!LWn;?@K-_t>bR9cmu5}*J8 literal 0 HcmV?d00001 diff --git a/src/kallisto/kallisto_index/config.vsh.yaml b/src/kallisto/kallisto_index/config.vsh.yaml new file mode 100644 index 00000000..2c4f65c7 --- /dev/null +++ b/src/kallisto/kallisto_index/config.vsh.yaml @@ -0,0 +1,94 @@ +name: kallisto_index +namespace: kallisto +description: | + Build a 
Kallisto index for the transcriptome to use Kallisto in the mapping-based mode. +keywords: [kallisto, index] +links: + homepage: https://pachterlab.github.io/kallisto/about + documentation: https://pachterlab.github.io/kallisto/manual + repository: https://github.com/pachterlab/kallisto + issue_tracker: https://github.com/pachterlab/kallisto/issues +references: + doi: https://doi.org/10.1038/nbt.3519 +license: BSD 2-Clause License + +argument_groups: +- name: "Input" + arguments: + - name: "--input" + type: file + description: | + Path to a FASTA-file containing the transcriptome sequences, either in plain text or + compressed (.gz) format. + required: true + - name: "--d_list" + type: file + description: | + Path to a FASTA-file containing sequences to mask from quantification. + +- name: "Output" + arguments: + - name: "--index" + type: file + direction: output + example: Kallisto_index + +- name: "Options" + arguments: + - name: "--kmer_size" + type: integer + description: | + Kmer length passed to indexing step of pseudoaligners (default: '31'). + example: 31 + - name: "--make_unique" + type: boolean_true + description: | + Replace repeated target names with unique names. + - name: "--aa" + type: boolean_true + description: | + Generate index from a FASTA-file containing amino acid sequences. + - name: "--distiguish" + type: boolean_true + description: | + Generate index where sequences are distinguished by the sequence names. + - name: "--min_size" + alternatives: ["-m"] + type: integer + description: | + Length of minimizers (default: automatically chosen). + - name: "--ec_max_size" + alternatives: ["-e"] + type: integer + description: | + Maximum number of targets in an equivalence class (default: no maximum). + - name: "--tmp" + alternatives: ["-T"] + type: string + description: | + Path to a directory for temporary files. 
+ example: "tmp" + +resources: + - type: bash_script + path: script.sh + +test_resources: + - type: bash_script + path: test.sh + - path: test_data + +engines: + - type: docker + image: ubuntu:22.04 + setup: + - type: docker + run: | + apt-get update && \ + apt-get install -y --no-install-recommends wget && \ + wget --no-check-certificate https://github.com/pachterlab/kallisto/releases/download/v0.50.1/kallisto_linux-v0.50.1.tar.gz && \ + tar -xzf kallisto_linux-v0.50.1.tar.gz && \ + mv kallisto/kallisto /usr/local/bin/ +runners: + - type: executable + - type: nextflow diff --git a/src/kallisto/kallisto_index/help.txt b/src/kallisto/kallisto_index/help.txt new file mode 100644 index 00000000..28778ac0 --- /dev/null +++ b/src/kallisto/kallisto_index/help.txt @@ -0,0 +1,21 @@ +``` +kallisto index +``` +kallisto 0.50.1 +Builds a kallisto index + +Usage: kallisto index [arguments] FASTA-files + +Required argument: +-i, --index=STRING Filename for the kallisto index to be constructed + +Optional argument: +-k, --kmer-size=INT k-mer (odd) length (default: 31, max value: 31) +-t, --threads=INT Number of threads to use (default: 1) +-d, --d-list=STRING Path to a FASTA-file containing sequences to mask from quantification + --make-unique Replace repeated target names with unique names + --aa Generate index from a FASTA-file containing amino acid sequences + --distinguish Generate index where sequences are distinguished by the sequence name +-T, --tmp=STRING Temporary directory (default: tmp) +-m, --min-size=INT Length of minimizers (default: automatically chosen) +-e, --ec-max-size=INT Maximum number of targets in an equivalence class (default: no maximum) diff --git a/src/kallisto/kallisto_index/script.sh b/src/kallisto/kallisto_index/script.sh new file mode 100644 index 00000000..59a5d3de --- /dev/null +++ b/src/kallisto/kallisto_index/script.sh @@ -0,0 +1,34 @@ +#!/bin/bash + +## VIASH START +## VIASH END + +set -eo pipefail + +unset_if_false=( par_make_unique par_aa 
par_distinguish ) + +for var in "${unset_if_false[@]}"; do + temp_var="${!var}" + [[ "$temp_var" == "false" ]] && unset $var +done + +if [ -n "$par_kmer_size" ]; then + if [[ "$par_kmer_size" -lt 1 || "$par_kmer_size" -gt 31 || $(( par_kmer_size % 2 )) -eq 0 ]]; then + echo "Error: Kmer size must be an odd number between 1 and 31." + exit 1 + fi +fi + +kallisto index \ + -i "${par_index}" \ + ${par_kmer_size:+--kmer-size "${par_kmer_size}"} \ + ${par_make_unique:+--make-unique} \ + ${par_aa:+--aa} \ + ${par_distinguish:+--distinguish} \ + ${par_min_size:+--min-size "${par_min_size}"} \ + ${par_ec_max_size:+--ec-max-size "${par_ec_max_size}"} \ + ${par_d_list:+--d-list "${par_d_list}"} \ + ${meta_cpus:+--cpu "${meta_cpus}"} \ + ${par_tmp:+--tmp "${par_tmp}"} \ + "${par_input}" + diff --git a/src/kallisto/kallisto_index/test.sh b/src/kallisto/kallisto_index/test.sh new file mode 100644 index 00000000..2646dcd8 --- /dev/null +++ b/src/kallisto/kallisto_index/test.sh @@ -0,0 +1,35 @@ +#!/bin/bash + +echo ">>>Test 1: Testing $meta_functionality_name with non-default k-mer size" + +"$meta_executable" \ + --input "$meta_resources_dir/test_data/transcriptome.fasta" \ + --index Kallisto \ + --kmer_size 21 + + +echo ">>> Checking whether output exists and is correct" +[ ! -f "Kallisto" ] && echo "Kallisto index does not exist!" && exit 1 +[ ! -s "Kallisto" ] && echo "Kallisto index is empty!" && exit 1 + +kallisto inspect Kallisto 2> test.txt +grep "number of k-mers: 989" test.txt || { echo "The content of the index seems to be incorrect." && exit 1; } + +################################################################################ + +echo ">>>Test 2: Testing $meta_functionality_name with d_list argument" + +"$meta_executable" \ + --input "$meta_resources_dir/test_data/transcriptome.fasta" \ + --index Kallisto \ + --d_list "$meta_resources_dir/test_data/d_list.fasta" + +echo ">>> Checking whether output exists and is correct" +[ ! 
-f "Kallisto" ] && echo "Kallisto index does not exist!" && exit 1 +[ ! -s "Kallisto" ] && echo "Kallisto index is empty!" && exit 1 + +kallisto inspect Kallisto 2> test.txt +grep "number of k-mers: 959" test.txt || { echo "The content of the index seems to be incorrect." && exit 1; } + +echo "All tests succeeded!" +exit 0 diff --git a/src/kallisto/kallisto_index/test_data/d_list.fasta b/src/kallisto/kallisto_index/test_data/d_list.fasta new file mode 100644 index 00000000..ad5e05bf --- /dev/null +++ b/src/kallisto/kallisto_index/test_data/d_list.fasta @@ -0,0 +1,5 @@ +>YAL067W-A CDS=1-228 +ATGCCAATTATAGGGGTGCCGAGGTGCCTTATAAAACCCTTTTCTGTGCCTGTGACATTTCCTTTTTCGG +TCAAAAAGAATATCCGAATTTTAGATTTGGACCCTCGTACAGAAGCTTATTGTCTAAGCCTGAATTCAGT +CTGCTTTAAACGGCTTCCGCGGAGGAAATATTTCCATCTCTTGAATTCGTACAACATTAAACGTGTGTTG +GGAGTCGTATACTGTTAG diff --git a/src/kallisto/kallisto_index/test_data/transcriptome.fasta b/src/kallisto/kallisto_index/test_data/transcriptome.fasta new file mode 100644 index 00000000..94c06163 --- /dev/null +++ b/src/kallisto/kallisto_index/test_data/transcriptome.fasta @@ -0,0 +1,23 @@ +>YAL069W CDS=1-315 +ATGATCGTAAATAACACACACGTGCTTACCCTACCACTTTATACCACCACCACATGCCATACTCACCCTC +ACTTGTATACTGATTTTACGTACGCACACGGATGCTACAGTATATACCATCTCAAACTTACCCTACTCTC +AGATTCCACTTCACTCCATGGCCCATCTCTCACTGAATCAGTACCAAATGCACTCACATCATTATGCACG +GCACTTGCCTCAGCGGTCTATACCCTGTGCCATTTACCCATAACGCCCATCATTATCCACATTTTGATAT +CTATATCTCATTCGGCGGTCCCAAATATTGTATAA +>YAL068W-A CDS=1-255 +ATGCACGGCACTTGCCTCAGCGGTCTATACCCTGTGCCATTTACCCATAACGCCCATCATTATCCACATT +TTGATATCTATATCTCATTCGGCGGTCCCAAATATTGTATAACTGCCCTTAATACATACGTTATACCACT +TTTGCACCATATACTTACCACTCCATTTATATACACTTATGTCAATATTACAGAAAAATCCCCACAAAAA +TCACCTAAACATAAAAATATTCTACTTTTCAACAATAATACATAA +>YAL068C CDS=1-363 +ATGGTCAAATTAACTTCAATCGCCGCTGGTGTCGCTGCCATCGCTGCTACTGCTTCTGCAACCACCACTC +TAGCTCAATCTGACGAAAGAGTCAACTTGGTGGAATTGGGTGTCTACGTCTCTGATATCAGAGCTCACTT +AGCCCAATACTACATGTTCCAAGCCGCCCACCCAACTGAAACCTACCCAGTCGAAGTTGCTGAAGCCGTT 
+TTCAACTACGGTGACTTCACCACCATGTTGACCGGTATTGCTCCAGACCAAGTGACCAGAATGATCACCG +GTGTTCCATGGTACTCCAGCAGATTAAAGCCAGCCATCTCCAGTGCTCTATCCAAGGACGGTATCTACAC +TATCGCAAACTAG +>YAL067W-A CDS=1-228 +ATGCCAATTATAGGGGTGCCGAGGTGCCTTATAAAACCCTTTTCTGTGCCTGTGACATTTCCTTTTTCGG +TCAAAAAGAATATCCGAATTTTAGATTTGGACCCTCGTACAGAAGCTTATTGTCTAAGCCTGAATTCAGT +CTGCTTTAAACGGCTTCCGCGGAGGAAATATTTCCATCTCTTGAATTCGTACAACATTAAACGTGTGTTG +GGAGTCGTATACTGTTAG \ No newline at end of file From fe56ee7c53ca30f25aa31cb9a025e17cd75b636e Mon Sep 17 00:00:00 2001 From: Sai Nirmayi Yasa <92786623+sainirmayi@users.noreply.github.com> Date: Fri, 13 Sep 2024 09:15:19 +0200 Subject: [PATCH 19/28] change output quant file to an optional argument (#151) --- src/salmon/salmon_quant/config.vsh.yaml | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/src/salmon/salmon_quant/config.vsh.yaml b/src/salmon/salmon_quant/config.vsh.yaml index 1f96f0c9..5fa3d48f 100644 --- a/src/salmon/salmon_quant/config.vsh.yaml +++ b/src/salmon/salmon_quant/config.vsh.yaml @@ -24,7 +24,7 @@ argument_groups: description: | Format string describing the library. The library type string consists of three parts: - 1. Relative orientation of the reads: This part is only provided if the library is paired-end, THe possible options are + 1. Relative orientation of the reads: This part is only provided if the library is paired-end, The possible options are I = inward O = outward M = matching @@ -118,7 +118,7 @@ argument_groups: direction: output description: | Salmon quantification file. - required: true + required: false example: quant.sf - name: Basic options @@ -327,7 +327,7 @@ argument_groups: If this option is provided, then the selective-alignment results will be written out in SAM-compatible format. By default, output will be directed to stdout, but an alternative file name can be provided instead. 
- name: --mapping_sam type: file - description: Path to file that should output the selective-alignment results in SAM-compatible format. THis option must be provided while using --write_mappings + description: Path to file that should output the selective-alignment results in SAM-compatible format. This option must be provided while using --write_mappings required: false direction: output example: mappings.sam From 124d50ce5318b612e4a1e4da1be705523cd6eab7 Mon Sep 17 00:00:00 2001 From: Robrecht Cannoodt Date: Mon, 16 Sep 2024 09:50:24 +0200 Subject: [PATCH 20/28] update changelog for viash 0.2.0 --- CHANGELOG.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 846007d8..1f733203 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -1,4 +1,4 @@ -# biobox x.x.x +# biobox 0.2.0 ## BREAKING CHANGES From c3b40a15350235b00144f9f6735090d45bc24963 Mon Sep 17 00:00:00 2001 From: Robrecht Cannoodt Date: Mon, 16 Sep 2024 09:53:23 +0200 Subject: [PATCH 21/28] update to viash 0.9.0 --- CHANGELOG.md | 6 ++++++ _viash.yaml | 2 +- 2 files changed, 7 insertions(+), 1 deletion(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 1f733203..0370a216 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -1,3 +1,9 @@ +# biobox x.x.x + +## MINOR CHANGES + +* Upgrade to Viash 0.9.0. 
+ # biobox 0.2.0 ## BREAKING CHANGES diff --git a/_viash.yaml b/_viash.yaml index ab4f3828..d08f2fb2 100644 --- a/_viash.yaml +++ b/_viash.yaml @@ -7,7 +7,7 @@ links: issue_tracker: https://github.com/viash-hub/biobox/issues repository: https://github.com/viash-hub/biobox -viash_version: 0.9.0-RC7 +viash_version: 0.9.0 config_mods: | .requirements.commands := ['ps'] From 38f635ad57ef05550bba3a0864c81627f84f5ad2 Mon Sep 17 00:00:00 2001 From: Leila011 Date: Mon, 16 Sep 2024 10:44:10 +0200 Subject: [PATCH 22/28] Add agat convert genscan2gff (#100) * add config * add help * add test data and expected output adn the script to obtain them * add running script * add test script * update changelog * cleanup * fix tests * format description * remove unused argument --inflate-off * update --config description * add requirements * create temporary directory and clean up on exit * add GENSCAN in keywords * add set -e to test * fix create temporary directory * add set -eo pipefail to test * add set -eo pipefail to script * fix create temporary directory * update --config description * cleanup changelog * cleanup changelog * Update deprecated variable --------- Co-authored-by: Robrecht Cannoodt --- CHANGELOG.md | 5 + .../agat_convert_genscan2gff/config.vsh.yaml | 95 +++++++++++++ src/agat/agat_convert_genscan2gff/help.txt | 94 +++++++++++++ src/agat/agat_convert_genscan2gff/script.sh | 21 +++ src/agat/agat_convert_genscan2gff/test.sh | 35 +++++ .../test_data/agat_convert_genscan2gff_1.gff | 25 ++++ .../test_data/script.sh | 11 ++ .../test_data/test.genscan | 127 ++++++++++++++++++ 8 files changed, 413 insertions(+) create mode 100644 src/agat/agat_convert_genscan2gff/config.vsh.yaml create mode 100644 src/agat/agat_convert_genscan2gff/help.txt create mode 100644 src/agat/agat_convert_genscan2gff/script.sh create mode 100644 src/agat/agat_convert_genscan2gff/test.sh create mode 100644 src/agat/agat_convert_genscan2gff/test_data/agat_convert_genscan2gff_1.gff create mode 100755 
src/agat/agat_convert_genscan2gff/test_data/script.sh create mode 100644 src/agat/agat_convert_genscan2gff/test_data/test.genscan diff --git a/CHANGELOG.md b/CHANGELOG.md index 0370a216..b31f43d9 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -1,5 +1,10 @@ # biobox x.x.x +## NEW FUNCTIONALITY + +* `agat`: + - `agat/agat_convert_genscan2gff`: convert a genscan file into a GFF file (PR #100). + ## MINOR CHANGES * Upgrade to Viash 0.9.0. diff --git a/src/agat/agat_convert_genscan2gff/config.vsh.yaml b/src/agat/agat_convert_genscan2gff/config.vsh.yaml new file mode 100644 index 00000000..2adce1da --- /dev/null +++ b/src/agat/agat_convert_genscan2gff/config.vsh.yaml @@ -0,0 +1,95 @@ +name: agat_convert_genscan2gff +namespace: agat +description: | + The script takes a GENSCAN file as input, and will translate it into GFF + format. The GENSCAN format is described [here](http://genome.crg.es/courses/Bioinformatics2003_genefinding/results/genscan.html). + + **Known problem** + + You must have submitted only DNA sequence, without any header!! Indeed the tool expects only DNA + sequences and does not crash/warn if a header is submitted along the + sequence. e.g. If you have a header ">seq" s-e-q are seen as the 3 first + nucleotides of the sequence. Then all prediction locations are shifted + accordingly. (checked only on the [online version](http://argonaute.mit.edu/GENSCAN.html). + I don't know if there is the same problem elsewhere.)
+keywords: [gene annotations, GFF conversion, GENSCAN] +links: + homepage: https://github.com/NBISweden/AGAT + documentation: https://agat.readthedocs.io/en/latest/tools/agat_convert_genscan2gff.html + issue_tracker: https://github.com/NBISweden/AGAT/issues + repository: https://github.com/NBISweden/AGAT +references: + doi: 10.5281/zenodo.3552717 +license: GPL-3.0 +requirements: + - commands: [agat] +authors: + - __merge__: /src/_authors/leila_paquay.yaml + roles: [ author, maintainer ] + +argument_groups: + - name: Inputs + arguments: + - name: --genscan + alternatives: [-g] + description: Input genscan bed file that will be converted. + type: file + required: true + direction: input + - name: Outputs + arguments: + - name: --output + alternatives: [-o, --out, --outfile, --gff] + description: Output GFF file. If no output file is specified, the output will be written to STDOUT. + type: file + direction: output + required: true + example: output.gff + - name: Arguments + arguments: + - name: --source + description: | + The source informs about the tool used to produce the data and is stored in 2nd field of a gff file. Example: Stringtie, Maker, Augustus, etc. [default: data] + type: string + required: false + example: Stringtie + - name: --primary_tag + description: | + The primary_tag corresponds to the data type and is stored in 3rd field of a gff file. Example: gene, mRNA, CDS, etc. [default: gene] + type: string + required: false + example: gene + - name: --inflate_type + description: | + Feature type (3rd column in gff) created when inflate parameter activated [default: exon]. + type: string + required: false + example: exon + - name: --verbose + description: add verbosity + type: boolean_true + - name: --config + alternatives: [-c] + description: | + AGAT config file. By default AGAT takes the original agat_config.yaml shipped with AGAT. The `--config` option gives you the possibility to use your own AGAT config file (located elsewhere or named differently). 
+ type: file + required: false + example: custom_agat_config.yaml +resources: + - type: bash_script + path: script.sh +test_resources: + - type: bash_script + path: test.sh + - type: file + path: test_data +engines: + - type: docker + image: quay.io/biocontainers/agat:1.4.0--pl5321hdfd78af_0 + setup: + - type: docker + run: | + agat --version | sed 's/AGAT\s\(.*\)/agat: "\1"/' > /var/software_versions.txt +runners: + - type: executable + - type: nextflow \ No newline at end of file diff --git a/src/agat/agat_convert_genscan2gff/help.txt b/src/agat/agat_convert_genscan2gff/help.txt new file mode 100644 index 00000000..8a9e9f52 --- /dev/null +++ b/src/agat/agat_convert_genscan2gff/help.txt @@ -0,0 +1,94 @@ +```sh +agat_convert_genscan2gff.pl --help +``` + ------------------------------------------------------------------------------ +| Another GFF Analysis Toolkit (AGAT) - Version: v1.4.0 | +| https://github.com/NBISweden/AGAT | +| National Bioinformatics Infrastructure Sweden (NBIS) - www.nbis.se | + ------------------------------------------------------------------------------ + +Name: + agat_convert_genscan2gff.pl + +Description: + The script takes a genscan file as input, and will translate it in gff + format. The genscan format is described here: + http://genome.crg.es/courses/Bioinformatics2003_genefinding/results/gens + can.html /!\ vvv Known problem vvv /!\ You must have submited only DNA + sequence, wihtout any header!! Indeed the tool expects only DNA + sequences and does not crash/warn if an header is submited along the + sequence. e.g If you have an header ">seq" s-e-q are seen as the 3 first + nucleotides of the sequence. Then all prediction location are shifted + accordingly. (checked only on the online version + http://argonaute.mit.edu/GENSCAN.html. I don't know if there is the same + pronlem elsewhere.) 
/!\ ^^^ Known problem ^^^^ /!\ + +Usage: + agat_convert_genscan2gff.pl --genscan infile.bed [ -o outfile ] + agat_convert_genscan2gff.pl -h + +Options: + --genscan or -g + Input genscan bed file that will be convert. + + --source + The source informs about the tool used to produce the data and + is stored in 2nd field of a gff file. Example: + Stringtie,Maker,Augustus,etc. [default: data] + + --primary_tag + The primary_tag corresponf to the data type and is stored in 3rd + field of a gff file. Example: gene,mRNA,CDS,etc. [default: gene] + + --inflate_off + By default we inflate the block fields (blockCount, blockSizes, + blockStarts) to create subfeatures of the main feature + (primary_tag). Type of subfeature created based on the + inflate_type parameter. If you don't want this inflating + behaviour you can deactivate it by using the option + --inflate_off. + + --inflate_type + Feature type (3rd column in gff) created when inflate parameter + activated [default: exon]. + + --verbose + add verbosity + + -o , --output , --out , --outfile or --gff + Output GFF file. If no output file is specified, the output will + be written to STDOUT. + + -c or --config + String - Input agat config file. By default AGAT takes as input + agat_config.yaml file from the working directory if any, + otherwise it takes the orignal agat_config.yaml shipped with + AGAT. To get the agat_config.yaml locally type: "agat config + --expose". The --config option gives you the possibility to use + your own AGAT config file (located elsewhere or named + differently). + + -h or --help + Display this helpful text. + +Feedback: + Did you find a bug?: + Do not hesitate to report bugs to help us keep track of the bugs and + their resolution. Please use the GitHub issue tracking system available + at this address: + + https://github.com/NBISweden/AGAT/issues + + Ensure that the bug was not already reported by searching under Issues. 
+ If you're unable to find an (open) issue addressing the problem, open a new one. + Try as much as possible to include in the issue when relevant: + - a clear description, + - as much relevant information as possible, + - the command used, + - a data sample, + - an explanation of the expected behaviour that is not occurring. + + Do you want to contribute?: + You are very welcome, visit this address for the Contributing + guidelines: + https://github.com/NBISweden/AGAT/blob/master/CONTRIBUTING.md diff --git a/src/agat/agat_convert_genscan2gff/script.sh b/src/agat/agat_convert_genscan2gff/script.sh new file mode 100644 index 00000000..38afb084 --- /dev/null +++ b/src/agat/agat_convert_genscan2gff/script.sh @@ -0,0 +1,21 @@ +#!/bin/bash + +set -eo pipefail + +## VIASH START +## VIASH END + +# unset flags +[[ "$par_inflate_off" == "true" ]] && unset par_inflate_off +[[ "$par_verbose" == "false" ]] && unset par_verbose + +# run agat_convert_genscan2gff +agat_convert_genscan2gff.pl \ + --genscan "$par_genscan" \ + --output "$par_output" \ + ${par_source:+--source "${par_source}"} \ + ${par_primary_tag:+--primary_tag "${par_primary_tag}"} \ + ${par_inflate_off:+--inflate_off} \ + ${par_inflate_type:+--inflate_type "${par_inflate_type}"} \ + ${par_verbose:+--verbose} \ + ${par_config:+--config "${par_config}"} \ No newline at end of file diff --git a/src/agat/agat_convert_genscan2gff/test.sh b/src/agat/agat_convert_genscan2gff/test.sh new file mode 100644 index 00000000..b666dacf --- /dev/null +++ b/src/agat/agat_convert_genscan2gff/test.sh @@ -0,0 +1,35 @@ +#!/bin/bash + +set -eo pipefail + +## VIASH START +## VIASH END + +test_dir="${meta_resources_dir}/test_data" + +# create temporary directory and clean up on exit +TMPDIR=$(mktemp -d "$meta_temp_dir/$meta_name-XXXXXX") +function clean_up { + [[ -d "$TMPDIR" ]] && rm -rf "$TMPDIR" +} +trap clean_up EXIT + +echo "> Run $meta_name with test data" +"$meta_executable" \ + --genscan "$test_dir/test.genscan" \ + --output 
"$TMPDIR/output.gff" + +echo ">> Checking output" +[ ! -f "$TMPDIR/output.gff" ] && echo "Output file output.gff does not exist" && exit 1 + +echo ">> Check if output is empty" +[ ! -s "$TMPDIR/output.gff" ] && echo "Output file output.gff is empty" && exit 1 + +echo ">> Check if output matches expected output" +diff "$TMPDIR/output.gff" "$test_dir/agat_convert_genscan2gff_1.gff" +if [ $? -ne 0 ]; then + echo "Output file output.gff does not match expected output" + exit 1 +fi + +echo "> Test successful" diff --git a/src/agat/agat_convert_genscan2gff/test_data/agat_convert_genscan2gff_1.gff b/src/agat/agat_convert_genscan2gff/test_data/agat_convert_genscan2gff_1.gff new file mode 100644 index 00000000..695fb46c --- /dev/null +++ b/src/agat/agat_convert_genscan2gff/test_data/agat_convert_genscan2gff_1.gff @@ -0,0 +1,25 @@ +##gff-version 3 +unknown genscan gene 2223 4605 75.25 + . ID=gene_1 +unknown genscan mRNA 2223 4605 75.25 + . ID=mrna_1;Parent=gene_1 +unknown genscan exon 2223 3020 75.25 + . ID=exon_1;Parent=mrna_1 +unknown genscan exon 4249 4605 13.03 + . ID=exon_2;Parent=mrna_1 +unknown genscan CDS 2223 3020 75.25 + 0 ID=cds_1;Parent=mrna_1 +unknown genscan CDS 4249 4605 13.03 + 0 ID=cds_2;Parent=mrna_1 +unknown genscan gene 6829 8789 20.06 - . ID=gene_2 +unknown genscan mRNA 6829 8789 20.06 - . ID=mrna_2;Parent=gene_2 +unknown genscan exon 6829 7297 20.06 - . ID=exon_3;Parent=mrna_2 +unknown genscan exon 7730 7888 12.78 - . ID=exon_4;Parent=mrna_2 +unknown genscan exon 8029 8185 7.45 - . ID=exon_5;Parent=mrna_2 +unknown genscan exon 8278 8546 17.45 - . ID=exon_6;Parent=mrna_2 +unknown genscan exon 8647 8789 18.65 - . 
ID=exon_7;Parent=mrna_2 +unknown genscan CDS 6829 7297 20.06 - 1 ID=cds_3;Parent=mrna_2 +unknown genscan CDS 7730 7888 12.78 - 1 ID=cds_4;Parent=mrna_2 +unknown genscan CDS 8029 8185 7.45 - 2 ID=cds_5;Parent=mrna_2 +unknown genscan CDS 8278 8546 17.45 - 1 ID=cds_6;Parent=mrna_2 +unknown genscan CDS 8647 8789 18.65 - 0 ID=cds_7;Parent=mrna_2 +unknown genscan gene 10209 11924 16.18 + . ID=gene_3 +unknown genscan mRNA 10209 11924 16.18 + . ID=mrna_3;Parent=gene_3 +unknown genscan exon 10209 11313 16.18 + . ID=exon_8;Parent=mrna_3 +unknown genscan exon 11850 11924 3.27 + . ID=exon_9;Parent=mrna_3 +unknown genscan CDS 10209 11313 16.18 + 0 ID=cds_8;Parent=mrna_3 +unknown genscan CDS 11850 11924 3.27 + 2 ID=cds_9;Parent=mrna_3 diff --git a/src/agat/agat_convert_genscan2gff/test_data/script.sh b/src/agat/agat_convert_genscan2gff/test_data/script.sh new file mode 100755 index 00000000..c1693653 --- /dev/null +++ b/src/agat/agat_convert_genscan2gff/test_data/script.sh @@ -0,0 +1,11 @@ +#!/bin/bash + +# clone repo +if [ ! 
-d /tmp/agat_source ]; then + git clone --depth 1 --single-branch --branch master https://github.com/NBISweden/AGAT /tmp/agat_source +fi + +# copy test data +cp -r /tmp/agat_source/t/scripts_output/in/test.genscan src/agat/agat_convert_genscan2gff/test_data/test.genscan +cp -r /tmp/agat_source/t/scripts_output/out/agat_convert_genscan2gff_1.gff src/agat/agat_convert_genscan2gff/test_data/agat_convert_genscan2gff_1.gff + diff --git a/src/agat/agat_convert_genscan2gff/test_data/test.genscan b/src/agat/agat_convert_genscan2gff/test_data/test.genscan new file mode 100644 index 00000000..a88037db --- /dev/null +++ b/src/agat/agat_convert_genscan2gff/test_data/test.genscan @@ -0,0 +1,127 @@ +GENSCAN 1.0 Date run: 7-Mar-120 Time: 14:46:49 + + + +Sequence /tmp/03_07_20-14:46:49.fasta : 12217 bp : 42.83% C+G : Isochore 1 ( 0 - 43 C+G%) + + + +Parameter matrix: HumanIso.smat + + + +Predicted genes/exons: + + + +Gn.Ex Type S .Begin ...End .Len Fr Ph I/Ac Do/T CodRg P.... Tscr.. + +----- ---- - ------ ------ ---- -- -- ---- ---- ----- ----- ------ + + + + 1.01 Init + 2223 3020 798 2 0 55 2 924 0.940 75.25 + + 1.02 Term + 4249 4605 357 0 0 26 38 307 0.976 13.03 + + 1.03 PlyA + 4711 4716 6 -0.45 + + + + 2.06 PlyA - 4852 4847 6 -0.45 + + 2.05 Term - 7297 6829 469 0 1 13 42 387 0.281 20.06 + + 2.04 Intr - 7888 7730 159 0 0 85 93 144 0.998 12.78 + + 2.03 Intr - 8185 8029 157 2 1 65 60 144 0.787 7.45 + + 2.02 Intr - 8546 8278 269 1 2 36 65 287 0.946 17.45 + + 2.01 Init - 8789 8647 143 2 2 94 96 176 0.550 18.65 + + 2.00 Prom - 9720 9681 40 -6.55 + + + + 3.00 Prom + 10160 10199 40 -11.84 + + 3.01 Init + 10209 11313 1105 2 1 66 57 269 0.512 16.18 + + 3.02 Intr + 11850 11924 75 1 0 80 86 57 0.507 3.27 + + + +Suboptimal exons with probability > 1.000 + + + +Exnum Type S .Begin ...End .Len Fr Ph B/Ac Do/T CodRg P.... Tscr.. 
+ +----- ---- - ------ ------ ---- -- -- ---- ---- ----- ----- ------ + + + +NO EXONS FOUND AT GIVEN PROBABILITY CUTOFF + + + + + +Predicted peptide sequence(s): + + + + + +>/tmp/03_03_20-07:33:11.fasta|GENSCAN_predicted_peptide_1|384_aa + +MSSKNKVSKQDIDSIVESLMKKQKSYFEPRLAQIQQVGMENVQKLSAIHAELALLTASIS + +TVKSDVDKLKCKVENNFSAIDGHDQAFGELELKMADMEDRSRRCNIRVIGLKERLEGFNA + +IQYLTHSLPKWFPALADVPVEVMSAHRIYSDAKRGDNRTLIFNVLRYTTRQAILRAAKKD + +PLSVDDRKVRFSPDYSNFTVKRCQAFHQAKDAARNKCLDFFLLYPATLKIKEGAQYRSFT + +SPKEAEDYVNSAASNHAATPASPRQHGTILTIYRRIHSLYDGERARKIQLLEQAASVALT + +GDNWTSVRNDNYLGVTAHFIDNVWKLRCFALEVKKKKKHSRHTAEDCAEEFIDVSNRWEI + +NGKLTTLGTDSALIMLAAARLLPF + + + +>/tmp/03_03_20-07:33:11.fasta|GENSCAN_predicted_peptide_2|398_aa + +MASTMPSSSSTEDEENTPECLNKDHYHFHHYTMEYIQDKPTNVARVGGFTDKKSIAKVER + +CLARERQEATEDHEAIPSTSGATSLTKKLRSRSGLPIAGSGLVLPALCIICQKKEKFINR + +AGKRQRDPLSKAETLTVGQLQKAAELKDDQSILLHIKDKDCVALEVQYHKGCYNQYTRFM + +TRPEKPEKEQNEPTFDVGYKILCERIIRQRLLVNQEVLRMGQLRMAFIELVKANEGLDAS + +NYSIKNLERSRRADAGSQRIQIFDPDQRTPTQWKKFLSEGTKKEALAEFLYVAWKNADLT + +IVGKNLCLYIAHTNQCHCVTVKEGVQSVRVVEDLLLFLHAQHAAREHKAVIIKSSDTDVA + +VIAVSVQTDLPCSLYVFTGTGNRTRIIDITKVSSANKI + + + +>/tmp/03_03_20-07:33:11.fasta|GENSCAN_predicted_peptide_3|394_aa + +MQRGRAAGINGIPPEFYVAFWEQLSPFFLHMINFSIEKGGFLRDVNTALISLLMKKDKNP + +TDCSSYRPLSLLNSDVKIFAKLLPLRLEPHMPELVSSDQTGFIKSRTAADNIRRLLHIIA + +AAPGCETPMSVLSLDAMKAFDRLEWSFLWSVLEAMGFISTFIGMVKVLYSNPSARVLTGQ + +TFSSLFPVSRSSRQGCPLSPALFVLSLEPLAQAVRLSNLVLPICICDTQHKLSLFADDVI + +VFLEHPTQSLPHFLSICEEFRKLSGFKMNWSKSALMHLNDNARKSVTPVNIPLVGQLKYL + +GIEVFPSLNQIVKHNYSLAFTNVLKDMDRWISLPMSIQARISIIKMNGLPRIHFVSSMVP + +LPPPSDYWIKISAQGVRCPLAKPFTHSPYSKTKX From 7f8bcc2b3e1ffaac9778b6acb42420b19660d1a1 Mon Sep 17 00:00:00 2001 From: Robrecht Cannoodt Date: Tue, 17 Sep 2024 11:47:31 +0200 Subject: [PATCH 23/28] BD rhapsody sequence analysis (#96) * wip * fix test * add help * update 2.2 args * fix bug * extend test data * output separate files * analyse missing args * tweaks to test * fix script * fix test * 
fix test * move small reference * wip generate wta test data * don't forget about umi in r1 * remove unneeded pkg * load reference in memory just once * fix random choices * extend test * add abc immunediscoverypanel * wip abc testing code * fix abc test; need unique instrument, run and flowcell ids for each sample * add smk data * add entry to changelog * remove old test file * adapt test for missing read * update description * add comment * ensure cwl files are absolute * Apply suggestions from code review Co-authored-by: Dries Schaumont <5946712+DriesSchaumont@users.noreply.github.com> * fix suggestion * newer pipelines have docker requirements as a hint instead of a strict requirement * rename str to content * remove deleted resources * fix containers * fix script * fix suggestion * fix suggestion... * fix test * fix component name * fix test * apply suggestions * fix test * added note * fix changelog * fix changelog again * splitting hairs here --------- Co-authored-by: Dries Schaumont <5946712+DriesSchaumont@users.noreply.github.com> --- CHANGELOG.md | 2 + .../config.vsh.yaml | 14 +- .../make_rhap_reference_2.2.1_nodocker.cwl | 115 --- .../bd_rhapsody_make_reference/script.py | 12 +- .../test_data/script.sh | 47 -- .../_process_cwl.R | 116 +++ .../config.vsh.yaml | 661 ++++++++++++++++++ .../bd_rhapsody_sequence_analysis/help.txt | 167 +++++ .../pipeline_inputs_template_2.2.1.yaml | 203 ++++++ .../bd_rhapsody_sequence_analysis/script.py | 243 +++++++ .../bd_rhapsody_sequence_analysis/test.py | 494 +++++++++++++ .../helpers/rhapsody_cell_label.py | 405 +++++++++++ .../BDAbSeq_ImmuneDiscoveryPanel.fasta | 60 ++ .../SampleTagSequences_HomoSapiens_ver1.fasta | 24 + .../test_data/reference_small.fa | 0 .../test_data/reference_small.gtf | 0 src/bd_rhapsody/test_data/script.sh | 141 ++++ 17 files changed, 2532 insertions(+), 172 deletions(-) delete mode 100644 src/bd_rhapsody/bd_rhapsody_make_reference/make_rhap_reference_2.2.1_nodocker.cwl delete mode 100644 
src/bd_rhapsody/bd_rhapsody_make_reference/test_data/script.sh create mode 100644 src/bd_rhapsody/bd_rhapsody_sequence_analysis/_process_cwl.R create mode 100644 src/bd_rhapsody/bd_rhapsody_sequence_analysis/config.vsh.yaml create mode 100644 src/bd_rhapsody/bd_rhapsody_sequence_analysis/help.txt create mode 100644 src/bd_rhapsody/bd_rhapsody_sequence_analysis/pipeline_inputs_template_2.2.1.yaml create mode 100644 src/bd_rhapsody/bd_rhapsody_sequence_analysis/script.py create mode 100644 src/bd_rhapsody/bd_rhapsody_sequence_analysis/test.py create mode 100644 src/bd_rhapsody/helpers/rhapsody_cell_label.py create mode 100644 src/bd_rhapsody/test_data/BDAbSeq_ImmuneDiscoveryPanel.fasta create mode 100644 src/bd_rhapsody/test_data/SampleTagSequences_HomoSapiens_ver1.fasta rename src/bd_rhapsody/{bd_rhapsody_make_reference => }/test_data/reference_small.fa (100%) rename src/bd_rhapsody/{bd_rhapsody_make_reference => }/test_data/reference_small.gtf (100%) create mode 100644 src/bd_rhapsody/test_data/script.sh diff --git a/CHANGELOG.md b/CHANGELOG.md index b31f43d9..07a83c15 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -5,6 +5,8 @@ * `agat`: - `agat/agat_convert_genscan2gff`: convert a genscan file into a GFF file (PR #100). +* `bd_rhapsody/bd_rhapsody_sequence_analysis`: BD Rhapsody Sequence Analysis CWL pipeline (PR #96). + ## MINOR CHANGES * Upgrade to Viash 0.9.0. 
diff --git a/src/bd_rhapsody/bd_rhapsody_make_reference/config.vsh.yaml b/src/bd_rhapsody/bd_rhapsody_make_reference/config.vsh.yaml index e596bf06..dc71262b 100644 --- a/src/bd_rhapsody/bd_rhapsody_make_reference/config.vsh.yaml +++ b/src/bd_rhapsody/bd_rhapsody_make_reference/config.vsh.yaml @@ -116,12 +116,11 @@ argument_groups: resources: - type: python_script path: script.py - - path: make_rhap_reference_2.2.1_nodocker.cwl test_resources: - type: bash_script path: test.sh - - path: test_data + - path: ../test_data requirements: commands: [ "cwl-runner" ] @@ -131,12 +130,19 @@ engines: image: bdgenomics/rhapsody:2.2.1 setup: - type: apt - packages: [procps] + packages: [procps, git] - type: python packages: [cwlref-runner, cwl-runner] - type: docker run: | - echo "bdgenomics/rhapsody: 2.2.1" > /var/software_versions.txt + mkdir /var/bd_rhapsody_cwl && \ + cd /var/bd_rhapsody_cwl && \ + git clone https://bitbucket.org/CRSwDev/cwl.git . && \ + git checkout 8feeace1141b24749ea6003f8e6ad6d3ad5232de + - type: docker + run: + - VERSION=$(ls -v /var/bd_rhapsody_cwl | grep '^v' | sed 's#v##' | tail -1) + - 'echo "bdgenomics/rhapsody: \"$VERSION\"" > /var/software_versions.txt' runners: - type: executable diff --git a/src/bd_rhapsody/bd_rhapsody_make_reference/make_rhap_reference_2.2.1_nodocker.cwl b/src/bd_rhapsody/bd_rhapsody_make_reference/make_rhap_reference_2.2.1_nodocker.cwl deleted file mode 100644 index fead2c02..00000000 --- a/src/bd_rhapsody/bd_rhapsody_make_reference/make_rhap_reference_2.2.1_nodocker.cwl +++ /dev/null @@ -1,115 +0,0 @@ -requirements: - InlineJavascriptRequirement: {} -class: CommandLineTool -label: Reference Files Generator for BD Rhapsodyâ„¢ Sequencing Analysis Pipeline -cwlVersion: v1.2 -doc: >- - The Reference Files Generator creates an archive containing Genome Index and Transcriptome annotation files needed for the BD Rhapsodyâ„¢ Sequencing Analysis Pipeline. 
The app takes as input one or more FASTA and GTF files and produces a compressed archive in the form of a tar.gz file. The archive contains:\n - STAR index\n - Filtered GTF file - - -baseCommand: run_reference_generator.sh -inputs: - Genome_fasta: - type: File[] - label: Reference Genome - doc: |- - Reference genome file in FASTA format. The BD Rhapsodyâ„¢ Sequencing Analysis Pipeline uses GRCh38 for Human and GRCm39 for Mouse. - inputBinding: - prefix: --reference-genome - shellQuote: false - Gtf: - type: File[] - label: Transcript Annotations - doc: |- - Transcript annotation files in GTF format. The BD Rhapsodyâ„¢ Sequencing Analysis Pipeline uses Gencode v42 for Human and M31 for Mouse. - inputBinding: - prefix: --gtf - shellQuote: false - Extra_sequences: - type: File[]? - label: Extra Sequences - doc: |- - Additional sequences in FASTA format to use when building the STAR index. (E.g. phiX genome) - inputBinding: - prefix: --extra-sequences - shellQuote: false - Mitochondrial_Contigs: - type: string[]? - default: ["chrM", "chrMT", "M", "MT"] - label: Mitochondrial Contig Names - doc: |- - Names of the Mitochondrial contigs in the provided Reference Genome. Fragments originating from contigs other than these are identified as 'nuclear fragments' in the ATACseq analysis pipeline. - inputBinding: - prefix: --mitochondrial-contigs - shellQuote: false - Filtering_off: - type: boolean? - label: Turn off filtering - doc: |- - By default the input Transcript Annotation files are filtered based on the gene_type/gene_biotype attribute. 
Only features having the following attribute values are are kept: - - protein_coding - - lncRNA (lincRNA and antisense for Gencode < v31/M22/Ensembl97) - - IG_LV_gene - - IG_V_gene - - IG_V_pseudogene - - IG_D_gene - - IG_J_gene - - IG_J_pseudogene - - IG_C_gene - - IG_C_pseudogene - - TR_V_gene - - TR_V_pseudogene - - TR_D_gene - - TR_J_gene - - TR_J_pseudogene - - TR_C_gene - If you have already pre-filtered the input Annotation files and/or wish to turn-off the filtering, please set this option to True. - inputBinding: - prefix: --filtering-off - shellQuote: false - WTA_Only: - type: boolean? - label: WTA only index - doc: Build a WTA only index, otherwise builds a WTA + ATAC index. - inputBinding: - prefix: --wta-only-index - shellQuote: false - Archive_prefix: - type: string? - label: Archive Prefix - doc: |- - A prefix for naming the compressed archive file containing the Reference genome index and annotation files. The default value is constructed based on the input Reference files. - inputBinding: - prefix: --archive-prefix - shellQuote: false - Extra_STAR_params: - type: string? - label: Extra STAR Params - doc: |- - Additional parameters to pass to STAR when building the genome index. Specify exactly like how you would on the command line. - Example: - --limitGenomeGenerateRAM 48000 --genomeSAindexNbases 11 - inputBinding: - prefix: --extra-star-params - shellQuote: true - - Maximum_threads: - type: int? - label: Maximum Number of Threads - doc: |- - The maximum number of threads to use in the pipeline. By default, all available cores are used. - inputBinding: - prefix: --maximum-threads - shellQuote: false - -outputs: - - Archive: - type: File - doc: |- - A Compressed archive containing the Reference Genome Index and annotation GTF files. This archive is meant to be used as an input in the BD Rhapsodyâ„¢ Sequencing Analysis Pipeline. 
- id: Reference_Archive - label: Reference Files Archive - outputBinding: - glob: '*.tar.gz' - diff --git a/src/bd_rhapsody/bd_rhapsody_make_reference/script.py b/src/bd_rhapsody/bd_rhapsody_make_reference/script.py index ca635508..dcbfe933 100644 --- a/src/bd_rhapsody/bd_rhapsody_make_reference/script.py +++ b/src/bd_rhapsody/bd_rhapsody_make_reference/script.py @@ -83,21 +83,21 @@ def generate_config(par: dict[str, Any], meta, config) -> str: for config_key, arg_type, par_value in config_key_value_pairs: if arg_type == "file": - str = strip_margin(f"""\ + content = strip_margin(f"""\ |{config_key}: |""") if isinstance(par_value, list): for file in par_value: - str += strip_margin(f"""\ + content += strip_margin(f"""\ | - class: File | location: "{file}" |""") else: - str += strip_margin(f"""\ + content += strip_margin(f"""\ | class: File | location: "{par_value}" |""") - content_list.append(str) + content_list.append(content) else: content_list.append(strip_margin(f"""\ |{config_key}: {par_value} @@ -108,9 +108,9 @@ def generate_config(par: dict[str, Any], meta, config) -> str: def get_cwl_file(meta: dict[str, Any]) -> str: # create cwl file (if need be) - cwl_file=os.path.join(meta["resources_dir"], "make_rhap_reference_2.2.1_nodocker.cwl") + cwl_file="/var/bd_rhapsody_cwl/v2.2.1/Extra_Utilities/make_rhap_reference_2.2.1.cwl" - return cwl_file + return os.path.abspath(cwl_file) def main(par: dict[str, Any], meta: dict[str, Any]): config = read_config(meta["config"]) diff --git a/src/bd_rhapsody/bd_rhapsody_make_reference/test_data/script.sh b/src/bd_rhapsody/bd_rhapsody_make_reference/test_data/script.sh deleted file mode 100644 index 8d468064..00000000 --- a/src/bd_rhapsody/bd_rhapsody_make_reference/test_data/script.sh +++ /dev/null @@ -1,47 +0,0 @@ -#!/bin/bash - -TMP_DIR=/tmp/bd_rhapsody_make_reference -OUT_DIR=src/bd_rhapsody/bd_rhapsody_make_reference/test_data - -# check if seqkit is installed -if ! 
command -v seqkit &> /dev/null; then - echo "seqkit could not be found" - exit 1 -fi - -# create temporary directory and clean up on exit -mkdir -p $TMP_DIR -function clean_up { - rm -rf "$TMP_DIR" -} -trap clean_up EXIT - -# fetch reference -ORIG_FA=$TMP_DIR/reference.fa.gz -if [ ! -f $ORIG_FA ]; then - wget https://ftp.ebi.ac.uk/pub/databases/gencode/Gencode_human/release_41/GRCh38.primary_assembly.genome.fa.gz \ - -O $ORIG_FA -fi - -ORIG_GTF=$TMP_DIR/reference.gtf.gz -if [ ! -f $ORIG_GTF ]; then - wget https://ftp.ebi.ac.uk/pub/databases/gencode/Gencode_human/release_41/gencode.v41.annotation.gtf.gz \ - -O $ORIG_GTF -fi - -# create small reference -START=30000 -END=31500 -CHR=chr1 - -# subset to small region -seqkit grep -r -p "^$CHR\$" "$ORIG_FA" | \ - seqkit subseq -r "$START:$END" > $OUT_DIR/reference_small.fa - -zcat "$ORIG_GTF" | \ - awk -v FS='\t' -v OFS='\t' " - \$1 == \"$CHR\" && \$4 >= $START && \$5 <= $END { - \$4 = \$4 - $START + 1; - \$5 = \$5 - $START + 1; - print; - }" > $OUT_DIR/reference_small.gtf diff --git a/src/bd_rhapsody/bd_rhapsody_sequence_analysis/_process_cwl.R b/src/bd_rhapsody/bd_rhapsody_sequence_analysis/_process_cwl.R new file mode 100644 index 00000000..e33b8ea7 --- /dev/null +++ b/src/bd_rhapsody/bd_rhapsody_sequence_analysis/_process_cwl.R @@ -0,0 +1,116 @@ +# Extract arguments from CWL file and write them to arguments.yaml +# +# This script: +# - reads the CWL file +# - extracts the main workflow arguments +# - compares cwl arguments to viash config arguments +# - writes the arguments to arguments.yaml +# +# It can be used to update the arguments in the viash config after an +# update to the CWL file has been made. 
+# +# Dependencies: tidyverse, jsonlite, yaml, dynutils +# +# Install dependencies: +# ```R +# install.packages(c("tidyverse", "jsonlite", "yaml", "dynutils")) +# ``` +# +# Usage: +# ```bash +# Rscript src/bd_rhapsody/bd_rhapsody_sequence_analysis/_process_cwl.R +# ``` + +library(tidyverse) + +# fetch and read cwl file +lines <- read_lines("https://bitbucket.org/CRSwDev/cwl/raw/8feeace1141b24749ea6003f8e6ad6d3ad5232de/v2.2.1/rhapsody_pipeline_2.2.1.cwl") +cwl_header <- lines[[1]] +cwl_obj <- jsonlite::fromJSON(lines[-1], simplifyVector = FALSE) + +# detect main workflow arguments +gr <- dynutils::list_as_tibble(cwl_obj$`$graph`) + +gr %>% print(n = 100) + +main <- gr %>% filter(gr$id == "#main") + +main_inputs <- main$inputs[[1]] + +input_ids <- main_inputs %>% map_chr("id") %>% gsub("^#main/", "", .) + +# check whether in config +config <- yaml::read_yaml("src/bd_rhapsody/bd_rhapsody_sequence_analysis/config.vsh.yaml") +config$all_arguments <- config$argument_groups %>% map("arguments") %>% list_flatten() +arg_names <- config$all_arguments %>% map_chr("name") %>% gsub("^--", "", .) + +# arguments in cwl but not in config +setdiff(tolower(input_ids), arg_names) + +# arguments in config but not in cwl +setdiff(arg_names, tolower(input_ids)) + +# create arguments from main_inputs +arguments <- map(main_inputs, function(main_input) { + input_id <- main_input$id %>% gsub("^#main/", "", .) + input_type <- main_input$type[[2]] + + if (is.list(input_type) && input_type$type == "array") { + multiple <- TRUE + input_type <- input_type$items + } else { + multiple <- FALSE + } + + if (is.list(input_type) && input_type$type == "enum") { + choices <- input_type$symbols %>% + gsub(paste0(input_type$name, "/"), "", .) + input_type <- "enum" + } else { + choices <- NULL + } + + description <- + if (is.null(main_input$label)) { + main_input$doc + } else if (is.null(main_input$doc)) { + main_input$label + } else { + paste0(main_input$label, ". 
", main_input$doc) + } + + type_map <- c( + "float" = "double", + "int" = "integer", + "string" = "string", + "boolean" = "boolean", + "File" = "file", + "enum" = "string" + ) + + out <- list( + name = paste0("--", tolower(input_id)), + type = type_map[input_type], + # TODO: use summary when viash 0.9 is released + # summary = main_input$doc, + # description = main_input$doc, + description = description, + multiple = multiple, + choices = choices, + info = list( + config_key = input_id + ) + ) + + out[!sapply(out, is.null)] +}) + + + +yaml::write_yaml( + arguments, + "src/bd_rhapsody/bd_rhapsody_sequence_analysis/arguments.yaml", + handlers = list( + logical = yaml::verbatim_logical + ) +) diff --git a/src/bd_rhapsody/bd_rhapsody_sequence_analysis/config.vsh.yaml b/src/bd_rhapsody/bd_rhapsody_sequence_analysis/config.vsh.yaml new file mode 100644 index 00000000..eb3eaf38 --- /dev/null +++ b/src/bd_rhapsody/bd_rhapsody_sequence_analysis/config.vsh.yaml @@ -0,0 +1,661 @@ +name: bd_rhapsody_sequence_analysis +namespace: bd_rhapsody +description: | + BD Rhapsody Sequence Analysis CWL pipeline v2.2. + + This pipeline performs analysis of single-cell multiomic sequence read (FASTQ) data. The supported + sequencing libraries are those generated by the BD Rhapsody™ assay kits, including: Whole Transcriptome + mRNA (WTA), Targeted mRNA, AbSeq Antibody-Oligonucleotides (ABC), Single-Cell Multiplexing (SMK), + TCR/BCR (VDJ), and ATAC-Seq. 
+keywords: [rna-seq, single-cell, multiomic, atac-seq, targeted, abseq, tcr, bcr] +links: + repository: https://bitbucket.org/CRSwDev/cwl/src/master/v2.2.1 + documentation: https://bd-rhapsody-bioinfo-docs.genomics.bd.com +license: Unknown +authors: + - __merge__: /src/_authors/robrecht_cannoodt.yaml + roles: [ author, maintainer ] + - __merge__: /src/_authors/weiwei_schultz.yaml + roles: [ contributor ] + +argument_groups: + - name: Inputs + arguments: + - name: "--reads" + type: file + description: | + Reads (optional) - Path to your FASTQ.GZ formatted read files from libraries that may include: + + - WTA mRNA + - Targeted mRNA + - AbSeq + - Sample Multiplexing + - VDJ + + You may specify as many R1/R2 read pairs as you want. + required: false + multiple: true + example: + - WTALibrary_S1_L001_R1_001.fastq.gz + - WTALibrary_S1_L001_R2_001.fastq.gz + info: + config_key: Reads + - name: "--reads_atac" + type: file + description: | + Path to your FASTQ.GZ formatted read files from ATAC-Seq libraries. + You may specify as many R1/R2/I2 files as you want. + required: false + multiple: true + example: + - ATACLibrary_S2_L001_R1_001.fastq.gz + - ATACLibrary_S2_L001_R2_001.fastq.gz + - ATACLibrary_S2_L001_I2_001.fastq.gz + info: + config_key: Reads_ATAC + - name: References + description: | + Assay type will be inferred from the provided reference(s). + Do not provide both reference_archive and targeted_reference at the same time. 
+ + Valid reference input combinations: + - reference_archive: WTA only + - reference_archive & abseq_reference: WTA + AbSeq + - reference_archive & supplemental_reference: WTA + extra transgenes + - reference_archive & abseq_reference & supplemental_reference: WTA + AbSeq + extra transgenes + - reference_archive: WTA + ATAC or ATAC only + - reference_archive & supplemental_reference: WTA + ATAC + extra transgenes + - targeted_reference: Targeted only + - targeted_reference & abseq_reference: Targeted + AbSeq + - abseq_reference: AbSeq only + + The reference_archive can be generated with the bd_rhapsody_make_reference component. + Alternatively, BD also provides standard references which can be downloaded from these locations: + + - Human: https://bd-rhapsody-public.s3.amazonaws.com/Rhapsody-WTA/Pipeline-version2.x_WTA_references/RhapRef_Human_WTA_2023-02.tar.gz + - Mouse: https://bd-rhapsody-public.s3.amazonaws.com/Rhapsody-WTA/Pipeline-version2.x_WTA_references/RhapRef_Mouse_WTA_2023-02.tar.gz + arguments: + - name: "--reference_archive" + type: file + description: | + Path to Rhapsody WTA Reference in the tar.gz format. + + Structure of the reference archive: + + - `BD_Rhapsody_Reference_Files/`: top level folder + - `star_index/`: sub-folder containing STAR index, that is files created with `STAR --runMode genomeGenerate` + - GTF for gene-transcript-annotation e.g. "gencode.v43.primary_assembly.annotation.gtf" + example: "RhapRef_Human_WTA_2023-02.tar.gz" + required: false + info: + config_key: Reference_Archive + - name: "--targeted_reference" + type: file + description: | + Path to the targeted reference file in FASTA format. + example: "BD_Rhapsody_Immune_Response_Panel_Hs.fasta" + multiple: true + info: + config_key: Targeted_Reference + - name: "--abseq_reference" + type: file + description: Path to the AbSeq reference file in FASTA format. Only needed if BD AbSeq Ab-Oligos are used. 
+ example: "AbSeq_reference.fasta" + multiple: true + info: + config_key: AbSeq_Reference + - name: "--supplemental_reference" + type: file + alternatives: [-s] + description: Path to the supplemental reference file in FASTA format. Only needed if there are additional transgene sequences to be aligned against in a WTA assay experiment. + example: "supplemental_reference.fasta" + multiple: true + info: + config_key: Supplemental_Reference + - name: Outputs + description: Outputs for all pipeline runs + # based on https://bd-rhapsody-bioinfo-docs.genomics.bd.com/outputs/top_outputs.html + arguments: + - name: "--output_dir" + type: file + direction: output + alternatives: [-o] + description: "The unprocessed output directory containing all the outputs from the pipeline." + required: true + example: output_dir/ + - name: "--output_seurat" + type: file + direction: output + description: "Single-cell analysis tool inputs. Seurat (.rds) input file containing RSEC molecules data table and all cell annotation metadata." + example: output_seurat.rds + required: false + info: + template: "[sample_name]_Seurat.rds" + - name: "--output_mudata" + type: file + direction: output + description: "Single-cell analysis tool inputs. Scanpy / Muon input file containing RSEC molecules data table and all cell annotation metadata." + example: output_mudata.h5mu + required: false + info: + template: "[sample_name].h5mu" + - name: "--metrics_summary" + type: file + direction: output + description: "Metrics Summary. Report containing sequencing, molecules, and cell metrics." + example: metrics_summary.csv + required: false + info: + template: "[sample_name]_Metrics_Summary.csv" + - name: "--pipeline_report" + type: file + direction: output + description: "Pipeline Report. Summary report containing the results from the sequencing analysis pipeline run." 
+ example: pipeline_report.html + required: false + info: + template: "[sample_name]_Pipeline_Report.html" + - name: "--rsec_mols_per_cell" + type: file + direction: output + description: "Molecules per bioproduct per cell based on RSEC" + example: RSEC_MolsPerCell_MEX.zip + required: false + info: + template: "[sample_name]_RSEC_MolsPerCell_MEX.zip" + - name: "--dbec_mols_per_cell" + type: file + direction: output + description: "Molecules per bioproduct per cell based on DBEC. DBEC data table is only output if the experiment includes targeted mRNA or AbSeq bioproducts." + example: DBEC_MolsPerCell_MEX.zip + required: false + info: + template: "[sample_name]_DBEC_MolsPerCell_MEX.zip" + - name: "--rsec_mols_per_cell_unfiltered" + type: file + direction: output + description: "Unfiltered tables containing all cell labels with ≥10 reads." + example: RSEC_MolsPerCell_Unfiltered_MEX.zip + required: false + info: + template: "[sample_name]_RSEC_MolsPerCell_Unfiltered_MEX.zip" + - name: "--bam" + type: file + direction: output + description: "Alignment file of R2 with associated R1 annotations for Bioproduct." + example: BioProduct.bam + required: false + info: + template: "[sample_name]_Bioproduct.bam" + - name: "--bam_index" + type: file + direction: output + description: "Index file for the alignment file." + example: BioProduct.bam.bai + required: false + info: + template: "[sample_name]_Bioproduct.bam.bai" + - name: "--bioproduct_stats" + type: file + direction: output + description: "Bioproduct Stats. Metrics from RSEC and DBEC Unique Molecular Identifier adjustment algorithms on a per-bioproduct basis." 
+ example: Bioproduct_Stats.csv + required: false + info: + template: "[sample_name]_Bioproduct_Stats.csv" + - name: "--dimred_tsne" + type: file + direction: output + description: "t-SNE dimensionality reduction coordinates per cell index" + example: tSNE_coordinates.csv + required: false + info: + template: "[sample_name]_(assay)_tSNE_coordinates.csv" + - name: "--dimred_umap" + type: file + direction: output + description: "UMAP dimensionality reduction coordinates per cell index" + example: UMAP_coordinates.csv + required: false + info: + template: "[sample_name]_(assay)_UMAP_coordinates.csv" + - name: "--immune_cell_classification" + type: file + direction: output + description: "Immune Cell Classification. Cell type classification based on the expression of immune cell markers." + example: Immune_Cell_Classification.csv + required: false + info: + template: "[sample_name]_(assay)_cell_type_experimental.csv" + - name: Multiplex outputs + description: Outputs when multiplex option is selected + arguments: + - name: "--sample_tag_metrics" + type: file + direction: output + description: "Sample Tag Metrics. Metrics from the sample determination algorithm." + example: Sample_Tag_Metrics.csv + required: false + info: + template: "[sample_name]_Sample_Tag_Metrics.csv" + - name: "--sample_tag_calls" + type: file + direction: output + description: "Sample Tag Calls. Assigned Sample Tag for each putative cell" + example: Sample_Tag_Calls.csv + required: false + info: + template: "[sample_name]_Sample_Tag_Calls.csv" + - name: "--sample_tag_counts" + type: file + direction: output + description: "Sample Tag Counts. Separate data tables and metric summary for cells assigned to each sample tag. Note: For putative cells that could not be assigned a specific Sample Tag, a Multiplet_and_Undetermined.zip file is also output." 
+ example: Sample_Tag1.zip + required: false + multiple: true + info: + template: "[sample_name]_Sample_Tag[number].zip" + - name: "--sample_tag_counts_unassigned" + type: file + direction: output + description: "Sample Tag Counts Unassigned. Data table and metric summary for cells that could not be assigned a specific Sample Tag." + example: Multiplet_and_Undetermined.zip + required: false + info: + template: "[sample_name]_Multiplet_and_Undetermined.zip" + - name: VDJ Outputs + description: Outputs when VDJ option selected + arguments: + - name: "--vdj_metrics" + type: file + direction: output + description: "VDJ Metrics. Overall metrics from the VDJ analysis." + example: VDJ_Metrics.csv + required: false + info: + template: "[sample_name]_VDJ_Metrics.csv" + - name: "--vdj_per_cell" + type: file + direction: output + description: "VDJ Per Cell. Cell specific read and molecule counts, VDJ gene segments, CDR3 sequences, paired chains, and cell type." + example: VDJ_perCell.csv + required: false + info: + template: "[sample_name]_VDJ_perCell.csv" + - name: "--vdj_per_cell_uncorrected" + type: file + direction: output + description: "VDJ Per Cell Uncorrected. Cell specific read and molecule counts, VDJ gene segments, CDR3 sequences, paired chains, and cell type." + example: VDJ_perCell_uncorrected.csv + required: false + info: + template: "[sample_name]_VDJ_perCell_uncorrected.csv" + - name: "--vdj_dominant_contigs" + type: file + direction: output + description: "VDJ Dominant Contigs. Dominant contig for each cell label chain type combination (putative cells only)." + example: VDJ_Dominant_Contigs_AIRR.csv + required: false + info: + template: "[sample_name]_VDJ_Dominant_Contigs_AIRR.csv" + - name: "--vdj_unfiltered_contigs" + type: file + direction: output + description: "VDJ Unfiltered Contigs. All contigs that were assembled and annotated successfully (all cells)." 
+ example: VDJ_Unfiltered_Contigs_AIRR.csv + required: false + info: + template: "[sample_name]_VDJ_Unfiltered_Contigs_AIRR.csv" + - name: "ATAC-Seq outputs" + description: Outputs when ATAC-Seq option selected + arguments: + - name: "--atac_metrics" + type: file + direction: output + description: "ATAC Metrics. Overall metrics from the ATAC-Seq analysis." + example: ATAC_Metrics.csv + required: false + info: + template: "[sample_name]_ATAC_Metrics.csv" + - name: "--atac_metrics_json" + type: file + direction: output + description: "ATAC Metrics JSON. Overall metrics from the ATAC-Seq analysis in JSON format." + example: ATAC_Metrics.json + required: false + info: + template: "[sample_name]_ATAC_Metrics.json" + - name: "--atac_fragments" + type: file + direction: output + description: "ATAC Fragments. Chromosomal location, cell index, and read support for each fragment detected" + example: ATAC_Fragments.bed.gz + required: false + info: + template: "[sample_name]_ATAC_Fragments.bed.gz" + - name: "--atac_fragments_index" + type: file + direction: output + description: "Index of ATAC Fragments." + example: ATAC_Fragments.bed.gz.tbi + required: false + info: + template: "[sample_name]_ATAC_Fragments.bed.gz.tbi" + - name: "--atac_transposase_sites" + type: file + direction: output + description: "ATAC Transposase Sites. Chromosomal location, cell index, and read support for each transposase site detected" + example: ATAC_Transposase_Sites.bed.gz + required: false + info: + template: "[sample_name]_ATAC_Transposase_Sites.bed.gz" + - name: "--atac_transposase_sites_index" + type: file + direction: output + description: "Index of ATAC Transposase Sites." + example: ATAC_Transposase_Sites.bed.gz.tbi + required: false + info: + template: "[sample_name]_ATAC_Transposase_Sites.bed.gz.tbi" + - name: "--atac_peaks" + type: file + direction: output + description: "ATAC Peaks. 
Peak regions of transposase activity" + example: ATAC_Peaks.bed.gz + required: false + info: + template: "[sample_name]_ATAC_Peaks.bed.gz" + - name: "--atac_peaks_index" + type: file + direction: output + description: "Index of ATAC Peaks." + example: ATAC_Peaks.bed.gz.tbi + required: false + info: + template: "[sample_name]_ATAC_Peaks.bed.gz.tbi" + - name: "--atac_peak_annotation" + type: file + direction: output + description: "ATAC Peak Annotation. Estimated annotation of peak-to-gene connections" + example: peak_annotation.tsv.gz + required: false + info: + template: "[sample_name]_peak_annotation.tsv.gz" + - name: "--atac_cell_by_peak" + type: file + direction: output + description: "ATAC Cell by Peak. Peak regions of transposase activity per cell" + example: ATAC_Cell_by_Peak_MEX.zip + required: false + info: + template: "[sample_name]_ATAC_Cell_by_Peak_MEX.zip" + - name: "--atac_cell_by_peak_unfiltered" + type: file + direction: output + description: "ATAC Cell by Peak Unfiltered. Unfiltered file containing all cell labels with >=1 transposase sites in peaks." + example: ATAC_Cell_by_Peak_Unfiltered_MEX.zip + required: false + info: + template: "[sample_name]_ATAC_Cell_by_Peak_Unfiltered_MEX.zip" + - name: "--atac_bam" + type: file + direction: output + description: "ATAC BAM. Alignment file for R1 and R2 with associated I2 annotations for ATAC-Seq. Only output if the BAM generation flag is set to true." + example: ATAC.bam + required: false + info: + template: "[sample_name]_ATAC.bam" + - name: "--atac_bam_index" + type: file + direction: output + description: "Index of ATAC BAM." 
+ example: ATAC.bam.bai + required: false + info: + template: "[sample_name]_ATAC.bam.bai" + - name: AbSeq Cell Calling outputs + description: Outputs when Cell Calling Abseq is selected + arguments: + - name: "--protein_aggregates_experimental" + type: file + direction: output + description: "Protein Aggregates Experimental" + example: Protein_Aggregates_Experimental.csv + required: false + info: + template: "[sample_name]_Protein_Aggregates_Experimental.csv" + - name: Putative Cell Calling Settings + arguments: + - name: "--cell_calling_data" + type: string + description: | + Specify the dataset to be used for putative cell calling: mRNA, AbSeq, ATAC, mRNA_and_ATAC + + For putative cell calling using an AbSeq dataset, please provide an AbSeq_Reference fasta file above. + + For putative cell calling using an ATAC dataset, please provide a WTA+ATAC-Seq Reference_Archive file above. + + The default data for putative cell calling, will be determined the following way: + + - If mRNA Reads and ATAC Reads exist: mRNA_and_ATAC + - If only ATAC Reads exist: ATAC + - Otherwise: mRNA + choices: [mRNA, AbSeq, ATAC, mRNA_and_ATAC] + example: mRNA + info: + config_key: Cell_Calling_Data + - name: "--cell_calling_bioproduct_algorithm" + type: string + description: | + Specify the bioproduct algorithm to be used for putative cell calling: Basic or Refined + + By default, the Basic algorithm will be used for putative cell calling. + choices: [Basic, Refined] + example: Basic + info: + config_key: Cell_Calling_Bioproduct_Algorithm + - name: "--cell_calling_atac_algorithm" + type: string + description: | + Specify the ATAC-seq algorithm to be used for putative cell calling: Basic or Refined + + By default, the Basic algorithm will be used for putative cell calling. 
+ choices: [Basic, Refined] + example: Basic + info: + config_key: Cell_Calling_ATAC_Algorithm + - name: "--exact_cell_count" + type: integer + description: | + Set a specific number (>=1) of cells as putative, based on those with the highest error-corrected read count + example: 10000 + min: 1 + info: + config_key: Exact_Cell_Count + - name: "--expected_cell_count" + type: integer + description: | + Guide the basic putative cell calling algorithm by providing an estimate of the number of cells expected. Usually this can be the number of cells loaded into the Rhapsody cartridge. If there are multiple inflection points on the second derivative cumulative curve, this will ensure the one selected is near the expected. + example: 20000 + min: 1 + info: + config_key: Expected_Cell_Count + - name: Intronic Reads Settings + arguments: + - name: --exclude_intronic_reads + type: boolean + description: | + By default, the flag is false, and reads aligned to exons and introns are considered and represented in molecule counts. When the flag is set to true, intronic reads will be excluded. + The value can be true or false. + example: false + info: + config_key: Exclude_Intronic_Reads + - name: Multiplex Settings + arguments: + - name: "--sample_tags_version" + type: string + description: | + Specify the version of the Sample Tags used in the run: + + * If Sample Tag Multiplexing was done, specify the appropriate version: human, mouse, flex, nuclei_includes_mrna, nuclei_atac_only + * If this is an SMK + Nuclei mRNA run or an SMK + Multiomic ATAC-Seq (WTA+ATAC-Seq) run (and not an SMK + ATAC-Seq only run), choose the "nuclei_includes_mrna" option. + * If this is an SMK + ATAC-Seq only run (and not SMK + Multiomic ATAC-Seq (WTA+ATAC-Seq)), choose the "nuclei_atac_only" option. 
+ choices: [human, mouse, flex, nuclei_includes_mrna, nuclei_atac_only] + example: human + info: + config_key: Sample_Tags_Version + - name: "--tag_names" + type: string + description: | + Specify the tag number followed by '-' and the desired sample name to appear in Sample_Tag_Metrics.csv + Do not use the special characters: &, (), [], {}, <>, ?, | + multiple: true + example: [4-mySample, 9-myOtherSample, 6-alsoThisSample] + info: + config_key: Tag_Names + - name: VDJ arguments + arguments: + - name: "--vdj_version" + type: string + description: | + If VDJ was done, specify the appropriate option: human, mouse, humanBCR, humanTCR, mouseBCR, mouseTCR + choices: [human, mouse, humanBCR, humanTCR, mouseBCR, mouseTCR] + example: human + info: + config_key: VDJ_Version + - name: ATAC options + arguments: + - name: "--predefined_atac_peaks" + type: file + description: An optional BED file containing pre-established chromatin accessibility peak regions for generating the ATAC cell-by-peak matrix. + example: predefined_peaks.bed + info: + config_key: Predefined_ATAC_Peaks + - name: Additional options + arguments: + - name: "--run_name" + type: string + description: | + Specify a run name to use as the output file base name. Use only letters, numbers, or hyphens. Do not use special characters or spaces. + default: sample + info: + config_key: Run_Name + - name: "--generate_bam" + type: boolean + description: | + Specify whether to create the BAM file output + default: false + info: + config_key: Generate_Bam + - name: "--long_reads" + type: boolean + description: | + Use STARlong (default: undefined - i.e. autodetects based on read lengths) - Specify if the STARlong aligner should be used instead of STAR. Set to true if the reads are longer than 650bp. 
+ info: + config_key: Long_Reads + - name: Advanced options + description: | + NOTE: Only change these if you are really sure about what you are doing + arguments: + - name: "--custom_star_params" + type: string + description: | + Modify STAR alignment parameters - Set this parameter to fully override default STAR mapping parameters used in the pipeline. + For reference this is the default that is used: + + Short Reads: `--outFilterScoreMinOverLread 0 --outFilterMatchNminOverLread 0 --outFilterMultimapScoreRange 0 --clip3pAdapterSeq AAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAA --seedSearchStartLmax 50 --outFilterMatchNmin 25 --limitOutSJcollapsed 2000000` + Long Reads: Same as Short Reads + `--seedPerReadNmax 10000` + + This applies to fastqs provided in the Reads user input + Do NOT set any non-mapping related params like `--genomeDir`, `--outSAMtype`, `--outSAMunmapped`, `--readFilesIn`, `--runThreadN`, etc. + We use STAR version 2.7.10b + example: "--alignIntronMax 6000 --outFilterScoreMinOverLread 0.1 --limitOutSJcollapsed 2000000" + info: + config_key: Custom_STAR_Params + - name: "--custom_bwa_mem2_params" + type: string + description: | + Modify bwa-mem2 alignment parameters - Set this parameter to fully override bwa-mem2 mapping parameters used in the pipeline + The pipeline does not specify any custom mapping params to bwa-mem2 so program default values are used + This applies to fastqs provided in the Reads_ATAC user input + Do NOT set any non-mapping related params like `-C`, `-t`, etc. + We use bwa-mem2 version 2.2.1 + example: "-k 16 -w 200 -r" + info: + config_key: Custom_bwa_mem2_Params + - name: CWL-runner arguments + arguments: + - name: "--parallel" + type: boolean + description: "Run jobs in parallel." + default: true + - name: "--timestamps" + type: boolean_true + description: "Add timestamps to the errors, warnings, and notifications." 
+ - name: Undocumented arguments + arguments: + - name: --abseq_umi + type: integer + multiple: false + info: + config_key: AbSeq_UMI + - name: --target_analysis + type: boolean + multiple: false + info: + config_key: Target_analysis + - name: --vdj_jgene_evalue + type: double + description: | + e-value threshold for J gene. The e-value threshold for J gene call by IgBlast/PyIR, default is set as 0.001 + multiple: false + info: + config_key: VDJ_JGene_Evalue + - name: --vdj_vgene_evalue + type: double + description: | + e-value threshold for V gene. The e-value threshold for V gene call by IgBlast/PyIR, default is set as 0.001 + multiple: false + info: + config_key: VDJ_VGene_Evalue + - name: --write_filtered_reads + type: boolean + multiple: false + info: + config_key: Write_Filtered_Reads +resources: + - type: python_script + path: script.py +test_resources: + - type: python_script + path: test.py + - path: ../test_data + - path: ../helpers + +requirements: + commands: [ "cwl-runner" ] + +engines: + - type: docker + image: bdgenomics/rhapsody:2.2.1 + setup: + - type: apt + packages: [procps, git] + - type: python + packages: [cwlref-runner, cwl-runner] + - type: docker + run: | + mkdir /var/bd_rhapsody_cwl && \ + cd /var/bd_rhapsody_cwl && \ + git clone https://bitbucket.org/CRSwDev/cwl.git . 
&& \ + git checkout 8feeace1141b24749ea6003f8e6ad6d3ad5232de + - type: docker + run: + - VERSION=$(ls -v /var/bd_rhapsody_cwl | grep '^v' | sed 's#v##' | tail -1) + - 'echo "bdgenomics/rhapsody: \"$VERSION\"" > /var/software_versions.txt' + test_setup: + - type: python + packages: [biopython, gffutils] +runners: + - type: executable + - type: nextflow diff --git a/src/bd_rhapsody/bd_rhapsody_sequence_analysis/help.txt b/src/bd_rhapsody/bd_rhapsody_sequence_analysis/help.txt new file mode 100644 index 00000000..618faa3e --- /dev/null +++ b/src/bd_rhapsody/bd_rhapsody_sequence_analysis/help.txt @@ -0,0 +1,167 @@ +```bash +cwl-runner src/bd_rhapsody/bd_rhapsody_sequence_analysis/rhapsody_pipeline_2.2.1_nodocker.cwl --help +``` + +usage: src/bd_rhapsody/bd_rhapsody_sequence_analysis/rhapsody_pipeline_2.2.1_nodocker.cwl + [-h] [--AbSeq_Reference ABSEQ_REFERENCE] [--AbSeq_UMI ABSEQ_UMI] + [--Cell_Calling_ATAC_Algorithm CELL_CALLING_ATAC_ALGORITHM] + [--Cell_Calling_Bioproduct_Algorithm CELL_CALLING_BIOPRODUCT_ALGORITHM] + [--Cell_Calling_Data CELL_CALLING_DATA] + [--Custom_STAR_Params CUSTOM_STAR_PARAMS] + [--Custom_bwa_mem2_Params CUSTOM_BWA_MEM2_PARAMS] + [--Exact_Cell_Count EXACT_CELL_COUNT] [--Exclude_Intronic_Reads] + [--Expected_Cell_Count EXPECTED_CELL_COUNT] [--Generate_Bam] + [--Long_Reads] [--Maximum_Threads MAXIMUM_THREADS] + [--Predefined_ATAC_Peaks PREDEFINED_ATAC_PEAKS] [--Reads READS] + [--Reads_ATAC READS_ATAC] [--Reference_Archive REFERENCE_ARCHIVE] + [--Run_Name RUN_NAME] [--Sample_Tags_Version SAMPLE_TAGS_VERSION] + [--Supplemental_Reference SUPPLEMENTAL_REFERENCE] + [--Tag_Names TAG_NAMES] [--Target_analysis] + [--Targeted_Reference TARGETED_REFERENCE] + [--VDJ_JGene_Evalue VDJ_JGENE_EVALUE] + [--VDJ_VGene_Evalue VDJ_VGENE_EVALUE] [--VDJ_Version VDJ_VERSION] + [--Write_Filtered_Reads] + [job_order] + +The BD Rhapsody™ assays are used to create sequencing libraries from single +cell transcriptomes. 
After sequencing, the analysis pipeline takes the FASTQ +files and a reference file for gene alignment. The pipeline generates +molecular counts per cell, read counts per cell, metrics, and an alignment +file. + +positional arguments: + job_order Job input json file + +options: + -h, --help show this help message and exit + --AbSeq_Reference ABSEQ_REFERENCE + AbSeq Reference + --AbSeq_UMI ABSEQ_UMI + --Cell_Calling_ATAC_Algorithm CELL_CALLING_ATAC_ALGORITHM + Specify the ATAC algorithm to be used for ATAC + putative cell calling. The Basic algorithm is the + default. + --Cell_Calling_Bioproduct_Algorithm CELL_CALLING_BIOPRODUCT_ALGORITHM + Specify the bioproduct algorithm to be used for + mRNA/AbSeq putative cell calling. The Basic algorithm + is the default. + --Cell_Calling_Data CELL_CALLING_DATA + Specify the data to be used for putative cell calling. + The default data for putative cell calling will be + determined the following way: - If mRNA and ATAC Reads + exist, mRNA_and_ATAC is the default. - If only ATAC + Reads exist, ATAC is the default. - Otherwise, mRNA is + the default. + --Custom_STAR_Params CUSTOM_STAR_PARAMS + Allows you to specify custom STAR aligner mapping + parameters. Only the mapping parameters you provide + here will be used with STAR, meaning that you must + provide the complete list of parameters that you want + to take effect. For reference, the parameters used by + default in the pipeline are: 1. Short Reads: + --outFilterScoreMinOverLread 0 + --outFilterMatchNminOverLread 0 + --outFilterMultimapScoreRange 0 --clip3pAdapterSeq + AAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAA + --seedSearchStartLmax 50 --outFilterMatchNmin 25 + --limitOutSJcollapsed 2000000 2. Long Reads: Same + options as short reads + --seedPerReadNmax 10000 + Example input: --alignIntronMax 500000 + --outFilterScoreMinOverLread 0 --limitOutSJcollapsed + 2000000 Important: 1. This applies to fastqs provided + in the Reads user input 2. 
Please do not specify any + non-mapping related params like: --runThreadN, + --genomeDir --outSAMtype, etc. 3. Please only use + params supported by STAR version 2.7.10b + --Custom_bwa_mem2_Params CUSTOM_BWA_MEM2_PARAMS + Allows you to specify custom bwa-mem2 mapping + parameters. Only the mapping parameters you provide + here will be used with bwa-mem2, meaning that you must + provide the complete list of parameters that you want + to take effect. The pipeline uses program default + mapping parameters. Example input: -k 15 -w 200 -r 2 + Important: 1. This applies to fastqs provided in the + Reads_ATAC user input 2. Please do not specify any + non-mapping related params like: -C, -t, etc. 3. + Please only use params supported by bwa-mem2 version + 2.2.1 + --Exact_Cell_Count EXACT_CELL_COUNT + Set a specific number (>=1) of cells as putative, + based on those with the highest error-corrected read + count + --Exclude_Intronic_Reads + By default, reads aligned to exons and introns are + considered and represented in molecule counts. + Including intronic reads may increase sensitivity, + resulting in an increase in molecule counts and the + number of genes per cell for both cellular and nuclei + samples. Intronic reads may indicate unspliced mRNAs + and are also useful, for example, in the study of + nuclei and RNA velocity. When set to true, intronic + reads will be excluded. + --Expected_Cell_Count EXPECTED_CELL_COUNT + Optional. Guide the basic putative cell calling + algorithm by providing an estimate of the number of + cells expected. Usually this can be the number of + cells loaded into the Rhapsody cartridge. If there are + multiple inflection points on the second derivative + cumulative curve, this will ensure the one selected is + near the expected. + --Generate_Bam Default: false. A Bam read alignment file contains + reads from all the input libraries, but creating it + can consume a lot of compute and disk resources. 
By + setting this field to true, the Bam file will be + created. This option is shared for both Bioproduct and + ATAC libraries. + --Long_Reads By default, we detect if there are any reads longer + than 650bp and then flag QualCLAlign to use STARlong + instead of STAR. This flag can be explicitly set if it + is known in advance that there are reads longer than + 650bp. + --Maximum_Threads MAXIMUM_THREADS + The maximum number of threads to use in the pipeline. + By default, all available cores are used. + --Predefined_ATAC_Peaks PREDEFINED_ATAC_PEAKS + An optional BED file containing pre-established + chromatin accessibility peak regions for generating + the ATAC cell-by-peak matrix. Only applies to ATAC + assays. + --Reads READS FASTQ files from libraries that may include WTA mRNA, + Targeted mRNA, AbSeq, Sample Multiplexing, and related + technologies + --Reads_ATAC READS_ATAC + FASTQ files from libraries generated using the ATAC + assay protocol. Each lane of a library is expected to + have 3 FASTQs - R1, R2 and I1/I2, where the index read + contains the Cell Barcode and UMI sequence. Only + applies to ATAC assays. + --Reference_Archive REFERENCE_ARCHIVE + Reference Files Archive + --Run_Name RUN_NAME This is a name for output files, for example + Experiment1_Metrics_Summary.csv. Default if left empty + is to name run based on a library. Any non-alpha + numeric characters will be changed to a hyphen. + --Sample_Tags_Version SAMPLE_TAGS_VERSION + The sample multiplexing kit version. This option + should only be set for a multiplexed experiment. + --Supplemental_Reference SUPPLEMENTAL_REFERENCE + Supplemental Reference + --Tag_Names TAG_NAMES + Specify the Sample Tag number followed by - (hyphen) + and a sample name to appear in the output files. For + example: 4-Ramos. Should be alpha numeric, with + - + and _ allowed. Any special characters: &, (), [], {}, + <>, ?, | will be corrected to underscores. 
+ --Target_analysis + --Targeted_Reference TARGETED_REFERENCE + Targeted Reference + --VDJ_JGene_Evalue VDJ_JGENE_EVALUE + The e-value threshold for J gene call by IgBlast/PyIR, + default is set as 0.001 + --VDJ_VGene_Evalue VDJ_VGENE_EVALUE + The e-value threshold for V gene call by IgBlast/PyIR, + default is set as 0.001 + --VDJ_Version VDJ_VERSION + The VDJ species and chain types. This option should + only be set for VDJ experiment. + --Write_Filtered_Reads diff --git a/src/bd_rhapsody/bd_rhapsody_sequence_analysis/pipeline_inputs_template_2.2.1.yaml b/src/bd_rhapsody/bd_rhapsody_sequence_analysis/pipeline_inputs_template_2.2.1.yaml new file mode 100644 index 00000000..19728a57 --- /dev/null +++ b/src/bd_rhapsody/bd_rhapsody_sequence_analysis/pipeline_inputs_template_2.2.1.yaml @@ -0,0 +1,203 @@ +#!/usr/bin/env cwl-runner + +cwl:tool: rhapsody + +# This is a template YML file used to specify the inputs for a BD Rhapsody Sequence Analysis pipeline run. +# See the BD Rhapsody Sequence Analysis Pipeline User Guide for more details. Enter the following information: + + +## Reads (optional) - Path to your FASTQ.GZ formatted read files from libraries that may include: +# - WTA mRNA +# - Targeted mRNA +# - AbSeq +# - Sample Multiplexing +# - VDJ +# You may specify as many R1/R2 read pairs as you want. +Reads: + + - class: File + location: "test/WTALibrary_S1_L001_R1_001.fastq.gz" + + - class: File + location: "test/WTALibrary_S1_L001_R2_001.fastq.gz" + +## Reads_ATAC (optional) - Path to your FASTQ.GZ formatted read files from ATAC-Seq libraries. +## You may specify as many R1/R2/I2 files as you want. 
+Reads_ATAC: + + - class: File + location: "test/ATACLibrary_S2_L001_R1_001.fastq.gz" + + - class: File + location: "test/ATACLibrary_S2_L001_R2_001.fastq.gz" + + - class: File + location: "test/ATACLibrary_S2_L001_I2_001.fastq.gz" + + +## Assay type will be inferred from the provided reference(s) +## Do not provide both Reference_Archive and Targeted_Reference at the same time +## +## Valid reference input combinations: +## WTA Reference_Archive (WTA only) +## WTA Reference_Archive + AbSeq_Reference (WTA + AbSeq) +## WTA Reference_Archive + Supplemental_Reference (WTA + extra transgenes) +## WTA Reference_Archive + AbSeq_Reference + Supplemental_Reference (WTA + AbSeq + extra transgenes) +## WTA+ATAC-Seq Reference_Archive (WTA + ATAC, ATAC only) +## WTA+ATAC-Seq Reference_Archive + Supplemental_Reference (WTA + ATAC + extra transgenes) +## Targeted_Reference (Targeted only) +## Targeted_Reference + AbSeq_Reference (Targeted + AbSeq) +## AbSeq_Reference (AbSeq only) + +## See the BD Rhapsody Sequence Analysis Pipeline User Guide for instructions on how to: +## - Obtain a pre-built Rhapsody Reference file +## - Create a custom Rhapsody Reference file + +## WTA Reference_Archive (required for WTA mRNA assay) - Path to Rhapsody WTA Reference in the tar.gz format. +## +## --Structure of reference archive-- +## BD_Rhapsody_Reference_Files/ # top level folder +## star_index/ # sub-folder containing STAR index +## [files created with STAR --runMode genomeGenerate] +## [GTF for gene-transcript-annotation e.g. "gencode.v43.primary_assembly.annotation.gtf"] +## +## WTA+ATAC-Seq Reference_Archive (required for ATAC-Seq or Multiomic ATAC-Seq (WTA+ATAC-Seq) assays) - Path to Rhapsody WTA+ATAC-Seq Reference in the tar.gz format. +## +## --Structure of reference archive-- +## BD_Rhapsody_Reference_Files/ # top level folder +## star_index/ # sub-folder containing STAR index +## [files created with STAR --runMode genomeGenerate] +## [GTF for gene-transcript-annotation e.g. 
"gencode.v43.primary_assembly.annotation.gtf"] +## +## mitochondrial_contigs.txt # mitochondrial contigs in the reference genome - one contig name per line. e.g. chrMT or chrM, etc. +## +## bwa-mem2_index/ # sub-folder containing bwa-mem2 index +## [files created with bwa-mem2 index] +## +Reference_Archive: + class: File + location: "test/RhapRef_Human_WTA_2023-02.tar.gz" +# location: "test/RhapRef_Human_WTA-ATAC_2023-08.tar.gz" + +## Targeted_Reference (required for Targeted mRNA assay) - Path to the targeted reference file in FASTA format. +#Targeted_Reference: +# - class: File +# location: "test/BD_Rhapsody_Immune_Response_Panel_Hs.fasta" + +## AbSeq_Reference (optional) - Path to the AbSeq reference file in FASTA format. Only needed if BD AbSeq Ab-Oligos are used. +## For putative cell calling using an AbSeq dataset, please provide an AbSeq reference fasta file as the AbSeq_Reference. +#AbSeq_Reference: +# - class: File +# location: "test/AbSeq_reference.fasta" + +## Supplemental_Reference (optional) - Path to the supplemental reference file in FASTA format. Only needed if there are additional transgene sequences to be aligned against in a WTA assay experiment +#Supplemental_Reference: +# - class: File +# location: "test/supplemental_reference.fasta" + +#################################### +## Putative Cell Calling Settings ## +#################################### + +## Putative cell calling dataset (optional) - Specify the dataset to be used for putative cell calling: mRNA, AbSeq, ATAC, mRNA_and_ATAC +## For putative cell calling using an AbSeq dataset, please provide an AbSeq_Reference fasta file above. +## For putative cell calling using an ATAC dataset, please provide a WTA+ATAC-Seq Reference_Archive file above. 
+## The default data for putative cell calling, will be determined the following way: +## If mRNA Reads and ATAC Reads exist: +## Cell_Calling_Data: mRNA_and_ATAC +## If only ATAC Reads exist: +## Cell_Calling_Data: ATAC +## Otherwise: +## Cell_Calling_Data: mRNA +#Cell_Calling_Data: mRNA + +## Putative cell calling bioproduct algorithm (optional) - Specify the bioproduct algorithm to be used for putative cell calling: Basic or Refined +## By default, the Basic algorithm will be used for putative cell calling. +#Cell_Calling_Bioproduct_Algorithm: Basic + +## Putative cell calling ATAC algorithm (optional) - Specify the ATAC-seq algorithm to be used for putative cell calling: Basic or Refined +## By default, the Basic algorithm will be used for putative cell calling. +#Cell_Calling_ATAC_Algorithm: Basic + +## Exact cell count (optional) - Set a specific number (>=1) of cells as putative, based on those with the highest error-corrected read count +#Exact_Cell_Count: 10000 + +## Expected Cell Count (optional) - Guide the basic putative cell calling algorithm by providing an estimate of the number of cells expected. Usually this can be the number of cells loaded into the Rhapsody cartridge. If there are multiple inflection points on the second derivative cumulative curve, this will ensure the one selected is near the expected. +#Expected_Cell_Count: 20000 + + +#################################### +## Intronic Reads Settings ## +#################################### + +## Exclude_Intronic_Reads (optional) +## By default, the flag is false, and reads aligned to exons and introns are considered and represented in molecule counts. When the flag is set to true, intronic reads will be excluded. +## The value can be true or false. 
+#Exclude_Intronic_Reads: true + +####################### +## Multiplex options ## +####################### + +## Sample Tags Version (optional) - If Sample Tag Multiplexing was done, specify the appropriate version: human, mouse, flex, nuclei_includes_mrna, nuclei_atac_only +## If this is an SMK + Nuclei mRNA run or an SMK + Multiomic ATAC-Seq (WTA+ATAC-Seq) run (and not an SMK + ATAC-Seq only run), choose the "nuclei_includes_mrna" option. +## If this is an SMK + ATAC-Seq only run (and not SMK + Multiomic ATAC-Seq (WTA+ATAC-Seq)), choose the "nuclei_atac_only" option. +#Sample_Tags_Version: human + +## Tag_Names (optional) - Specify the tag number followed by '-' and the desired sample name to appear in Sample_Tag_Metrics.csv +# Do not use the special characters: &, (), [], {}, <>, ?, | +#Tag_Names: [4-mySample, 9-myOtherSample, 6-alsoThisSample] + +################ +## VDJ option ## +################ + +## VDJ Version (optional) - If VDJ was done, specify the appropriate option: human, mouse, humanBCR, humanTCR, mouseBCR, mouseTCR +#VDJ_Version: human + +################## +## ATAC options ## +################## + +## Predefined ATAC Peaks - An optional BED file containing pre-established chromatin accessibility peak regions for generating the ATAC cell-by-peak matrix. +#Predefined_ATAC_Peaks: +# class: File +# location: "path/predefined_peaks.bed" + +######################## +## Additional Options ## +######################## + +## Run Name (optional)- Specify a run name to use as the output file base name. Use only letters, numbers, or hyphens. Do not use special characters or spaces. 
+#Run_Name: my-experiment + +## Generate Bam (optional, default: false) - Specify whether to create the BAM file output +#Generate_Bam: true + +## Maximum_Threads (integer, optional, default: [use all cores of CPU]) - Set the maximum number of threads to use in the read processing steps of the pipeline: QualCLAlign, AlignmentAnalysis, VDJ assembly +#Maximum_Threads: 16 + +## Use STARlong (optional, default: "auto" - i.e. autodetects based on read lengths) - Specify if the STARlong aligner should be used instead of STAR. Set to true if the reads are longer than 650bp. +## The value can be true or false. +#Long_Reads: true + +######################## +## Advanced Options ## +######################## +## NOTE: Only change these if you are really sure about what you are doing + +## Modify STAR alignment parameters - Set this parameter to fully override default STAR mapping parameters used in the pipeline. +## For reference this is the default that is used: +## Short Reads: --outFilterScoreMinOverLread 0 --outFilterMatchNminOverLread 0 --outFilterMultimapScoreRange 0 --clip3pAdapterSeq AAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAA --seedSearchStartLmax 50 --outFilterMatchNmin 25 --limitOutSJcollapsed 2000000 +## Long Reads: Same as Short Reads + --seedPerReadNmax 10000 +## This applies to fastqs provided in the Reads user input +## Do NOT set any non-mapping related params like --genomeDir, --outSAMtype, --outSAMunmapped, --readFilesIn, --runThreadN, etc. +## We use STAR version 2.7.10b +#Custom_STAR_Params: --alignIntronMax 6000 --outFilterScoreMinOverLread 0.1 --limitOutSJcollapsed 2000000 + +## Modify bwa-mem2 alignment parameters - Set this parameter to fully override bwa-mem2 mapping parameters used in the pipeline +## The pipeline does not specify any custom mapping params to bwa-mem2 so program default values are used +## This applies to fastqs provided in the Reads_ATAC user input +## Do NOT set any non-mapping related params like -C, -t, etc. 
+## We use bwa-mem2 version 2.2.1 +#Custom_bwa_mem2_Params: -k 16 -w 200 -r diff --git a/src/bd_rhapsody/bd_rhapsody_sequence_analysis/script.py b/src/bd_rhapsody/bd_rhapsody_sequence_analysis/script.py new file mode 100644 index 00000000..cbddf6bf --- /dev/null +++ b/src/bd_rhapsody/bd_rhapsody_sequence_analysis/script.py @@ -0,0 +1,243 @@ +import os +import re +import subprocess +import tempfile +from typing import Any +import yaml +import shutil +import glob + +## VIASH START +par = { + 'reads': [ + 'resources_test/bdrhap_5kjrt/raw/12ABC_S1_L432_R1_001_subset.fastq.gz', + 'resources_test/bdrhap_5kjrt/raw/12ABC_S1_L432_R2_001_subset.fastq.gz' + ], + 'reads_atac': None, + 'reference_archive': "resources_test/reference_gencodev41_chr1/reference_bd_rhapsody.tar.gz", + 'targeted_reference': [], + 'abseq_reference': [], + 'supplemental_reference': [], + 'output': 'output_dir', + 'cell_calling_data': None, + 'cell_calling_bioproduct_algorithm': None, + 'cell_calling_atac_algorithm': None, + 'exact_cell_count': None, + 'expected_cell_count': None, + 'exclude_intronic_reads': None, + 'sample_tags_version': None, + 'tag_names': [], + 'vdj_version': None, + 'predefined_atac_peaks': None, + 'run_name': "sample", + 'generate_bam': None, + 'alignment_star_params': None, + 'alignment_bwa_mem2_params': None, + 'parallel': True, + 'timestamps': False, + 'dryrun': False +} +meta = { + 'config': "target/nextflow/bd_rhaspody/bd_rhaspody_sequence_analysis/.config.vsh.yaml", + 'resources_dir': os.path.abspath('src/bd_rhaspody/bd_rhaspody_sequence_analysis'), + 'temp_dir': os.getenv("VIASH_TEMP"), + 'memory_mb': None, + 'cpus': None +} +## VIASH END + +def clean_arg(argument): + argument["clean_name"] = argument["name"].lstrip("-") + return argument + +def read_config(path: str) -> dict[str, Any]: + with open(path, 'r') as f: + config = yaml.safe_load(f) + + config["arguments"] = [ + clean_arg(arg) + for grp in config["argument_groups"] + for arg in grp["arguments"] + ] + + return 
config + +def strip_margin(text: str) -> str: + return re.sub('(\n?)[ \t]*\|', '\\1', text) + +def process_params(par: dict[str, Any], config, temp_dir: str) -> str: + # check input parameters + assert par["reads"] or par["reads_atac"], "Pass at least one set of inputs to --reads or --reads_atac." + + # output to temp dir if output_dir was not passed + if not par["output_dir"]: + par["output_dir"] = os.path.join(temp_dir, "output") + + # checking sample prefix + if par["run_name"] and re.match("[^A-Za-z0-9]", par["run_name"]): + print("--run_name should only consist of letters, numbers or hyphens. Replacing all '[^A-Za-z0-9]' with '-'.", flush=True) + par["run_name"] = re.sub("[^A-Za-z0-9\\-]", "-", par["run_name"]) + + # make paths absolute + for argument in config["arguments"]: + arg_clean_name = argument["clean_name"] + if not par[arg_clean_name] or not argument["type"] == "file": + continue + par_value = par[arg_clean_name] + if isinstance(par_value, list): + par_value_absolute = list(map(os.path.abspath, par_value)) + else: + par_value_absolute = os.path.abspath(par_value) + par[arg_clean_name] = par_value_absolute + + return par + +def generate_config(par: dict[str, Any], config) -> str: + content_list = [strip_margin(f"""\ + |#!/usr/bin/env cwl-runner + | + |cwl:tool: rhapsody + |""")] + + for argument in config["arguments"]: + arg_clean_name = argument["clean_name"] + arg_par_value = par[arg_clean_name] + arg_info = argument.get("info") or {} # Note: .info might be None + config_key = arg_info.get("config_key") + if arg_par_value and config_key: + + if argument["type"] == "file": + content = strip_margin(f"""\ + |{config_key}: + |""") + if isinstance(arg_par_value, list): + for file in arg_par_value: + content += strip_margin(f"""\ + | - class: File + | location: "{file}" + |""") + else: + content += strip_margin(f"""\ + | class: File + | location: "{arg_par_value}" + |""") + content_list.append(content) + else: + content_list.append(strip_margin(f"""\ + 
|{config_key}: {arg_par_value} + |""")) + + ## Write config to file + return ''.join(content_list) + +def generate_config_file(par: dict[str, Any], config: dict[str, Any], temp_dir: str) -> str: + config_file = os.path.join(temp_dir, "config.yml") + config_content = generate_config(par, config) + with open(config_file, "w") as f: + f.write(config_content) + return config_file + +def generate_cwl_file(meta: dict[str, Any], dir: str) -> str: + # create cwl file (if need be) + # orig_cwl_file=os.path.join(meta["resources_dir"], "rhapsody_pipeline_2.2.1_nodocker.cwl") + orig_cwl_file="/var/bd_rhapsody_cwl/v2.2.1/rhapsody_pipeline_2.2.1.cwl" + + if not meta["memory_mb"] and not meta["cpus"]: + return os.path.abspath(orig_cwl_file) + + # Inject computational requirements into pipeline + cwl_file = os.path.join(dir, "pipeline.cwl") + + # Read in the file + with open(orig_cwl_file, 'r') as file : + cwl_data = file.read() + + # Inject computational requirements into pipeline + if meta["memory_mb"]: + memory = int(meta["memory_mb"]) - 2000 # keep 2gb for OS + cwl_data = re.sub('"ramMin": [^\n]*[^,](,?)\n', f'"ramMin": {memory}\\1\n', cwl_data) + if meta["cpus"]: + cwl_data = re.sub('"coresMin": [^\n]*[^,](,?)\n', f'"coresMin": {meta["cpus"]}\\1\n', cwl_data) + + # Write the file out again + with open(cwl_file, 'w') as file: + file.write(cwl_data) + + return os.path.abspath(cwl_file) + +def copy_outputs(par: dict[str, Any], config: dict[str, Any]): + for arg in config["arguments"]: + par_value = par[arg["clean_name"]] + if par_value and arg["type"] == "file" and arg["direction"] == "output": + # example template: '[sample_name]_(assay)_cell_type_experimental.csv' + template = (arg.get("info") or {}).get("template") # Note: .info might be None + if template: + template_glob = template\ + .replace("[sample_name]", par["run_name"])\ + .replace("(assay)", "*")\ + .replace("[number]", "*") + files = glob.glob(os.path.join(par["output_dir"], template_glob)) + if not files and 
arg["required"]: + raise ValueError(f"Expected output file '{template_glob}' not found.") + elif len(files) > 1 and not arg["multiple"]: + raise ValueError(f"Expected single output file '{template_glob}', but found multiple.") + + if not arg["multiple"]: + shutil.copy(files[0], par_value) + else: + # replace '*' in par_value with index + for i, file in enumerate(files): + shutil.copy(file, par_value.replace("*", str(i))) + + +def main(par: dict[str, Any], meta: dict[str, Any], temp_dir: str): + config = read_config(meta["config"]) + + # Preprocess params + par = process_params(par, config, temp_dir) + + ## Process parameters + cmd = [ + "cwl-runner", + "--no-container", + "--preserve-entire-environment", + "--outdir", par["output_dir"], + ] + + if par["parallel"]: + cmd.append("--parallel") + + if par["timestamps"]: + cmd.append("--timestamps") + + # Create cwl file (if need be) + cwl_file = generate_cwl_file(meta, temp_dir) + cmd.append(cwl_file) + + # Create params file + config_file = generate_config_file(par, config, temp_dir) + cmd.append(config_file) + + # keep environment variables but set TMPDIR to temp_dir + env = dict(os.environ) + env["TMPDIR"] = temp_dir + + # Create output dir if not exists + if not os.path.exists(par["output_dir"]): + os.makedirs(par["output_dir"]) + + # Run command + print("> " + ' '.join(cmd), flush=True) + _ = subprocess.run( + cmd, + cwd=os.path.dirname(config_file), + env=env, + check=True + ) + + # Copy outputs + copy_outputs(par, config) + +if __name__ == "__main__": + with tempfile.TemporaryDirectory(prefix="cwl-bd_rhapsody-", dir=meta["temp_dir"]) as temp_dir: + main(par, meta, temp_dir) diff --git a/src/bd_rhapsody/bd_rhapsody_sequence_analysis/test.py b/src/bd_rhapsody/bd_rhapsody_sequence_analysis/test.py new file mode 100644 index 00000000..aed8e80b --- /dev/null +++ b/src/bd_rhapsody/bd_rhapsody_sequence_analysis/test.py @@ -0,0 +1,494 @@ +import subprocess +import gzip +from pathlib import Path +from typing import Tuple 
+import numpy as np +import random +import mudata as md + +## VIASH START +meta = { + "name": "bd_rhapsody_sequence_analysis", + "executable": "target/docker/bd_rhapsody/bd_rhapsody_sequence_analysis/bd_rhapsody_sequence_analysis", + "resources_dir": "src/bd_rhapsody", + "cpus": 8, + "memory_mb": 4096, +} +## VIASH END + +import sys +sys.path.append(meta["resources_dir"]) + +from helpers.rhapsody_cell_label import index_to_sequence + +meta["executable"] = Path(meta["executable"]) +meta["resources_dir"] = Path(meta["resources_dir"]) + +######################################################################################### + +# Generate index +print("> Generate index", flush=True) +# cwl_file = meta["resources_dir"] / "bd_rhapsody_make_reference.cwl" +cwl_file = "/var/bd_rhapsody_cwl/v2.2.1/Extra_Utilities/make_rhap_reference_2.2.1.cwl" +reference_small_gtf = meta["resources_dir"] / "test_data" / "reference_small.gtf" +reference_small_fa = meta["resources_dir"] / "test_data" / "reference_small.fa" +bdabseq_panel_fa = meta["resources_dir"] / "test_data" / "BDAbSeq_ImmuneDiscoveryPanel.fasta" +sampletagsequences_fa = meta["resources_dir"] / "test_data" / "SampleTagSequences_HomoSapiens_ver1.fasta" + +config_file = Path("reference_config.yml") +reference_file = Path("Rhap_reference.tar.gz") + +subprocess.run([ + "cwl-runner", + "--no-container", + "--preserve-entire-environment", + "--outdir", + ".", + str(cwl_file), + "--Genome_fasta", + str(reference_small_fa), + "--Gtf", + str(reference_small_gtf), + "--Extra_STAR_params", + "--genomeSAindexNbases 4" +]) + +######################################################################################### +# Load reference in memory + +from Bio import SeqIO +import gffutils + +# Load FASTA sequence +with open(str(reference_small_fa), "r") as handle: + reference_fasta_dict = SeqIO.to_dict(SeqIO.parse(handle, "fasta")) +with open(str(bdabseq_panel_fa), "r") as handle: + bdabseq_panel_fasta_dict = 
SeqIO.to_dict(SeqIO.parse(handle, "fasta")) +with open(str(sampletagsequences_fa), "r") as handle: + sampletagsequences_fasta_dict = SeqIO.to_dict(SeqIO.parse(handle, "fasta")) + +# create in memory db +reference_gtf_db = gffutils.create_db( + str(reference_small_gtf), + dbfn=":memory:", + force=True, + keep_order=True, + merge_strategy="merge", + sort_attribute_values=True, + disable_infer_transcripts=True, + disable_infer_genes=True +) + +############################################# +# TODO: move helper functions to separate helper file + + +def generate_bd_read_metadata( + instrument_id: str = "A00226", + run_id: str = "970", + flowcell_id: str = "H5FGVMXY", + lane: int = 1, + tile: int = 1101, + x: int = 1000, + y: int = 1000, + illumina_flag: str = "1:N:0", + sample_id: str = "CAGAGAGG", +) -> str: + """ + Generate a FASTQ metadata line for a BD Rhapsody FASTQ file. + + Args: + instrument_id: The instrument ID. + run_id: The run ID. + flowcell_id: The flowcell ID. + lane: The lane number. + tile: The tile number. Between 1101 and 1112 in the used example data. + x: The x-coordinate. Between 1000 and 32967 in the used example data. + y: The y-coordinate. Between 1000 and 37059 in the used example data. + illumina_flag: The Illumina flag. Either 1:N:0 or 2:N:0 in the used example data. + sample_id: The sample ID. + """ + # format: @A00226:970:H5FGVDMXY:1:1101:2645:1000 2:N:0:CAGAGAGG + return f"@{instrument_id}:{run_id}:{flowcell_id}:{lane}:{tile}:{x}:{y} {illumina_flag}:{sample_id}" + + +def generate_bd_wta_transcript( + transcript_length: int = 42, +) -> str: + """ + Generate a WTA transcript from a given GTF and FASTA file. 
+ """ + + # Randomly select a gene + gene = random.choice(list(reference_gtf_db.features_of_type("gene"))) + + # Find all exons within the gene + exons = list(reference_gtf_db.children(gene, featuretype="exon", order_by="start")) + + # Calculate total exon length + total_exon_length = sum(exon.end - exon.start + 1 for exon in exons) + + # If total exon length is less than desired transcript length, use it as is + max_transcript_length = min(total_exon_length, transcript_length) + + # Build the WTA transcript sequence + sequence = "" + for exon in exons: + exon_seq = str(reference_fasta_dict[exon.seqid].seq[exon.start - 1 : exon.end]) + sequence += exon_seq + + # Break if desired length is reached + if len(sequence) >= max_transcript_length: + sequence = sequence[:max_transcript_length] + break + + # add padding if need be + if len(sequence) < max_transcript_length: + sequence += "N" * (max_transcript_length - len(sequence)) + + return sequence + + +def generate_bd_wta_read( + cell_index: int = 0, + bead_version: str = "EnhV2", + umi_length: int = 14, + transcript_length: int = 42, +) -> Tuple[str, str]: + """ + Generate a BD Rhapsody WTA read pair for a given cell index. + + Args: + cell_index: The cell index to generate reads for. + bead_version: The bead version to use for generating the cell label. + umi_length: The length of the UMI to generate. + transcript_length: The length of the transcript to generate + + Returns: + A tuple of two strings, the first string being the R1 read and the second string being the R2 read. 
+ + More info: + + See structure of reads: + - https://bd-rhapsody-bioinfo-docs.genomics.bd.com/steps/top_steps.html + - https://bd-rhapsody-bioinfo-docs.genomics.bd.com/steps/steps_cell_label.html + - https://scomix.bd.com/hc/en-us/articles/360057714812-All-FAQ + R1 is Cell Label + UMI + PolyT -> 60 bp + actually, CLS1 + "GTGA" + CLS2 + "GACA" + CLS3 + UMI + R2 is the actual read -> 42 bp + + Example R1 + CLS1 Link CLS2 Link CLS3 UMI + AAAATCCTGT GTGA AACCAAAGT GACA GATAGAGGAG CGCATGTTTATAAC + """ + + # generate metadata + per_row = np.floor((32967 - 1000) / 9) + per_col = np.floor((37059 - 1000) / 9) + + assert cell_index >= 0 and cell_index < per_row * per_col, f"cell_index must be between 0 and {per_row} * {per_col}" + x = 1000 + (cell_index % per_row) * 9 + y = 1000 + (cell_index // per_row) * 9 + instrument_id = "A00226" + run_id = "970" + flowcell_id = "H5FGVMXY" + meta_r1 = generate_bd_read_metadata(instrument_id=instrument_id, run_id=run_id, flowcell_id=flowcell_id, x=x, y=y, illumina_flag="1:N:0") + meta_r2 = generate_bd_read_metadata(instrument_id=instrument_id, run_id=run_id, flowcell_id=flowcell_id, x=x, y=y, illumina_flag="2:N:0") + + # generate r1 (cls1 + link + cls2 + link + cls3 + umi) + assert cell_index >= 0 and cell_index < 384 * 384 * 384 + cell_label = index_to_sequence(cell_index + 1, bead_version=bead_version) + # sample random umi + umi = "".join(random.choices("ACGT", k=umi_length)) + quality_r1 = "I" * (len(cell_label) + len(umi)) + r1 = f"{meta_r1}\n{cell_label}{umi}\n+\n{quality_r1}\n" + + # generate r2 by extracting sequence from fasta and gtf + wta_transcript = generate_bd_wta_transcript(transcript_length=transcript_length) + quality_r2 = "I" * transcript_length + r2 = f"{meta_r2}\n{wta_transcript}\n+\n{quality_r2}\n" + + return r1, r2 + +def generate_bd_wta_fastq_files( + num_cells: int = 100, + num_reads_per_cell: int = 1000, +) -> Tuple[str, str]: + """ + Generate BD Rhapsody WTA FASTQ files for a given number of cells and 
transcripts per cell. + + Args: + num_cells: The number of cells to generate + num_reads_per_cell: The number of reads to generate per cell + + Returns: + A tuple of two strings, the first string being the R1 reads and the second string being the R2 reads. + """ + r1_reads = "" + r2_reads = "" + for cell_index in range(num_cells): + for _ in range(num_reads_per_cell): + r1, r2 = generate_bd_wta_read(cell_index) + r1_reads += r1 + r2_reads += r2 + + return r1_reads, r2_reads + +def generate_bd_abc_read( + cell_index: int = 0, + bead_version: str = "EnhV2", + umi_length: int = 14, + transcript_length: int = 72, +) -> Tuple[str, str]: + """ + Generate a BD Rhapsody ABC read pair for a given cell index. + + Args: + cell_index: The cell index to generate reads for. + bead_version: The bead version to use for generating the cell label. + umi_length: The length of the UMI to generate. + transcript_length: The length of the transcript to generate + + Returns: + A tuple of two strings, the first string being the R1 read and the second string being the R2 read. 
+ """ + # generate metadata + per_row = np.floor((32967 - 1000) / 9) + per_col = np.floor((37059 - 1000) / 9) + + assert cell_index >= 0 and cell_index < per_row * per_col, f"cell_index must be between 0 and {per_row} * {per_col}" + x = 1000 + (cell_index % per_row) * 9 + y = 1000 + (cell_index // per_row) * 9 + instrument_id = "A01604" + run_id = "19" + flowcell_id = "HMKLYDRXY" + meta_r1 = generate_bd_read_metadata(instrument_id=instrument_id, run_id=run_id, flowcell_id=flowcell_id, x=x, y=y, illumina_flag="1:N:0") + meta_r2 = generate_bd_read_metadata(instrument_id=instrument_id, run_id=run_id, flowcell_id=flowcell_id, x=x, y=y, illumina_flag="2:N:0") + + # generate r1 (cls1 + link + cls2 + link + cls3 + umi) + assert cell_index >= 0 and cell_index < 384 * 384 * 384 + cell_label = index_to_sequence(cell_index + 1, bead_version=bead_version) + # sample random umi + umi = "".join(random.choices("ACGT", k=umi_length)) + quality_r1 = "I" * (len(cell_label) + len(umi)) + r1 = f"{meta_r1}\n{cell_label}{umi}\n+\n{quality_r1}\n" + + # generate r2 by sampling sequence from bdabseq_panel_fa + abseq_seq = str(random.choice(list(bdabseq_panel_fasta_dict.values())).seq) + abc_suffix = "AAAAAAAAAAAAAAAAAAAAAAA" + abc_data = abseq_seq[:transcript_length - len(abc_suffix) - 1] + abc_prefix = "N" + "".join(random.choices("ACGT", k=transcript_length - len(abc_data) - len(abc_suffix) - 1)) + + abc_transcript = f"{abc_prefix}{abc_data}{abc_suffix}" + + quality_r2 = "#" + "I" * (len(abc_transcript) - 1) + r2 = f"{meta_r2}\n{abc_transcript}\n+\n{quality_r2}\n" + + return r1, r2 + +def generate_bd_abc_fastq_files( + num_cells: int = 100, + num_reads_per_cell: int = 1000, +) -> Tuple[str, str]: + """ + Generate BD Rhapsody ABC FASTQ files for a given number of cells and transcripts per cell. 
+ + Args: + num_cells: The number of cells to generate + num_reads_per_cell: The number of reads to generate per cell + + Returns: + A tuple of two strings, the first string being the R1 reads and the second string being the R2 reads. + """ + r1_reads = "" + r2_reads = "" + for cell_index in range(num_cells): + for _ in range(num_reads_per_cell): + r1, r2 = generate_bd_abc_read(cell_index) + r1_reads += r1 + r2_reads += r2 + + return r1_reads, r2_reads + +def generate_bd_smk_read( + cell_index: int = 0, + bead_version: str = "EnhV2", + umi_length: int = 14, + transcript_length: int = 72, + num_sample_tags: int = 3, +): + """ + Generate a BD Rhapsody SMK read pair for a given cell index. + + Args: + cell_index: The cell index to generate reads for. + bead_version: The bead version to use for generating the cell label. + umi_length: The length of the UMI to generate. + transcript_length: The length of the transcript to generate + num_sample_tags: The number of sample tags to use + + Returns: + A tuple of two strings, the first string being the R1 read and the second string being the R2 read. 
+ """ + # generate metadata + per_row = np.floor((32967 - 1000) / 9) + per_col = np.floor((37059 - 1000) / 9) + + assert cell_index >= 0 and cell_index < per_row * per_col, f"cell_index must be between 0 and {per_row} * {per_col}" + x = 1000 + (cell_index % per_row) * 9 + y = 1000 + (cell_index // per_row) * 9 + instrument_id = "A00226" + run_id = "970" + flowcell_id = "H5FGVDMXY" + + meta_r1 = generate_bd_read_metadata(instrument_id=instrument_id, run_id=run_id, flowcell_id=flowcell_id, x=x, y=y, illumina_flag="1:N:0") + meta_r2 = generate_bd_read_metadata(instrument_id=instrument_id, run_id=run_id, flowcell_id=flowcell_id, x=x, y=y, illumina_flag="2:N:0") + + # generate r1 (cls1 + link + cls2 + link + cls3 + umi) + assert cell_index >= 0 and cell_index < 384 * 384 * 384 + cell_label = index_to_sequence(cell_index + 1, bead_version=bead_version) + # sample random umi + umi = "".join(random.choices("ACGT", k=umi_length)) + quality_r1 = "I" * (len(cell_label) + len(umi)) + r1 = f"{meta_r1}\n{cell_label}{umi}\n+\n{quality_r1}\n" + + # generate r2 by selecting the cell_index %% num_sample_tags sample tags + sampletag_index = cell_index % num_sample_tags + sampletag_seq = str(list(sampletagsequences_fasta_dict.values())[sampletag_index].seq) + smk_data = sampletag_seq[:transcript_length] + smk_suffix = "A" * (transcript_length - len(smk_data)) + quality_r2 = "I" * len(smk_data) + "#" * len(smk_suffix) + r2 = f"{meta_r2}\n{smk_data}{smk_suffix}\n+\n{quality_r2}\n" + + return r1, r2 + +def generate_bd_smk_fastq_files( + num_cells: int = 100, + num_reads_per_cell: int = 1000, + num_sample_tags: int = 3, +) -> Tuple[str, str]: + """ + Generate BD Rhapsody SMK FASTQ files for a given number of cells and transcripts per cell. 
+ + Args: + num_cells: The number of cells to generate + num_reads_per_cell: The number of reads to generate per cell + num_sample_tags: The number of sample tags to use + + Returns: + A tuple of two strings, the first string being the R1 reads and the second string being the R2 reads. + """ + r1_reads = "" + r2_reads = "" + for cell_index in range(num_cells): + for _ in range(num_reads_per_cell): + r1, r2 = generate_bd_smk_read(cell_index, num_sample_tags=num_sample_tags) + r1_reads += r1 + r2_reads += r2 + + return r1_reads, r2_reads + +######################################################################################### + +# Prepare WTA, ABC, and SMK test data +print("> Prepare WTA test data", flush=True) +wta_reads_r1_str, wta_reads_r2_str = generate_bd_wta_fastq_files(num_cells=100, num_reads_per_cell=1000) +with gzip.open("WTAreads_R1.fq.gz", "wt") as f: + f.write(wta_reads_r1_str) +with gzip.open("WTAreads_R2.fq.gz", "wt") as f: + f.write(wta_reads_r2_str) + +print("> Prepare ABC test data", flush=True) +abc_reads_r1_str, abc_reads_r2_str = generate_bd_abc_fastq_files(num_cells=100, num_reads_per_cell=1000) +with gzip.open("ABCreads_R1.fq.gz", "wt") as f: + f.write(abc_reads_r1_str) +with gzip.open("ABCreads_R2.fq.gz", "wt") as f: + f.write(abc_reads_r2_str) + +print("> Prepare SMK test data", flush=True) +smk_reads_r1_str, smk_reads_r2_str = generate_bd_smk_fastq_files(num_cells=100, num_reads_per_cell=1000, num_sample_tags=3) +with gzip.open("SMKreads_R1.fq.gz", "wt") as f: + f.write(smk_reads_r1_str) +with gzip.open("SMKreads_R2.fq.gz", "wt") as f: + f.write(smk_reads_r2_str) + +######################################################################################### + +# Run executable +print(f">> Run {meta['name']}", flush=True) +output_dir = Path("output") +subprocess.run([ + meta['executable'], + "--reads=WTAreads_R1.fq.gz;WTAreads_R2.fq.gz", + f"--reference_archive={reference_file}", + "--reads=ABCreads_R1.fq.gz;ABCreads_R2.fq.gz", + 
f"--abseq_reference={bdabseq_panel_fa}", + "--reads=SMKreads_R1.fq.gz;SMKreads_R2.fq.gz", + "--tag_names=1-Sample1;2-Sample2;3-Sample3", + "--sample_tags_version=human", + "--output_dir=output", + "--exact_cell_count=100", + f"---cpus={meta['cpus'] or 1}", + f"---memory={meta['memory_mb'] or 2048}mb", + # "--output_seurat=seurat.rds", + "--output_mudata=mudata.h5mu", + "--metrics_summary=metrics_summary.csv", + "--pipeline_report=pipeline_report.html", +]) + + +# Check if output exists +print(">> Check if output exists", flush=True) +assert (output_dir / "sample_Bioproduct_Stats.csv").exists() +assert (output_dir / "sample_Metrics_Summary.csv").exists() +assert (output_dir / "sample_Pipeline_Report.html").exists() +assert (output_dir / "sample_RSEC_MolsPerCell_MEX.zip").exists() +assert (output_dir / "sample_RSEC_MolsPerCell_Unfiltered_MEX.zip").exists() +# seurat object is not generated when abc data is added +# assert (output_dir / "sample_Seurat.rds").exists() +assert (output_dir / "sample.h5mu").exists() + +# check individual outputs +# assert Path("seurat.rds").exists() +assert Path("mudata.h5mu").exists() +assert Path("metrics_summary.csv").exists() +assert Path("pipeline_report.html").exists() + +print(">> Check contents of output", flush=True) +data = md.read_h5mu("mudata.h5mu") + +assert data.n_obs == 100, "Number of cells is incorrect" +assert "rna" in data.mod, "RNA data is missing" +assert "prot" in data.mod, "Protein data is missing" + +# check rna data +data_rna = data.mod["rna"] +assert data_rna.n_vars == 1, "Number of genes is incorrect" +assert data_rna.X.sum(axis=1).min() > 950, "Number of reads per cell is incorrect" +# assert data_rna.var.Raw_Reads.sum() == 100000, "Number of reads is incorrect" +assert data_rna.var.Raw_Reads.sum() >= 99990 and data_rna.var.Raw_Reads.sum() <= 100010, \ + f"Expected 100000 RNA reads, got {data_rna.var.Raw_Reads.sum()}" + +# check prot data +data_prot = data.mod["prot"] +assert data_prot.n_vars == 
len(bdabseq_panel_fasta_dict), "Number of proteins is incorrect" +assert data_prot.X.sum(axis=1).min() > 950, "Number of reads per cell is incorrect" +assert data_prot.var.Raw_Reads.sum() >= 99990 and data_prot.var.Raw_Reads.sum() <= 100010, \ + f"Expected 100000 Prot reads, got {data_prot.var.Raw_Reads.sum()}" + + +# check smk data +expected_sample_tags = (["SampleTag01_hs", "SampleTag02_hs", "SampleTag03_hs"] * 34)[:100] +expected_sample_names = (["Sample1", "Sample2", "Sample3"] * 34)[:100] +sample_tags = data_rna.obs["Sample_Tag"] +assert sample_tags.nunique() == 3, "Number of sample tags is incorrect" +assert sample_tags.tolist() == expected_sample_tags, "Sample tags are incorrect" +sample_names = data_rna.obs["Sample_Name"] +assert sample_names.nunique() == 3, "Number of sample names is incorrect" +assert sample_names.tolist() == expected_sample_names, "Sample names are incorrect" + +# TODO: add VDJ, ATAC, and targeted RNA to test + +######################################################################################### + +print("> Test successful", flush=True) diff --git a/src/bd_rhapsody/helpers/rhapsody_cell_label.py b/src/bd_rhapsody/helpers/rhapsody_cell_label.py new file mode 100644 index 00000000..601ce7be --- /dev/null +++ b/src/bd_rhapsody/helpers/rhapsody_cell_label.py @@ -0,0 +1,405 @@ +#!/usr/bin/env python + +# copied from https://bd-rhapsody-public.s3.amazonaws.com/CellLabel/rhapsody_cell_label.py.txt +# documented at https://bd-rhapsody-bioinfo-docs.genomics.bd.com/steps/steps_cell_label.html + +""" +Rhapsody cell label structure +Information on the cell label is captured by the combination of bases in three cell label sections (CLS1, CLS2, CLS3). +Two common linker sequences (L1, L2) separate the three CLS. + +--CLS1---|-L1-|--CLS2---|-L2-|--CL3---|--UMI---|-CaptureSequence- + + +Each cell label section has a whitelist of 96 or 384 possible 9 base sequences. +All the capture oligos from a single bead will have the same cell label. 
+ +---------------- + +V1 beads: + +[A96_cell_key1] + [v1_linker1] + [A96_cell_key2] + [v1_linker2] + [A96_cell_key3] + [8 random base UMI] + [18 base polyT capture] + + +---------------- + +Enhanced beads: +Enhanced beads contain two different capture oligo types, polyT and 5prime. On any one bead, the two different capture oligo types have the same cell label sequences. +Compared to the V1 bead, enhanced beads have shorter linker sequences, longer polyT, and 0-3 diversity insert bases at the beginning of the sequence. +The cell label sections use the same 3 sequence whitelists as V1 beads. + +polyT capture oligo: +[Enh_insert 0-3 bases] + [A96_cell_key1] + [Enh_linker1] + [A96_cell_key2] + [Enh_linker2] + [A96_cell_key3] + [8 random base UMI] + [25 base polyT capture] + +5prime capture oligo: +[Enh_5p_primer] + [A96_cell_key1] + [Enh_5p_linker1] + [A96_cell_key2] + [Enh_5p_linker2] + [A96_cell_key3] + [8 random base UMI] + [Tso_capture_seq] + + +---------------- + +Enhanced V2/V3 beads: +Enhanced V2/V3 beads have the same structure as Enhanced beads, but the cell label sections have been updated with increased diversity + + +polyT capture oligo: +[Enh_insert 0-3 bases] + [B384_cell_key1] + [Enh_linker1] + [B384_cell_key2] + [Enh_linker2] + [B384_cell_key3] + [8 random base UMI] + [25 base polyT capture] + +5prime capture oligo: +[Enh_5p_primer] + [B384_cell_key1] + [Enh_5p_linker1] + [B384_cell_key2] + [Enh_5p_linker2] + [B384_cell_key3] + [8 random base UMI] + [Tso_capture_seq] + + +The only difference between Enh V2 and Enh V3 beads is a different Tso_capture_seq. + +---------------- + +The Rhapsody Sequence Analysis Pipeline will convert each cell label into a single integer representing a unique cell label sequence - which is used in the output files as the 'Cell_index'. 
+This cell index integer is deterministic and derived from the 3 part cell label as follows: + +- Get the 1-based index for each cell label section from the python sets of sequences below +- Apply this equation: + (CLS1index - 1) * 384 * 384 + (CLS2index - 1) * 384 + CLS3index + +(See label_sections_to_index() function below) + + +Example: Enhanced bead sequence: +ACACATTGCAGTGAAGATAGTTCGACACTCAAGACA + +Each part identified: +A CACATTGCA GTGA AGATAGTTC GACA CTCAAGACA +DiversityInsert A96_cell_key1-33 Linker1 A96_cell_key2-78 Linker2 A96_cell_key3-21 + +33-78-21 +(33 - 1) * 384 * 384 + (78 - 1) * 384 + 21 +=4748181 + + +The original sequences of cell label can be determined from the cell index integer by reversing this conversion. +See index_to_label_sections() and index_to_sequence() functions below. + +""" + +v1_linker1 = 'ACTGGCCTGCGA' +v1_linker2 = 'GGTAGCGGTGACA' + +Enh_linker1 = 'GTGA' +Enh_linker2 = 'GACA' + +Enh_5p_primer = "ACAGGAAACTCATGGTGCGT" + +Enh_5p_linker1 = "AATG" +Enh_5p_linker2 = "CCAC" + +Enh_inserts = ["", "A", "GT", "TCA"] + +Tso_capture_seq_Enh_EnhV2 = "TATGCGTAGTAGGTATG" +Tso_capture_seq_EnhV3 = "GTGGAGTCGTGATTATA" + +A96_cell_key1 = ("GTCGCTATA","CTTGTACTA","CTTCACATA","ACACGCCGG","CGGTCCAGG","AATCGAATG","CCTAGTATA","ATTGGCTAA","AAGACATGC","AAGGCGATC", + "GTGTCCTTA","GGATTAGGA","ATGGATCCA","ACATAAGCG","AACTGTATT","ACCTTGCGG","CAGGTGTAG","AGGAGATTA","GCGATTACA","ACCGGATAG", + "CCACTTGGA","AGAGAAGTT","TAAGTTCGA","ACGGATATT","TGGCTCAGA","GAATCTGTA","ACCAAGGAC","AGTATCTGT","CACACACTA","ATTAAGTGC", + "AAGTAACCC","AAATCCTGT","CACATTGCA","GCACTGTCA","ATACTTAGG","GCAATCCGA","ACGCAATCA","GAGTATTAG","GACGGATTA","CAGCTGACA", + "CAACATATT","AACTTCTCC","CTATGAAAT","ATTATTACC","TACCGAGCA","TCTCTTCAA","TAAGCGTTA","GCCTTACAA","AGCACACAG","ACAGTTCCG", + "AGTAAAGCC","CAGTTTCAC","CGTTACTAA","TTGTTCCAA","AGAAGCACT","CAGCAAGAT","CAAACCGCC","CTAACTCGC","AATATTGGG","AGAACTTCC", + 
"CAAAGGCAC","AAGCTCAAC","TCCAGTCGA","AGCCATCAC","AACGAGAAG","CTACAGAAC","AGAGCTATG","GAGGATGGA","TGTACCTTA","ACACACAAA", + "TCAGGAGGA","GAGGTGCTA","ACCCTGACC","ACAAGGATC","ATCCCGGAG","TATGTGGCA","GCTGCCAAT","ATCAGAGCT","TCGAAGTGA","ATAGACGAG", + "AGCCCAATC","CAGAATCGT","ATCTCCACA","ACGAAAGGT","TAGCTTGTA","ACACGAGAT","AACCGCCTC","ATTTAGATG","CAAGCAAGC","CAAAGTGTG", + "GGCAAGCAA","GAGCCAATA","ATGTAATGG","CCTGAGCAA","GAGTACATT","TGCGATCTA" + ) + +A96_cell_key2 = ("TACAGGATA","CACCAGGTA","TGTGAAGAA","GATTCATCA","CACCCAAAG","CACAAAGGC","GTGTGTCGA","CTAGGTCCT","ACAGTGGTA","TCGTTAGCA", + "AGCGACACC","AAGCTACTT","TGTTCTCCA","ACGCGAAGC","CAGAAATCG","ACCAAAATG","AGTGTTGTC","TAGGGATAC","AGGGCTGGT","TCATCCTAA", + "AATCCTGAA","ATCCTAGGA","ACGACCACC","TTCCATTGA","TAGTCTTGA","ACTGTTAGA","ATTCATCGT","ACTTCGAGC","TTGCGTACA","CAGTGCCCG", + "GACACTTAA","AGGAGGCGC","GCCTGTTCA","GTACATCTA","AATCAGTTT","ACGATGAAT","TGACAGACA","ATTAGGCAT","GGAGTCTAA","TAGAACACA", + "AAATAAATA","CCGACAAGA","CACCTACCC","AAGAGTAGA","TCATTGAGA","GACCTTAGA","CAAGACCTA","GGAATGATA","AAACGTACC","ACTATCCTC", + "CCGTATCTA","ACACATGTC","TTGGTATGA","GTGCAGTAA","AGGATTCAA","AGAATGGAG","CTCTCTCAA","GCTAACTCA","ATCAACCGA","ATGAGTTAC", + "ACTTGATGA","ACTTTAACT","TTGGAGGTA","GCCAATGTA","ATCCAACCG","GATGAACTG","CCATGCACA","TAGTGACTA","AAACTGCGC","ATTACCAAG", + "CACTCGAGA","AACTCATTG","CTTGCTTCA","ACCTGAGTC","AGGTTCGCT","AAGGACTAT","CGTTCGGTA","AGATAGTTC","CAATTGATC","GCATGGCTA", + "ACCAGGTGT","AGCTGCCGT","TATAGCCCT","AGAGGACCA","ACAATATGG","CAGCACTTC","CACTTATGT","AGTGAAAGG","AACCCTCGG","AGGCAGCTA", + "AACCAAAGT","GAGTGCGAA","CGCTAAGCA","AATTATAAC","TACTAGTCA","CAACAACGG" + ) + +A96_cell_key3 = ("AAGCCTTCT","ATCATTCTG","CACAAGTAT","ACACCTTAG","GAACGACAA","AGTCTGTAC","AAATTACAG","GGCTACAGA","AATGTATCG","CAAGTAGAA", + "GATCTCTTA","AACAACGCG","GGTGAGTTA","CAGGGAGGG","TCCGTCTTA","TGCATAGTA","ACTTACGAT","TGTATGCGA","GCTCCTTGA","GGCACAACA", + 
"CTCAAGACA","ACGCTGTTG","ATATTGTAA","AAGTTTACG","CAGCCTGGC","CTATTAGCC","CAAACGTGG","AAAGTCATT","GTCTTGGCA","GATCAGCGA", + "ACATTCGGC","AGTAATTAG","TGAAGCCAA","TCTACGACA","CATAACGTT","ATGGGACTC","GATAGAGGA","CTACATGCG","CAACGATCT","GTTAGCCTA", + "AGTTGCATC","AAGGGAACT","ACTACATAT","CTAAGCTTC","ACGAACCAG","TACTTCGGA","AACATCCAT","AGCCTGGTT","CAAGTTTCC","CAGGCATTT", + "ACGTGGGAG","TCTCACGGA","GCAACATTA","ATGGTCCGT","CTATCATGA","CAATACAAG","AAAGAGGCC","GTAGAAGCA","GCTATGGAA","ACTCCAGGG", + "ACAAGTGCA","GATGGTCCA","TCCTCAATA","AATAAACAA","CTGTACGGA","CTAGATAGA","AGCTATGTG","AAATGGAGG","AGCCGCAAG","ACAGTAAAC", + "AACGTGTGA","ACTGAATTC","AAGGGTCAG","TGTCTATCA","TCAGATTCA","CACGATCCG","AACAGAAAC","CATGAATGA","CGTACTACG","TTCAGCTCA", + "AAGGCCGCA","GGTTGGACA","CGTCTAGGT","AATTCGGCG","CAACCTCCA","CAATAGGGT","ACAGGCTCC","ACAACTAGT","AGTTGTTCT","AATTACCGG", + "ACAAACTTT","TCTCGGTTA","ACTAGACCG","ACTCATACG","ATCGAGTCT","CATAGGTCA" + ) + +B384_cell_key1 = ("TGTGTTCGC","TGTGGCGCC","TGTCTAGCG","TGGTTGTCC","TGGTTCCTC","TGGTGTGCT","TGGCGACCG","TGCTGTGGC","TGCTGGCAC","TGCTCTTCC", + "TGCCTCACC","TGCCATTAT","TGATGTCTC","TGATGGCCT","TGATGCTTG","TGAAGGACC","TCTGTCTCC","TCTGATTAT","TCTGAGGTT","TCTCGTTCT", + "TCTCATCCG","TCCTGGATT","TCAGCATTC","TCACGCCTT","TATGTGCAC","TATGCGGCC","TATGACGAG","TATCTCGTG","TATATGACC","TAGGCTGTG", + "TACTGCGTT","TACGTGTCC","TAATCACAT","GTTGTGTTG","GTTGTGGCT","GTTGTCTGT","GTTGTCGAG","GTTGTCCTC","GTTGTATCC","GTTGGTTCT", + "GTTGGCGTT","GTTGGAGCG","GTTGCTGCC","GTTGCGCAT","GTTGCAGGT","GTTGCACTG","GTTGATGAT","GTTGATACG","GTTGAAGTC","GTTCTGTGC", + "GTTCTCTCG","GTTCTATAT","GTTCGTATG","GTTCGGCCT","GTTCGCGGC","GTTCGATTC","GTTCCGGTT","GTTCCGACG","GTTCACGCT","GTTATCACC", + "GTTAGTCCG","GTTAGGTGT","GTTAGAGAC","GTTAGACTT","GTTACCTCT","GTTAATTCC","GTTAAGCGC","GTGTTGCTT","GTGTTCGGT","GTGTTCCAG", + "GTGTTCATC","GTGTCACAC","GTGTCAAGT","GTGTACTGC","GTGGTTAGT","GTGGTACCG","GTGGCGATC","GTGCTTCTG","GTGCGTTCC","GTGCGGTAT", + 
"GTGCGCCTT","GTGCGAACT","GTGCAGCCG","GTGCAATTG","GTGCAAGGC","GTCTTGCGC","GTCTGGCCG","GTCTGAGGC","GTCTCAGAT","GTCTCAACC", + "GTCTATCGT","GTCGGTGTG","GTCGGAATC","GTCGCTCCG","GTCCTCGCC","GTCCTACCT","GTCCGCTTG","GTCCATTCT","GTCCAATAC","GTCATGTAT", + + "GTCAGTGGT","GTCAGATAG","GTATTAACT","GTATCAGTC","GTATAGCCT","GTATACTTG","GTATAAGGT","GTAGCATCG","GTACCGTCC","GTACACCTC", + "GTAAGTGCC","GTAACAGAG","GGTTGTGTC","GGTTGGCTG","GGTTGACGC","GGTTCGTCG","GGTTCAGTT","GGTTATATT","GGTTAATAC","GGTGTACGT", + "GGTGCCGCT","GGTGCATGC","GGTCGTTGC","GGTCGAGGT","GGTAGGCAC","GGTAGCTTG","GGTACATAG","GGTAATCTG","GGCTTGGCC","GGCTTCACG", + "GGCTTATGT","GGCTTACTC","GGCTGTCTT","GGCTCTGTG","GGCTCCGGT","GGCTCACCT","GGCGTTGAG","GGCGTGTAC","GGCGTGCTG","GGCGTATCG", + "GGCGCTCGT","GGCGCTACC","GGCGAGCCT","GGCGAGATC","GGCGACTTG","GGCCTCTTC","GGCCTACAG","GGCCAGCGC","GGCCAACTT","GGCATTCCT", + "GGCATCCGC","GGCATAACC","GGCAACGAT","GGATGTCCG","GGATGAGAG","GGATCTGGC","GGATCCATG","GGATAGGTT","GGAGTCGTG","GGAGAAGGC", + "GGACTCCTT","GGACTAGTC","GGACCGTTG","GGAATTAGT","GGAATCTCT","GGAATCGAC","GGAAGCCTC","GCTTGTAGC","GCTTGACCG","GCTTCGGAC", + "GCTTCACAT","GCTTAGTCT","GCTGGATAT","GCTGGAACC","GCTGCGATG","GCTGATCAG","GCTGAGCGT","GCTCTTGTC","GCTCTCCTG","GCTCGGTCC", + "GCTCCAATT","GCTATTCGC","GCTATGAGT","GCTAGTGTT","GCTAGGATC","GCTAGCACT","GCTACGTAT","GCTAACCTT","GCGTTCCGC","GCGTGTGCC", + "GCGTGCATT","GCGTCGGTT","GCGTATGTG","GCGTATACT","GCGGTTCAC","GCGGTCTTG","GCGGCGTCG","GCGGCACCT","GCGCTGGAC","GCGCTCTCC", + + "GCGCGGCAG","GCGCGATAC","GCGCCGACC","GCGAGCGAG","GCGAGAGGT","GCGAATTAC","GCCTTGCAT","GCCTGCGCT","GCCTAACTG","GCCGTCCGT", + "GCCGCTGTC","GCCATGCCG","GCCAGCTAT","GCCAACCAG","GCATGGTTG","GCATCGACG","GCAGGCTAG","GCAGGACGC","GCAGCCATC","GCAGATACC", + "GCAGACGTT","GCACTATGT","GCACACGAG","GATTGTCAT","GATTGGTAG","GATTGCACC","GATTCTACT","GATTCGCTT","GATTAGGCC","GATTACGGT", + "GATGTTGGC","GATGTTATG","GATGGCCAG","GATCGTTCG","GATCGGAGC","GATCGCCTC","GATCCTCTG","GATCCAGCG","GATACACGC","GAGTTACCT", + 
"GAGTCGTAT","GAGTCGCCG","GAGGTGTAG","GAGGCATTG","GAGCGGACG","GAGCCTGAG","GAGATCTGT","GAGATAATT","GAGACGGCT","GACTTCGTG", + "GACTGTTCT","GACTCTTAG","GACCGCATT","GAATTGAGC","GAATATTGC","GAAGGCTCT","GAAGAGACT","GAACTGCCG","GAACGCGTG","CTTGTGTAT", + "CTTGTGCGC","CTTGTCATG","CTTGGTCTT","CTTGGTACC","CTTGGATGT","CTTGCTCAC","CTTGCAATC","CTTGAGGCC","CTTGACGGT","CTTCTGATC", + "CTTCTCGTT","CTTCTAGGC","CTTCGTTAG","CTTATGTCC","CTTATGCTT","CTTATATAG","CTTAGGTTG","CTTAGGAGC","CTTACTTAT","CTGTTCTCG", + "CTGTGCCTC","CTGTCGCAT","CTGTCGAGC","CTGTAGCTG","CTGTACGTT","CTGCTTGCC","CTGCGTAGT","CTGCACACC","CTGATGGAT","CTGAGTCAT", + "CTGACGCCG","CTGAACGAG","CTCTTGTAG","CTCTTAGTT","CTCTTACCG","CTCTGCACC","CTCTCGTCC","CTCGTATTG","CTCGACTAT","CTCCTGACG", + + "CTCACTAGC","CTATACGGC","CGTTCGCTC","CGTTCACCG","CGTATAGTT","CGGTGTTCC","CGGTGTCAG","CGGTCCTGC","CGGCGACTC","CGGCACGGT", + "CGGATAGCC","CGGAGAGAT","CGCTAATAG","CGCGTTGGC","CGCGCAGAG","CGCACTGCC","CCTTGTCTC","CCTTGGCGT","CCTTCTGAG","CCTTCTCCT", + "CCTTCGACC","CCTTACTTG","CCTGTTCGT","CCTGTATGC","CCTCGGCCG","CCGTTAATT","CCATGTGCG","CCAGTGGTT","CCAGGCATT","CCAGGATCC", + "CCAGCGTTG","CATTCCGAT","CATTATACC","CATGTTGAG","ATTGCGTGT","ATTGCGGAC","ATTGCGCCG","ATTGACTTG","ATTCGGCTG","ATTCGCGAG", + "ATTCCAAGT","ATTATCTTC","ATTACTGTT","ATTACACTC","ATGTTCTAT","ATGTTACGC","ATGTGTATC","ATGTGGCAG","ATGTCTGTG","ATGGTGCAT", + "ATGCTTACT","ATGCTGTCC","ATGCTCGGC","ATGAGGTTC","ATGAGAGTG","ATCTTGGCT","ATCTGTGCG","ATCGGTTCC","ATCATGCTC","ATCATCACT", + "ATATCTTAT","ATAGGCGCC","AGTTGGTAT","AGTTGAGCC","AGTGCGACC","AGGTGCTAC","AGGCTTGCG","AGGCCTTCC","AGGCACCTT","AGGAATATG", + "AGCGGCCAG","AGCCTGGTC","AGCCTGACT","AGCAATCCG","AGAGATGTT","AGAGAATTC","ACTCGCTTG","ACTCGACCT","ACGTACACC","ACGGATGGT", + "ACCAGTCTG","ACATTCGGC","ACATGAGGT","ACACTAATT" + ) + +B384_cell_key2 = ("TTGTGTTGT","TTGTGGTAG","TTGTGCGGA","TTGTCTGTT","TTGTCTAAG","TTGTCATAT","TTGTCACGA","TTGTATGAA","TTGTACAGT","TTGGTTAAT", + 
"TTGGTGCAA","TTGGTCGAG","TTGGTATTA","TTGGCACAG","TTGGATACA","TTGGAAGTG","TTGCGGTTA","TTGCCATTG","TTGCACGCG","TTGCAAGGT", + "TTGATGTAT","TTGATAATT","TTGAGACGT","TTGACTACT","TTGACCGAA","TTCTGGTCT","TTCTGCACA","TTCTCCTTA","TTCTCCGCT","TTCTAGGTA", + "TTCTAATCG","TTCGTCGTA","TTCGTAGAT","TTCGGCTTG","TTCGGAATA","TTCGCCAGA","TTCGATTGT","TTCGATCAG","TTCCTCGGT","TTCCGGCAG", + "TTCCGCATT","TTCCAATTA","TTCATTGAA","TTCATGCTG","TTCAGGAGT","TTCACTATA","TTCAACTCT","TTCAACGTG","TTATGCGTT","TTATGATTG", + "TTATCCTGT","TTATCCGAG","TTATATTAT","TTAGGCGCG","TTACTGGAA","TTACTAGTT","TTACGTGGT","TTACGATAT","TTACCTAGA","TTACATGAG", + "TTACAGCGT","TTACACGGA","TTACACACT","TTAATCAGT","TTAATAGGA","TTAAGTGTG","TTAACCTTG","TTAACACAA","TGTTCACTT","TGTTCAAGA", + "TGTTAAGTG","TGTGTTATG","TGTGTCCAA","TGTGGAGCG","TGTCAGTTA","TGTCAGAAG","TGGTTAGTT","TGGTTACAA","TGGCGTTAT","TGGCGCCAA", + "TGGAGTCTT","TGCGTATTG","TGATAGAGA","TGAGGTATT","TGAGAATCT","TCTTGGTAA","TCTTCATAG","TCTGTCCTT","TCTGGAATT","TCTACCGCG", + "TCGTTCGAA","TCGTCAGTG","TCGACGAGA","TCATGGCTT","TCACACTTA","TATTCCGAA","TATTATGGT","TATGCTATT","TATCAAGGA","TAGTTCAAT", + + "TAGCTGCTT","TAGAGGAAG","TACCTGTTA","TACACCTGT","GTTGTGCGT","GTTGGCTAT","GTTGCCAAG","GTTGACCTT","GTTCTGCTA","GTTCTGAAT", + "GTTCTATCA","GTTCGCGTG","GTTCCTTAT","GTTAGCAGT","GTTACTGTG","GTTACTCAA","GTTAAGAGA","GTTAACTTA","GTGTCGGCA","GTGTCCATT", + "GTGCTTGAG","GTGCTCGTT","GTGCTCACA","GTGCCTGGA","GTCTTGTCG","GTCTTGATT","GTCTTCCGT","GTCTTAAGA","GTCTCATCT","GTCTACGAG", + "GTCGTTGCT","GTCGTGTTA","GTCGGTAAT","GTCGGATGT","GTCGAGCTG","GTCCGGACT","GTCCAACAT","GTCAGACGA","GTCAGAATT","GTCACTCTT", + "GTCAAGGAA","GTATGTCTT","GTATGTACA","GTATCGGTT","GTATATGTA","GTATACAAT","GTAGTTAAG","GTAGTCGAT","GTAGCCTTA","GTAGATACT", + "GTACGATTA","GTACAGTCT","GTAATTCGT","GCTTGGCAG","GCTTGCTTG","GCTTGAGGA","GCTTCATTA","GCTTATGCG","GCTGTGTAG","GCTGTCATG", + "GCTGGTTGT","GCTGGACTG","GCTGCCTAA","GCTGATATT","GCTCTTAGT","GCTCTATTG","GCTCGCCGT","GCTCCGCTG","GCTATTCTG","GCTATACGA", + 
"GCTACTAAG","GCTACATGT","GCTAACTCT","GCGTTGTAA","GCGTTCTCT","GCGTGCGTA","GCGTCTTGA","GCGTCCGAT","GCGTAAGAG","GCGCTTACG", + "GCGCGGATT","GCGCCATAT","GCGCATGAA","GCGATCAAT","GCGAGCCTT","GCGAGATTG","GCGAGAACA","GCCTTGGTA","GCCTTCTAG","GCCTTCACA", + "GCCTGAGTG","GCCTCACGT","GCCGGCGAA","GCCGCACAA","GCCATGCTT","GCCATATAT","GCCAATTCG","GCATTCGTT","GCATGATGT","GCAGTTGGA", + + "GCAGTGTCT","GCACTTGTG","GCAATCTGT","GCAACACTT","GATTGTATT","GATTGCGAG","GATTCCAGT","GATTCATAT","GATTATCAG","GATTAGGTT", + "GATGTTGCG","GATGGATCT","GATGCTGAT","GATGCCTTG","GATCTCCTT","GATCGCTTA","GATATTGAA","GATATTACT","GAGTGTTAT","GAGCTCAGT", + "GAGCGTGCT","GAGCGTCGA","GAGCGGTTG","GAGCGACTT","GAGCCGAAT","GAGATAGAT","GAGACCTAT","GACGGTCGT","GACGCAGGT","GACGATATG", + "GACCTATCT","GAATTAGGA","GAATCAGCT","GAAGTTCAT","GAAGTGGTT","GAAGTATTG","GAAGGCATT","GAACGCTGT","CTTGTCCAG","CTTGGATTG", + "CTTGCTGAA","CTTGCCGTG","CTTGATTCT","CTTCTGTCG","CTTCGGCGT","CTTATGAGT","CTTACCGAT","CTGTTAGGT","CTGTCGTCT","CTGTATAAT", + "CTGGCTCAT","CTGGATGCG","CTGCGTGTG","CTGCGCGGT","CTGCCGATT","CTGCATTGT","CTGATTAAG","CTGAGATAT","CTGACCTGT","CTCGTATCT", + "CTCGGCAAG","CTCGCAATT","CTCCTGCTT","CTCCTAAGT","CTCCGGATG","CTCCGAGCG","CTCACAGGT","CTATTCTAT","CTATTAGTG","CTATGAATT", + "CTACATATT","CGTGGCATT","CGTCTTAAT","CGTCTGGTT","CGTCACTGT","CGTAGGTCT","CGGTTCGAG","CGGTTCATT","CGGTGCTCT","CGGTAATTG", + "CGGCCTGAT","CGGATATAG","CGGAATATT","CGCTCCAAT","CGCGTTCGT","CGCAGGTTG","CGAGGATGT","CGAGCTGTT","CGACGGCTT","CCTTGTGTG", + "CCTGTCTCA","CCTGACTAT","CCTACCTTG","CCGTAGATT","CCGGCTGGT","CATCGGACG","CATCGATAA","CATCCTTCT","CAGTTCTGT","CAGTGCCAG", + + "CAGGCACTG","CAGCCTCTT","CACTTATAT","CACTGGTCG","CACTGCATG","CACGCGTTG","CACGATGTT","CACCATCTG","CACAGGCGT","ATTGTACAA", + "ATTGGTATG","ATTGCTAAT","ATTGCATAG","ATTGCAGTT","ATTCTGCAG","ATTCTACGT","ATTCGGATT","ATTCCGTTG","ATTCATCAA","ATTCAAGAG", + "ATTAGCCTT","ATTAATATT","ATGTTAGAG","ATGTTAACT","ATGTAGTCG","ATGGTGTAG","ATGGATTAT","ATCTTGAAG","ATCTGATAT","ATCTCAGAA", + 
"ATCGCTCAA","ATCGCGTCG","ATCCATGGT","ATCATGAGA","ATCATAGTT","ATCAGCGAG","ATCACCATT","ATAGTAATT","ATAGCTGTG","ATACTCTCG", + "ATACCTCAT","AGTTGCGCG","AGTTGAATT","AGTTATGAT","AGTGTCCGT","AGTGGCTTG","AGTGCTTCT","AGTATCATT","AGTACACAA","AGGTATGCG", + "AGGTATAGT","AGGCTACTT","AGGCCAGGT","AGGAGCGAT","AGCTTATAG","AGCTCTAGA","AGCGTGTAT","AGCGTCACA","AGCCTTCAT","AGCCTGTCG", + "AGCCTCGAG","AGCACTGAA","AGATGTACG","AGAGTTAAT","AGACCTCTG","ACTTCTATA","ACTGTCGAG","ACTGTATGT","ACTCTGTAA","ACTCGCGAA", + "ACTAGATCT","ACTAACGTT","ACGTTACTG","ACGTGGAAT","ACGGACTCT","ACGCCTAAT","ACGCCGTTA","ACGACGTGT","ACCTCGCAT","ACCATCATA", + "ACATATATT","ACAGGCACA","ACACCTGAG","ACACATTCT" + ) + +B384_cell_key3 = ("TTGTGGCTG","TTGTGGAGT","TTGTGCGAC","TTGTCTTCA","TTGTAAGAT","TTGGTTCTG","TTGGTGCGT","TTGGTCTAC","TTGGTAACT","TTGGCGTGC", + "TTGGATTAG","TTGGAGACG","TTGGAATCA","TTGCGGCGA","TTGCGCTCG","TTGCCTTAC","TTGCCGGAT","TTGCATGCT","TTGCACGTC","TTGCACCAT", + "TTGAACCTG","TTCTCGCGT","TTCTCAACT","TTCTACTCA","TTCGTCCAT","TTCGGATAC","TTCGGACGT","TTCGCAATC","TTCCGGTGC","TTCCGACTG", + "TTCATTATG","TTCATGGAT","TTCAGCGCA","TTCACCTCG","TTCAAGCAG","TTCAACTAC","TTATGCCAG","TTATGCATC","TTATCGTAC","TTATACCTA", + "TTATAATAG","TTATAAGTC","TTAGTTAGC","TTAGCTCAT","TTAGCACTA","TTAGATATG","TTACTACGA","TTACCGTCA","TTACAGAGC","TTAATTGCA", + "TTAACAGAT","TGTTGGCTA","TGTTGATGA","TGTTAAGCT","TGTGGCCGA","TGTGCTAGC","TGTGCGTCA","TGTCGCAGT","TGTCGAGCA","TGTACAACG", + "TGGTTCCGA","TGGTTCACT","TGGTCAAGT","TGGCTTGTA","TGGCTGTCG","TGGCGTATG","TGGCGCGCT","TGGATGTAC","TGGACTTGC","TGGAATACT", + "TGCTAGCGA","TGCGTTGCT","TGCGGTCTG","TGCGCTTAG","TGCGCGACG","TGCCTGCAT","TGCCTAGAC","TGCACGAGT","TGAGTGTGC","TGAGGCTCG", + "TCTTCCGTC","TCTTATAGT","TCTTACCAT","TCTGTTGTC","TCTGTTACT","TCTGGCTAG","TCTCAGATC","TCTAGTTGA","TCTAGTACG","TCGTACTAC", + "TCGGTGTAG","TCGGCTGCT","TCGCTACTG","TCGATCACG","TCGAGGCAT","TCCGGCGTC","TCCGGAGCT","TCCGCTCGT","TCCGAGTAC","TCCATTCAT", + + 
"TCCATGGTC","TCCAAGTCG","TCATTACGT","TCATGCACT","TCAGGTTGC","TCAGACCGT","TCACTCAGT","TCAAGCTCA","TATTGCGCA","TATTCGGCT", + "TATTCCAGC","TATTCATCA","TATGTTCAG","TATGGTATG","TATGCAAGT","TATCTGGTC","TATCTGACT","TATCCAGAT","TATCAGTCG","TATCACGCT", + "TAGGCGCGA","TAGGCACAT","TAGGATCGT","TAGCATTGC","TAGAGTTAC","TAGACTGAT","TACTTGTCG","TACGTCCGA","TACCGTACT","TACCGCGAT", + "TACCAGGAC","TACAGAAGT","TAAGTGCAT","TAAGCTACT","GTTGACCGA","GTTCTCGAC","GTTCCTGCT","GTTATGATG","GTGCTTGCA","GTGCCGCGT", + "GTATTGCTG","GTATTCCGA","GTATTAAGC","GTATGACGT","GTAGTTGTC","GTAGTACAT","GTAGCTCGA","GGTTGCTCA","GGTTGAGTA","GGTTAACGT", + "GGTGTGGCA","GGTCTTCAG","GGTCGTCTA","GGTCGGCGT","GGTCCGACT","GGTCATGTC","GGTCACATG","GGTAGTGCT","GGTAGCGTC","GGTACCAGT", + "GGTAAGGAT","GGCTTGTGC","GGCTTGACT","GGCTTACGA","GGCTGTAGT","GGCTGGCAG","GGCTCCATC","GGCGTGGAT","GGCGTAATC","GGCGCAAGT", + "GGCGAGTAG","GGCGACCGT","GGCCTGTCA","GGCCATTGC","GGCACTCTG","GGATGTCAT","GGAGTAACT","GGAGAACGA","GGACTGGCT","GGACGTTCA", + "GGAACGTGC","GCTGTCCAT","GCTGGTTCA","GCTGCAACT","GCTCGTTAC","GCTATAGAT","GCTAGTCGT","GCTACCATG","GCGTTCTGA","GCGTGTTAG", + "GCGGTATCG","GCGGAGCAT","GCGCGGTGC","GCGCCTAGT","GCGCCGGCT","GCCTTCATG","GCCATACTG","GCATGTTGA","GCATGCTAC","GCAGTATAC", + + "GCAGGTACT","GCAGCGCGT","GCACCTCAT","GCAATTCGA","GATTGCCGT","GATGAACAT","GATCTTCGA","GATCTGCAT","GAGTGGCAT","GAGTCGGAC", + "GAGTATGAT","GAGGCGAGT","GAGGCAACG","GAGCGCACT","GAATAGGCT","ATTGTCACT","ATTGTATCA","ATTGGTCAG","ATTGGCGAT","ATTGATCGT", + "ATTCGTAGT","ATTCATACG","ATTCAGGAC","ATTACTTCA","ATTAATTAG","ATTAAGCAT","ATGTCTCTA","ATGTAGCGT","ATGGCATAC","ATGGAGATC", + "ATGGACTCG","ATGGAACGA","ATGCTTCAT","ATGCTCGCT","ATGCGACGT","ATGCCGTAG","ATGAGTTCG","ATGACTATC","ATGACCGAC","ATCTTATGC", + "ATCTTACTA","ATCTATCAG","ATCGTGTAC","ATCGTCTGA","ATCGGCATG","ATCGCGAGC","ATCGCAACG","ATCGATGCT","ATCGAATAG","ATCCTTCTG", + "ATCCTGCGT","ATCCGCACT","ATCCATTAC","ATCCAAGCA","ATCAGATCA","ATCACACAT","ATCAACGTC","ATCAACCGA","ATATTGAGT","ATATTCGTC", + 
"ATATTACAG","ATATCTTGA","ATATCGCAT","ATATCAATC","ATAGTCCTG","ATAGGTCTA","ATAGCTGAC","ATAGCGGTA","AGTTCGCTG","AGTTACAGC", + "AGTTAACTA","AGTGCAATC","AGTCTGGTA","AGTCTGAGC","AGTCTACAT","AGTCGAACT","AGTCCATCG","AGTCATTCA","AGTATCCAG","AGTAGACTG", + "AGTAATCGA","AGTAAGTGC","AGGTTGGCT","AGGTTCTAG","AGGTGTTCA","AGGTGCCAT","AGGTCTGAT","AGGTCGTAC","AGGTCAGCA","AGGCTTATC", + "AGGCTATGA","AGGCCGACG","AGGCCAAGC","AGGCAGGTC","AGGCAAGAT","AGGAGCAGT","AGGACCGCT","AGGAATTAC","AGCTTGGAC","AGCTTAAGT", + + "AGCTACACG","AGCGTTACG","AGCGGTGCA","AGCGGAGTC","AGCGGACGA","AGCGCGCTA","AGCGATAGC","AGCGACTCA","AGCCTCTAC","AGCCGTCGT", + "AGCATGATC","AGCACTTCG","AGCACGGCA","AGATTCTGA","AGATTAGAT","AGATGATAG","AGATATGTA","AGATACCGT","AGAGTGCGT","AGAGCCGAT", + "AGACTCACT","ACTTGCCTA","ACTTGAGCA","ACTTCTAGC","ACTTCGACT","ACTTAGTAC","ACTGTTGAT","ACTGTAACG","ACTGGTATC","ACTGACGTC", + "ACTGAAGCT","ACTCTGATG","ACTCCTGAC","ACTCCGCTA","ACTCAACTG","ACTATTGCA","ACTAGGCAG","ACTACGCGT","ACTAATACT","ACGTTCGTA", + "ACGTGTGCT","ACGTGTATG","ACGTGGAGC","ACGTCTTCG","ACGTCAGTC","ACGGTCTCA","ACGGTCCGT","ACGGTACAG","ACGGCGCTG","ACGCTGCGA", + "ACGCGTGTA","ACGCGCCAG","ACGATGTCG","ACGATGGAT","ACGATCTAC","ACGAGCTGA","ACGAGCATC","ACGAATCGT","ACGAACGCA","ACCTTGTAG", + "ACCTGTTGC","ACCTGTCAT","ACCTCGATC","ACCTAGGTA","ACCTACTGA","ACCTAATCG","ACCGTAGCA","ACCGGTAGT","ACCGGCTAC","ACCGCTTCA", + "ACATTGTGC","ACATTCTCG","ACATGGCTG","ACATGACGA","ACATATGAT","ACATATACG","ACAGCGTAC","ACACTTGCT","ACACTATCA","ACACGCATG", + "ACACCAGTA","ACACCAACT","ACACATAGT","ACACACCTA" + ) + + +def label_sections_to_index(label): + """ + Return the cell_index integer based on input 3 part cell label string + + """ + + cl1, cl2, cl3 = [int(n) for n in label.split('-')] + return (cl1 - 1) * 384 * 384 + (cl2 - 1) * 384 + (cl3 - 1) + 1 + + +# print(label_sections_to_index('1-1-1')) +# print(label_sections_to_index('33-78-21')) +# print(label_sections_to_index('43-12-77')) +# print(label_sections_to_index('96-96-96')) +# 
print(label_sections_to_index('135-43-344')) +# print(label_sections_to_index('384-384-384')) +# print('-') + +#---------------------------------- + + +def index_to_label_sections(index): + + zerobased = int(index) - 1 + + cl1 = (int((zerobased) / 384 / 384) % 384) + 1 + cl2 = (int((zerobased) / 384) % 384) + 1 + cl3 = (zerobased % 384) + 1 + + return f'{cl1}-{cl2}-{cl3}' + + +# print(index_to_label_sections(1)) +# print(index_to_label_sections(4748181)) +# print(index_to_label_sections(6197453)) +# print(index_to_label_sections(14044896)) +# print(index_to_label_sections(19775576)) +# print(index_to_label_sections(56623104)) +# print('-') +#---------------------------------- + + +def index_to_sequence(index, bead_version): + + zerobased = int(index) - 1 + + cl1 = (int((zerobased) / 384 / 384) % 384) + 1 + cl2 = (int((zerobased) / 384) % 384) + 1 + cl3 = (zerobased % 384) + 1 + + if bead_version == 'v1': + cls1_sequence = A96_cell_key1[cl1-1] + cls2_sequence = A96_cell_key2[cl2-1] + cls3_sequence = A96_cell_key3[cl3-1] + + return f'{cls1_sequence}{v1_linker1}{cls2_sequence}{v1_linker2}{cls3_sequence}' + + elif bead_version == 'Enh': + + diversityInsert = '' + + if 1 <= cl1 <= 24: + diversityInsert = '' + elif 25 <= cl1 <= 48: + diversityInsert = 'A' + elif 49 <= cl1 <= 72: + diversityInsert = 'GT' + else: # 73 <= cl1 <= 96: + diversityInsert = 'TCA' + + cls1_sequence = A96_cell_key1[cl1-1] + cls2_sequence = A96_cell_key2[cl2-1] + cls3_sequence = A96_cell_key3[cl3-1] + + return f'{diversityInsert}{cls1_sequence}{Enh_linker1}{cls2_sequence}{Enh_linker2}{cls3_sequence}' + + elif bead_version == 'EnhV2': + + diversityInsert = '' + subIndex = ((cl1-1) % 96) + 1 + + if 1 <= subIndex <= 24: + diversityInsert = '' + elif 25 <= subIndex <= 48: + diversityInsert = 'A' + elif 49 <= subIndex <= 72: + diversityInsert = 'GT' + else: # 73 <= subIndex <= 96: + diversityInsert = 'TCA' + + cls1_sequence = B384_cell_key1[cl1-1] + cls2_sequence = B384_cell_key2[cl2-1] + cls3_sequence 
= B384_cell_key3[cl3-1] + + return f'{diversityInsert}{cls1_sequence}{Enh_linker1}{cls2_sequence}{Enh_linker2}{cls3_sequence}' + + +# print(index_to_sequence(4748181, 'Enh')) +# print(index_to_sequence(52923177, 'EnhV2')) + +#---------------------------------- + + +def create_cell_index_fasta_V1(): + with open('Rhapsody_cellBarcodeV1_IndexToSequence.fasta', 'w') as f: + for cl1 in range(1, 96+1): + for cl2 in range(1, 96+1): + for cl3 in range(1, 96+1): + index = label_sections_to_index(f'{cl1}-{cl2}-{cl3}') + sequence = index_to_sequence(index, 'v1') + f.write(f'>{index}\n') + f.write(f'{sequence}\n') + +#create_cell_index_fasta_V1() + + +def create_cell_index_fasta_Enh(): + with open('Rhapsody_cellBarcodeEnh_IndexToSequence.fasta', 'w') as f: + for cl1 in range(1, 96+1): + for cl2 in range(1, 96+1): + for cl3 in range(1, 96+1): + index = label_sections_to_index(f'{cl1}-{cl2}-{cl3}') + sequence = index_to_sequence(index, 'Enh') + f.write(f'>{index}\n') + f.write(f'{sequence}\n') + +#create_cell_index_fasta_Enh() + +def create_cell_index_fasta_EnhV2(): + with open('Rhapsody_cellBarcodeEnhV2_IndexToSequence.fasta', 'w') as f: + for cl1 in range(1, 384+1): + for cl2 in range(1, 384+1): + for cl3 in range(1, 384+1): + index = label_sections_to_index(f'{cl1}-{cl2}-{cl3}') + sequence = index_to_sequence(index, 'EnhV2') + f.write(f'>{index}\n') + f.write(f'{sequence}\n') + +#create_cell_index_fasta_EnhV2() diff --git a/src/bd_rhapsody/test_data/BDAbSeq_ImmuneDiscoveryPanel.fasta b/src/bd_rhapsody/test_data/BDAbSeq_ImmuneDiscoveryPanel.fasta new file mode 100644 index 00000000..930add4a --- /dev/null +++ b/src/bd_rhapsody/test_data/BDAbSeq_ImmuneDiscoveryPanel.fasta @@ -0,0 +1,60 @@ +>CD11c:B-LY6|ITGAX|AHS0056|pAbO Catalog_940024 +ATGCGTTGCGAGAGATATGCGTAGGTTGCTGATTGG +>CD14:MPHIP9|CD14|AHS0037|pAbO Catalog_940005 +TGGCCCGTGGTAGCGCAATGTGAGATCGTAATAAGT +>CXCR5|CXCR5|AHS0039|pAbO Catalog_940042 +AGGAAGGTCGATTGTATAACGCGGCATTGTAACGGC +>CD19:SJ25C1|CD19|AHS0030|pAbO 
Catalog_940004 +TAGTAATGTGTTCGTAGCCGGTAATAATCTTCGTGG +>CD25:2A3|IL2RA|AHS0026|pAbO Catalog_940009 +AGTTGTATGGGTTAGCCGAGAGTAGTGCGTATGATT +>CD27:M-T271|CD27|AHS0025|pAbO Catalog_940018 +TGTCCGGTTTAGCGAATTGGGTTGAGTCACGTAGGT +>CD278|ICOS|AHS0012|pAbO Catalog_940043 +ATAGTCCGCCGTAATCGTTGTGTCGCTGAAAGGGTT +>CD279:EH12-1|PDCD1|AHS0014|pAbO Catalog_940015 +ATGGTAGTATCACGACGTAGTAGGGTAATTGGCAGT +>CD3:UCHT1|CD3E|AHS0231|pAbO Catalog_940307 +AGCTAGGTGTTATCGGCAAGTTGTACGGTGAAGTCG +>GITR|TNFRSF18|AHS0104|pAbO Catalog_940096 +TCTGTGTGTCGGGTTGAATCGTAGTGAGTTAGCGTG +>Tim3|HAVCR2|AHS0016|pAbO Catalog_940066 +TAGGTAGTAGTCCCGTATATCCGATCCGTGTTGTTT +>CD4:SK3|CD4|AHS0032|pAbO Catalog_940001 +TCGGTGTTATGAGTAGGTCGTCGTGCGGTTTGATGT +>CD45RA:HI100|PTPRC|AHS0009|pAbO Catalog_940011 +AAGCGATTGCGAAGGGTTAGTCAGTACGTTATGTTG +>CD56:NCAM16.2|NCAM1|AHS0019|pAbO Catalog_940007 +AGAGGTTGAGTCGTAATAATAATCGGAAGGCGTTGG +>CD62L:DREG-56|SELL|AHS0049|pAbO Catalog_940041 +ATGGTAAATATGGGCGAATGCGGGTTGTGCTAAAGT +>CCR7|CCR7|AHS0273|pAbO Catalog_940394 +AATGTGTGATCGGCAAAGGGTTCTCGGGTTAATATG +>CXCR6|CXCR6|AHS0148|pAbO Catalog_940234 +GTGGTTGGTTATTCGGACGGTTCTATTGTGAGCGCT +>CD127|IL7R|AHS0028|pAbO Catalog_940012 +AGTTATTAGGCTCGTAGGTATGTTTAGGTTATCGCG +>CD134:ACT35|TNFRSF4|AHS0013|pAbO Catalog_940060 +GGTGTTGGTAAGACGGACGGAGTAGATATTCGAGGT +>CD28:L293|CD28|AHS0138|pAbO Catalog_940226 +TTGTTGAGGATACGATGAAGCGGTTTAAGGGTGTGG +>CD272|BTLA|AHS0052|pAbO Catalog_940105 +GTAGGTTGATAGTCGGCGATAGTGCGGTTGAAAGCT +>CD8:SK1|CD8A|AHS0228|pAbO Catalog_940305 +AGGACATAGAGTAGGACGAGGTAGGCTTAAATTGCT +>HLA-DR|CD74|AHS0035|pAbO Catalog_940010 +TGTTGGTTATTCGTTAGTGCATCCGTTTGGGCGTGG +>CD16:3G8|FCGR3A|AHS0053|pAbO Catalog_940006 +TAAATCTAATCGCGGTAACATAACGGTGGGTAAGGT +>CD183|CXCR3|AHS0031|pAbO Catalog_940030 +AAAGTGTTGGCGTTATGTGTTCGTTAGCGGTGTGGG +>CD196|CCR6|AHS0034|pAbO Catalog_940033 +ACGTGTTATGGTGTTGTTCGAATTGTGGTAGTCAGT +>CD137|TNFRSF9|AHS0003|pAbO Catalog_940055 +TGACAAGCAACGAGCGATACGAAAGGCGAAATTAGT +>CD161:HP-3G10|KLRB1|AHS0205|pAbO Catalog_940283 
+TTTAGGACGATTAGTTGTGCGGCATAGGAGGTGTTC +>IgM|IGHM|AHS0198|pAbO Catalog_940276 +TTTGGAGGGTAGCTAGTTGCAGTTCGTGGTCGTTTC +>IgD|IGHD|AHS0058|pAbO Catalog_940026 +TGAGGGATGTATAGCGAGAATTGCGACCGTAGACTT diff --git a/src/bd_rhapsody/test_data/SampleTagSequences_HomoSapiens_ver1.fasta b/src/bd_rhapsody/test_data/SampleTagSequences_HomoSapiens_ver1.fasta new file mode 100644 index 00000000..3d5a42fa --- /dev/null +++ b/src/bd_rhapsody/test_data/SampleTagSequences_HomoSapiens_ver1.fasta @@ -0,0 +1,24 @@ +>SampleTag01_hs|stAbO +GTTGTCAAGATGCTACCGTTCAGAGATTCAAGGGCAGCCGCGTCACGATTGGATACGACTGTTGGACCGG +>SampleTag02_hs|stAbO +GTTGTCAAGATGCTACCGTTCAGAGTGGATGGGATAAGTGCGTGATGGACCGAAGGGACCTCGTGGCCGG +>SampleTag03_hs|stAbO +GTTGTCAAGATGCTACCGTTCAGAGCGGCTCGTGCTGCGTCGTCTCAAGTCCAGAAACTCCGTGTATCCT +>SampleTag04_hs|stAbO +GTTGTCAAGATGCTACCGTTCAGAGATTGGGAGGCTTTCGTACCGCTGCCGCCACCAGGTGATACCCGCT +>SampleTag05_hs|stAbO +GTTGTCAAGATGCTACCGTTCAGAGCTCCCTGGTGTTCAATACCCGATGTGGTGGGCAGAATGTGGCTGG +>SampleTag06_hs|stAbO +GTTGTCAAGATGCTACCGTTCAGAGTTACCCGCAGGAAGACGTATACCCCTCGTGCCAGGCGACCAATGC +>SampleTag07_hs|stAbO +GTTGTCAAGATGCTACCGTTCAGAGTGTCTACGTCGGACCGCAAGAAGTGAGTCAGAGGCTGCACGCTGT +>SampleTag08_hs|stAbO +GTTGTCAAGATGCTACCGTTCAGAGCCCCACCAGGTTGCTTTGTCGGACGAGCCCGCACAGCGCTAGGAT +>SampleTag09_hs|stAbO +GTTGTCAAGATGCTACCGTTCAGAGGTGATCCGCGCAGGCACACATACCGACTCAGATGGGTTGTCCAGG +>SampleTag10_hs|stAbO +GTTGTCAAGATGCTACCGTTCAGAGGCAGCCGGCGTCGTACGAGGCACAGCGGAGACTAGATGAGGCCCC +>SampleTag11_hs|stAbO +GTTGTCAAGATGCTACCGTTCAGAGCGCGTCCAATTTCCGAAGCCCCGCCCTAGGAGTTCCCCTGCGTGC +>SampleTag12_hs|stAbO +GTTGTCAAGATGCTACCGTTCAGAGGCCCATTCATTGCACCCGCCAGTGATCGACCCTAGTGGAGCTAAG diff --git a/src/bd_rhapsody/bd_rhapsody_make_reference/test_data/reference_small.fa b/src/bd_rhapsody/test_data/reference_small.fa similarity index 100% rename from src/bd_rhapsody/bd_rhapsody_make_reference/test_data/reference_small.fa rename to src/bd_rhapsody/test_data/reference_small.fa diff --git 
a/src/bd_rhapsody/bd_rhapsody_make_reference/test_data/reference_small.gtf b/src/bd_rhapsody/test_data/reference_small.gtf similarity index 100% rename from src/bd_rhapsody/bd_rhapsody_make_reference/test_data/reference_small.gtf rename to src/bd_rhapsody/test_data/reference_small.gtf diff --git a/src/bd_rhapsody/test_data/script.sh b/src/bd_rhapsody/test_data/script.sh new file mode 100644 index 00000000..f8db0313 --- /dev/null +++ b/src/bd_rhapsody/test_data/script.sh @@ -0,0 +1,141 @@ +#!/bin/bash + +TMP_DIR=/tmp/bd_rhapsody_make_reference +OUT_DIR=src/bd_rhapsody/test_data + +# check if seqkit is installed +if ! command -v seqkit &> /dev/null; then + echo "seqkit could not be found" + exit 1 +fi + +# create temporary directory and clean up on exit +mkdir -p $TMP_DIR +function clean_up { + rm -rf "$TMP_DIR" +} +trap clean_up EXIT + +# fetch reference +ORIG_FA=$TMP_DIR/reference.fa.gz +if [ ! -f $ORIG_FA ]; then + wget https://ftp.ebi.ac.uk/pub/databases/gencode/Gencode_human/release_41/GRCh38.primary_assembly.genome.fa.gz \ + -O $ORIG_FA +fi + +ORIG_GTF=$TMP_DIR/reference.gtf.gz +if [ ! 
-f $ORIG_GTF ]; then + wget https://ftp.ebi.ac.uk/pub/databases/gencode/Gencode_human/release_41/gencode.v41.annotation.gtf.gz \ + -O $ORIG_GTF +fi + +# create small reference +START=30000 +END=31500 +CHR=chr1 + +# subset to small region +seqkit grep -r -p "^$CHR\$" "$ORIG_FA" | \ + seqkit subseq -r "$START:$END" > $OUT_DIR/reference_small.fa + +zcat "$ORIG_GTF" | \ + awk -v FS='\t' -v OFS='\t' " + \$1 == \"$CHR\" && \$4 >= $START && \$5 <= $END { + \$4 = \$4 - $START + 1; + \$5 = \$5 - $START + 1; + print; + }" > $OUT_DIR/reference_small.gtf + +# download bdabseq immunediscoverypanel fasta +# note: was contained in http://bd-rhapsody-public.s3.amazonaws.com/Rhapsody-Demo-Data-Inputs/12WTA-ABC-SMK-EB-5kJRT.tar +cat > $OUT_DIR/BDAbSeq_ImmuneDiscoveryPanel.fasta <CD11c:B-LY6|ITGAX|AHS0056|pAbO Catalog_940024 +ATGCGTTGCGAGAGATATGCGTAGGTTGCTGATTGG +>CD14:MPHIP9|CD14|AHS0037|pAbO Catalog_940005 +TGGCCCGTGGTAGCGCAATGTGAGATCGTAATAAGT +>CXCR5|CXCR5|AHS0039|pAbO Catalog_940042 +AGGAAGGTCGATTGTATAACGCGGCATTGTAACGGC +>CD19:SJ25C1|CD19|AHS0030|pAbO Catalog_940004 +TAGTAATGTGTTCGTAGCCGGTAATAATCTTCGTGG +>CD25:2A3|IL2RA|AHS0026|pAbO Catalog_940009 +AGTTGTATGGGTTAGCCGAGAGTAGTGCGTATGATT +>CD27:M-T271|CD27|AHS0025|pAbO Catalog_940018 +TGTCCGGTTTAGCGAATTGGGTTGAGTCACGTAGGT +>CD278|ICOS|AHS0012|pAbO Catalog_940043 +ATAGTCCGCCGTAATCGTTGTGTCGCTGAAAGGGTT +>CD279:EH12-1|PDCD1|AHS0014|pAbO Catalog_940015 +ATGGTAGTATCACGACGTAGTAGGGTAATTGGCAGT +>CD3:UCHT1|CD3E|AHS0231|pAbO Catalog_940307 +AGCTAGGTGTTATCGGCAAGTTGTACGGTGAAGTCG +>GITR|TNFRSF18|AHS0104|pAbO Catalog_940096 +TCTGTGTGTCGGGTTGAATCGTAGTGAGTTAGCGTG +>Tim3|HAVCR2|AHS0016|pAbO Catalog_940066 +TAGGTAGTAGTCCCGTATATCCGATCCGTGTTGTTT +>CD4:SK3|CD4|AHS0032|pAbO Catalog_940001 +TCGGTGTTATGAGTAGGTCGTCGTGCGGTTTGATGT +>CD45RA:HI100|PTPRC|AHS0009|pAbO Catalog_940011 +AAGCGATTGCGAAGGGTTAGTCAGTACGTTATGTTG +>CD56:NCAM16.2|NCAM1|AHS0019|pAbO Catalog_940007 +AGAGGTTGAGTCGTAATAATAATCGGAAGGCGTTGG +>CD62L:DREG-56|SELL|AHS0049|pAbO Catalog_940041 
+ATGGTAAATATGGGCGAATGCGGGTTGTGCTAAAGT +>CCR7|CCR7|AHS0273|pAbO Catalog_940394 +AATGTGTGATCGGCAAAGGGTTCTCGGGTTAATATG +>CXCR6|CXCR6|AHS0148|pAbO Catalog_940234 +GTGGTTGGTTATTCGGACGGTTCTATTGTGAGCGCT +>CD127|IL7R|AHS0028|pAbO Catalog_940012 +AGTTATTAGGCTCGTAGGTATGTTTAGGTTATCGCG +>CD134:ACT35|TNFRSF4|AHS0013|pAbO Catalog_940060 +GGTGTTGGTAAGACGGACGGAGTAGATATTCGAGGT +>CD28:L293|CD28|AHS0138|pAbO Catalog_940226 +TTGTTGAGGATACGATGAAGCGGTTTAAGGGTGTGG +>CD272|BTLA|AHS0052|pAbO Catalog_940105 +GTAGGTTGATAGTCGGCGATAGTGCGGTTGAAAGCT +>CD8:SK1|CD8A|AHS0228|pAbO Catalog_940305 +AGGACATAGAGTAGGACGAGGTAGGCTTAAATTGCT +>HLA-DR|CD74|AHS0035|pAbO Catalog_940010 +TGTTGGTTATTCGTTAGTGCATCCGTTTGGGCGTGG +>CD16:3G8|FCGR3A|AHS0053|pAbO Catalog_940006 +TAAATCTAATCGCGGTAACATAACGGTGGGTAAGGT +>CD183|CXCR3|AHS0031|pAbO Catalog_940030 +AAAGTGTTGGCGTTATGTGTTCGTTAGCGGTGTGGG +>CD196|CCR6|AHS0034|pAbO Catalog_940033 +ACGTGTTATGGTGTTGTTCGAATTGTGGTAGTCAGT +>CD137|TNFRSF9|AHS0003|pAbO Catalog_940055 +TGACAAGCAACGAGCGATACGAAAGGCGAAATTAGT +>CD161:HP-3G10|KLRB1|AHS0205|pAbO Catalog_940283 +TTTAGGACGATTAGTTGTGCGGCATAGGAGGTGTTC +>IgM|IGHM|AHS0198|pAbO Catalog_940276 +TTTGGAGGGTAGCTAGTTGCAGTTCGTGGTCGTTTC +>IgD|IGHD|AHS0058|pAbO Catalog_940026 +TGAGGGATGTATAGCGAGAATTGCGACCGTAGACTT +EOF + +# this was obtained by running the command: +# docker run bdgenomics/rhapsody:2.2.1 cat /rhapsody/control_files/SampleTagSequences_HomoSapiens_ver1.fasta +cat > $OUT_DIR/SampleTagSequences_HomoSapiens_ver1.fasta <SampleTag01_hs|stAbO +GTTGTCAAGATGCTACCGTTCAGAGATTCAAGGGCAGCCGCGTCACGATTGGATACGACTGTTGGACCGG +>SampleTag02_hs|stAbO +GTTGTCAAGATGCTACCGTTCAGAGTGGATGGGATAAGTGCGTGATGGACCGAAGGGACCTCGTGGCCGG +>SampleTag03_hs|stAbO +GTTGTCAAGATGCTACCGTTCAGAGCGGCTCGTGCTGCGTCGTCTCAAGTCCAGAAACTCCGTGTATCCT +>SampleTag04_hs|stAbO +GTTGTCAAGATGCTACCGTTCAGAGATTGGGAGGCTTTCGTACCGCTGCCGCCACCAGGTGATACCCGCT +>SampleTag05_hs|stAbO +GTTGTCAAGATGCTACCGTTCAGAGCTCCCTGGTGTTCAATACCCGATGTGGTGGGCAGAATGTGGCTGG +>SampleTag06_hs|stAbO 
+GTTGTCAAGATGCTACCGTTCAGAGTTACCCGCAGGAAGACGTATACCCCTCGTGCCAGGCGACCAATGC +>SampleTag07_hs|stAbO +GTTGTCAAGATGCTACCGTTCAGAGTGTCTACGTCGGACCGCAAGAAGTGAGTCAGAGGCTGCACGCTGT +>SampleTag08_hs|stAbO +GTTGTCAAGATGCTACCGTTCAGAGCCCCACCAGGTTGCTTTGTCGGACGAGCCCGCACAGCGCTAGGAT +>SampleTag09_hs|stAbO +GTTGTCAAGATGCTACCGTTCAGAGGTGATCCGCGCAGGCACACATACCGACTCAGATGGGTTGTCCAGG +>SampleTag10_hs|stAbO +GTTGTCAAGATGCTACCGTTCAGAGGCAGCCGGCGTCGTACGAGGCACAGCGGAGACTAGATGAGGCCCC +>SampleTag11_hs|stAbO +GTTGTCAAGATGCTACCGTTCAGAGCGCGTCCAATTTCCGAAGCCCCGCCCTAGGAGTTCCCCTGCGTGC +>SampleTag12_hs|stAbO +GTTGTCAAGATGCTACCGTTCAGAGGCCCATTCATTGCACCCGCCAGTGATCGACCCTAGTGGAGCTAAG +EOF From 619f1bbb6d040e650233d3b0380f5298e624ecef Mon Sep 17 00:00:00 2001 From: Emma Rousseau Date: Wed, 18 Sep 2024 15:45:08 +0200 Subject: [PATCH 24/28] Rsem-calculate-expression (#93) * initial commit dedup * Revert "initial commit dedup" This reverts commit 38f586bec0ac9e4312b016e29c3aa0bd53f292b2. * three rsem components initial commit * update container setup * Simplified container configuration * temporarily remove version recording from config * Complete config file * add tests and complete config file * change test dataset * functional test, adjustements to scripts * Update changelog * Simplified test data and help.txt contents * suggested changes, typos * simplify, get rid of test_data folder * Update CHANGELOG.md * Update CHANGELOG.md --------- Co-authored-by: Robrecht Cannoodt --- CHANGELOG.md | 2 + .../rsem_calculate_expression/config.vsh.yaml | 479 ++++++++ src/rsem/rsem_calculate_expression/help.txt | 1002 +++++++++++++++++ src/rsem/rsem_calculate_expression/script.sh | 103 ++ src/rsem/rsem_calculate_expression/test.sh | 116 ++ 5 files changed, 1702 insertions(+) create mode 100644 src/rsem/rsem_calculate_expression/config.vsh.yaml create mode 100644 src/rsem/rsem_calculate_expression/help.txt create mode 100644 src/rsem/rsem_calculate_expression/script.sh create mode 100644 src/rsem/rsem_calculate_expression/test.sh 
diff --git a/CHANGELOG.md b/CHANGELOG.md index 07a83c15..9bfb5606 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -7,6 +7,8 @@ * `bd_rhapsody/bd_rhapsody_sequence_analysis`: BD Rhapsody Sequence Analysis CWL pipeline (PR #96). +* `rsem/rsem_calculate_expression`: Calculate expression levels (PR #93). + ## MINOR CHANGES * Upgrade to Viash 0.9.0. diff --git a/src/rsem/rsem_calculate_expression/config.vsh.yaml b/src/rsem/rsem_calculate_expression/config.vsh.yaml new file mode 100644 index 00000000..2cd950cb --- /dev/null +++ b/src/rsem/rsem_calculate_expression/config.vsh.yaml @@ -0,0 +1,479 @@ +name: "rsem_calculate_expression" +namespace: "rsem" +description: | + Calculate expression with RSEM. +keywords: [Transcriptome, Index, Alignment, RSEM] +links: + homepage: https://deweylab.github.io/RSEM/ + documentation: https://deweylab.github.io/RSEM/rsem-calculate-expression.html + repository: https://github.com/deweylab/RSEM +references: + doi: https://doi.org/10.1186/1471-2105-12-323 +license: GPL-3.0 + + +argument_groups: +- name: "Input" + arguments: + - name: "--id" + type: string + description: Sample ID. + - name: "--strandedness" + type: string + description: Sample strand-specificity. Must be one of unstranded, forward, reverse + choices: [forward, reverse, unstranded] + - name: "--paired" + type: boolean_true + description: Paired-end reads or not? + - name: "--input" + type: file + description: Input reads for quantification. + multiple: true + - name: "--index" + type: file + must_exist: false + description: RSEM index. + - name: "--extra_args" + type: string + description: Extra rsem-calculate-expression arguments in addition to the examples. 
+ +- name: "Output" + arguments: + - name: "--counts_gene" + type: file + description: Expression counts on gene level + example: $id.genes.results + direction: output + - name: "--counts_transcripts" + type: file + description: Expression counts on transcript level + example: $id.isoforms.results + direction: output + - name: "--stat" + type: file + description: RSEM statistics + example: $id.stat + direction: output + - name: "--logs" + type: file + description: RSEM logs + example: $id.log + direction: output + - name: "--bam_star" + type: file + description: BAM file generated by STAR (optional) + example: $id.STAR.genome.bam + direction: output + - name: "--bam_genome" + type: file + description: Genome BAM file (optional) + example: $id.genome.bam + direction: output + - name: "--bam_transcript" + type: file + description: Transcript BAM file (optional) + example: $id.transcript.bam + direction: output + - name: "--sort_bam_by_read_name" + type: boolean_true + description: | + Sort BAM file aligned under transcript coordidate by read name. Setting this option on will produce + deterministic maximum likelihood estimations from independent runs. Note that sorting will take long + time and lots of memory. + - name: "--no_bam_output" + type: boolean_true + description: Do not output any BAM file. + - name: "--sampling_for_bam" + type: boolean_true + description: | + When RSEM generates a BAM file, instead of outputting all alignments a read has with their posterior + probabilities, one alignment is sampled according to the posterior probabilities. The sampling procedure + includes the alignment to the "noise" transcript, which does not appear in the BAM file. Only the + sampled alignment has a weight of 1. All other alignments have weight 0. If the "noise" transcript is + sampled, all alignments appeared in the BAM file should have weight 0. 
+ - name: "--output_genome_bam" + type: boolean_true + description: | + Generate a BAM file, 'sample_name.genome.bam', with alignments mapped to genomic coordinates and + annotated with their posterior probabilities. In addition, RSEM will call samtools (included in RSEM + package) to sort and index the bam file. 'sample_name.genome.sorted.bam' and 'sample_name.genome.sorted.bam.bai' + will be generated. + - name: "--sort_bam_by_coordinate" + type: boolean_true + description: | + Sort RSEM generated transcript and genome BAM files by coordinates and build associated indices. + +- name: "Basic Options" + arguments: + - name: "--no_qualities" + type: boolean_true + description: Input reads do not contain quality scores. + - name: "--alignments" + type: boolean_true + description: | + Input file contains alignments in SAM/BAM/CRAM format. The exact file format will be determined + automatically. + - name: "--fai" + type: file + description: | + If the header section of input alignment file does not contain reference sequence information, + this option should be turned on. is a FAI format file containing each reference sequence's + name and length. Please refer to the SAM official website for the details of FAI format. + - name: "--bowtie2" + type: boolean_true + description: | + Use Bowtie 2 instead of Bowtie to align reads. Since currently RSEM does not handle indel, local + and discordant alignments, the Bowtie2 parameters are set in a way to avoid those alignments. In + particular, we use options '--sensitive --dpad 0 --gbar 99999999 --mp 1,1 --np 1 --score_min L,0,-0.1' + by default. The last parameter of '--score_min', '-0.1', is the negative of maximum mismatch rate. + This rate can be set by option '--bowtie2_mismatch_rate'. If reads are paired-end, we additionally + use options '--no_mixed' and '--no_discordant'. + - name: "--star" + type: boolean_true + description: | + Use STAR to align reads. Alignment parameters are from ENCODE3's STAR-RSEM pipeline. 
To save + computational time and memory resources, STAR's Output BAM file is unsorted. It is stored in RSEM's + temporary directory with name as 'sample_name.bam'. Each STAR job will have its own private copy of + the genome in memory. + - name: "--hisat2_hca" + type: boolean_true + description: | + Use HISAT2 to align reads to the transcriptome according to Human Cell Atlas. + - name: "--append_names" + type: boolean_true + description: | + If gene_name/transcript_name is available, append it to the end of gene_id/transcript_id (separated + by '_') in files 'sample_name.isoforms.results' and 'sample_name.genes.results'. + - name: "--seed" + type: integer + description: | + Set the seed for the random number generators used in calculating posterior mean estimates and + credibility intervals. The seed must be a non-negative 32 bit integer. + - name: "--single_cell_prior" + type: boolean_true + description: | + By default, RSEM uses Dirichlet(1) as the prior to calculate posterior mean estimates and credibility + intervals. However, much less genes are expressed in single cell RNA-Seq data. Thus, if you want to + compute posterior mean estimates and/or credibility intervals and you have single-cell RNA-Seq data, + you are recommended to turn on this option. Then RSEM will use Dirichlet(0.1) as the prior which + encourage the sparsity of the expression levels. + - name: "--calc_pme" + type: boolean_true + description: Run RSEM's collapsed Gibbs sampler to calculate posterior mean estimates. + - name: "--calc_ci" + type: boolean_true + description: | + Calculate 95% credibility intervals and posterior mean estimates. The credibility level can be + changed by setting '--ci_credibility_level'. + - name: "--quiet" + alternatives: "-q" + type: boolean_true + description: Suppress the output of logging information. + +- name: "Aligner Options" + arguments: + - name: "--seed_length" + type: integer + description: | + Seed length used by the read aligner. 
Providing the correct value is important for RSEM. If RSEM + runs Bowtie, it uses this value for Bowtie's seed length parameter. Any read with its or at least + one of its mates' (for paired-end reads) length less than this value will be ignored. If the + references are not added poly(A) tails, the minimum allowed value is 5, otherwise, the minimum + allowed value is 25. Note that this script will only check if the value >= 5 and give a warning + message if the value < 25 but >= 5. (Default: 25) + example: 25 + - name: "--phred64_quals" + type: boolean_true + description: | + Input quality scores are encoded as Phred+64 (default for GA Pipeline ver. >= 1.3). This option is + used by Bowtie, Bowtie 2 and HISAT2. Otherwise, quality score will be encoded as Phred+33. (Default: false) + - name: "--solexa_quals" + type: boolean_true + description: | + Input quality scores are solexa encoded (from GA Pipeline ver. < 1.3). This option is used by + Bowtie, Bowtie 2 and HISAT2. Otherwise, quality score will be encoded as Phred+33. (Default: false) + - name: "--bowtie_n" + type: integer + description: | + (Bowtie parameter) max # of mismatches in the seed. (Range: 0-3, Default: 2) + choices: [0, 1, 2, 3] + example: 2 + - name: "--bowtie_e" + type: integer + description: | + (Bowtie parameter) max sum of mismatch quality scores across the alignment. (Default: 99999999) + example: 99999999 + - name: "--bowtie_m" + type: integer + description: | + (Bowtie parameter) suppress all alignments for a read if > valid alignments exist. (Default: 200) + example: 200 + - name: "--bowtie_chunkmbs" + type: integer + description: | + (Bowtie parameter) memory allocated for best first alignment calculation (Default: 0 - use Bowtie's default) + example: 0 + - name: "--bowtie2_mismatch_rate" + type: double + description: | + (Bowtie 2 parameter) The maximum mismatch rate allowed. 
(Default: 0.1) + example: 0.1 + - name: "--bowtie2_k" + type: integer + description: | + (Bowtie 2 parameter) Find up to alignments per read. (Default: 200) + example: 200 + - name: "--bowtie2_sensitivity_level" + type: string + description: | + (Bowtie 2 parameter) Set Bowtie 2's preset options in --end-to-end mode. This option controls how + hard Bowtie 2 tries to find alignments. must be one of "very_fast", "fast", "sensitive" + and "very_sensitive". The four candidates correspond to Bowtie 2's "--very-fast", "--fast", + "--sensitive" and "--very-sensitive" options. (Default: "sensitive" - use Bowtie 2's default) + choices: ["very_fast", "fast", "sensitive", "very_sensitive"] + example: sensitive + - name: "--star_gzipped_read_file" + type: boolean_true + description: | + Input read file(s) is compressed by gzip. (Default: false) + - name: "--star_bzipped_read_file" + type: boolean_true + description: | + Input read file(s) is compressed by bzip2. (Default: false) + - name: "--star_output_genome_bam" + type: boolean_true + description: | + Save the BAM file from STAR alignment under genomic coordinate to 'sample_name.STAR.genome.bam'. + This file is NOT sorted by genomic coordinate. In this file, according to STAR's manual, 'paired + ends of an alignment are always adjacent, and multiple alignments of a read are adjacent as well'. + (Default: false) + +- name: "Advanced Options" + arguments: + - name: "--tag" + type: string + description: | + The name of the optional field used in the SAM input for identifying a read with too many valid + alignments. The field should have the format :i:, where a bigger than 0 + indicates a read with too many alignments. (Default: "") + example: "" + - name: "--fragment_length_min" + type: integer + description: | + Minimum read/insert length allowed. This is also the value for the Bowtie/Bowtie2 -I option. 
+ (Default: 1) + example: 1 + - name: "--fragment_length_max" + type: integer + description: | + Maximum read/insert length allowed. This is also the value for the Bowtie/Bowtie 2 -X option. + (Default: 1000) + example: 1000 + - name: "--fragment_length_mean" + type: integer + description: | + (single-end data only) The mean of the fragment length distribution, which is assumed to be a + Gaussian. (Default: -1, which disables use of the fragment length distribution) + example: -1 + - name: "--gragment_length_sd" + type: double + description: | + (single-end data only) The standard deviation of the fragment length distribution, which is + assumed to be a Gaussian. (Default: 0, which assumes that all fragments are of the same length, + given by the rounded value of --fragment_length_mean). + example: 0 + - name: "--estimate_rspd" + type: boolean_true + description: | + Set this option if you want to estimate the read start position distribution (RSPD) from data. + Otherwise, RSEM will use a uniform RSPD. + - name: "--num_rspd_bins" + type: integer + description: | + Number of bins in the RSPD. Only relevant when '--estimate_rspd' is specified. Use of the default + setting is recommended. (Default: 20) + example: 20 + - name: "--gibbs_burnin" + type: integer + description: | + The number of burn-in rounds for RSEM's Gibbs sampler. Each round passes over the entire data set + once. If RSEM can use multiple threads, multiple Gibbs samplers will start at the same time and all + samplers share the same burn-in number. (Default: 200) + example: 200 + - name: "--gibbs_number_of_samples" + type: integer + description: | + The total number of count vectors RSEM will collect from its Gibbs samplers. (Default: 1000) + example: 1000 + - name: "--gibbs_sampling_gap" + type: integer + description: | + The number of rounds between two succinct count vectors RSEM collects. If the count vector after + round N is collected, the count vector after round N + will also be collected. 
(Default: 1) + example: 1 + - name: "--ci_credibility_level" + type: double + description: | + The credibility level for credibility intervals. (Default: 0.95) + example: 0.95 + - name: "--ci_number_of_samples_per_count_vector" + type: integer + description: | + The number of read generating probability vectors sampled per sampled count vector. The credibility + intervals are calculated by first sampling P(C | D) and then sampling P(Theta | C) for each sampled + count vector. This option controls how many Theta vectors are sampled per sampled count vector. + (Default: 50) + example: 50 + - name: "--keep_intermediate_files" + type: boolean_true + description: | + Keep temporary files generated by RSEM. RSEM creates a temporary directory, 'sample_name.temp', + into which it puts all intermediate output files. If this directory already exists, RSEM overwrites + all files generated by previous RSEM runs inside of it. By default, after RSEM finishes, the + temporary directory is deleted. Set this option to prevent the deletion of this directory and the + intermediate files inside of it. + - name: "--temporary_folder" + type: string + description: | + Set where to put the temporary files generated by RSEM. If the folder specified does not exist, + RSEM will try to create it. (Default: sample_name.temp) + example: sample_name.temp + - name: "--time" + type: boolean_true + description: | + Output time consumed by each step of RSEM to 'sample_name.time'. + +- name: "Prior-Enhanced RSEM Options" + arguments: + - name: "--run_pRSEM" + type: boolean_true + description: | + Running prior-enhanced RSEM (pRSEM). Prior parameters, i.e. isoform's initial pseudo-count for + RSEM's Gibbs sampling, will be learned from input RNA-seq data and an external data set. When pRSEM + needs and only needs ChIP-seq peak information to partition isoforms (e.g. 
in pRSEM's default + partition model), either ChIP-seq peak file (with the '--chipseq_peak_file' option) or ChIP-seq + FASTQ files for target and input and the path for Bowtie executables are required (with the + '--chipseq_target_read_files ', '--chipseq_control_read_files ', and '--bowtie_path + options), otherwise, ChIP-seq FASTQ files for target and control and the path to Bowtie + executables are required. + - name: "--chipseq_peak_file" + type: file + must_exist: true + description: | + Full path to a ChIP-seq peak file in ENCODE's narrowPeak, i.e. BED6+4, format. This file is used + when running prior-enhanced RSEM in the default two-partition model. It partitions isoforms by + whether they have ChIP-seq overlapping with their transcription start site region or not. Each + partition will have its own prior parameter learned from a training set. This file can be either + gzipped or ungzipped. + - name: "--chipseq_target_read_files" + type: file + must_exist: true + description: | + Comma-separated full path of FASTQ read file(s) for ChIP-seq target. This option is used when running + prior-enhanced RSEM. It provides information to calculate ChIP-seq peaks and signals. The file(s) + can be either ungzipped or gzipped with a suffix '.gz' or '.gzip'. The options '--bowtie_path ' + and '--chipseq_control_read_files ' must be defined when this option is specified. + - name: "--chipseq_control_read_files" + type: file + must_exist: true + description: | + Comma-separated full path of FASTQ read file(s) for ChIP-seq control. This option is used when running + prior-enhanced RSEM. It provides information to call ChIP-seq peaks. The file(s) can be either + ungzipped or gzipped with a suffix '.gz' or '.gzip'. The options '--bowtie_path ' and + '--chipseq_target_read_files ' must be defined when this option is specified. 
+ - name: "--chipseq_read_files_multi_targets" + type: file + must_exist: true + description: | + Comma-separated full path of FASTQ read files for multiple ChIP-seq targets. This option is used when + running prior-enhanced RSEM, where prior is learned from multiple complementary data sets. It provides + information to calculate ChIP-seq signals. All files can be either ungzipped or gzipped with a suffix + '.gz' or '.gzip'. When this option is specified, the option '--bowtie_path ' must be defined and + the option '--partition_model ' will be set to 'cmb_lgt' automatically. + - name: "--chipseq_bed_files_multi_targets" + type: file + must_exist: true + description: | + Comma-separated full path of BED files for multiple ChIP-seq targets. This option is used when running + prior-enhanced RSEM, where prior is learned from multiple complementary data sets. It provides information + of ChIP-seq signals and must have at least the first six BED columns. All files can be either ungzipped + or gzipped with a suffix '.gz' or '.gzip'. When this option is specified, the option '--partition_model + ' will be set to 'cmb_lgt' automatically. + - name: "--cap_stacked_chipseq_reads" + type: boolean_true + description: | + Keep a maximum number of ChIP-seq reads that aligned to the same genomic interval. This option is used + when running prior-enhanced RSEM, where prior is learned from multiple complementary data sets. This + option is only in use when either '--chipseq_read_files_multi_targets ' or + '--chipseq_bed_files_multi_targets ' is specified. + - name: "--n_max_stacked_chipseq_reads" + type: integer + description: | + The maximum number of stacked ChIP-seq reads to keep. This option is used when running prior-enhanced + RSEM, where prior is learned from multiple complementary data sets. This option is only in use when the + option '--cap_stacked_chipseq_reads' is set. 
+ - name: "--partition_model" + type: string + description: | + A keyword to specify the partition model used by prior-enhanced RSEM. It must be one of the following + keywords: + * pk + * pk_lgtnopk + * lm3, lm4, lm5, or lm6 + * nopk_lm2pk, nopk_lm3pk, nopk_lm4pk, or nopk_lm5pk + * pk_lm2nopk, pk_lm3nopk, pk_lm4nopk, or pk_lm5nopk + * cmb_lgt + Parameters for all the above models are learned from a training set. For detailed explanations, please + see prior-enhanced RSEM's paper. (Default: 'pk') + example: "pk" + + +resources: + - type: bash_script + path: script.sh + +test_resources: + - type: bash_script + path: test.sh + +engines: + - type: docker + image: ubuntu:22.04 + setup: + - type: apt + packages: + - build-essential + - gcc + - g++ + - make + - wget + - zlib1g-dev + - unzip + - type: docker + env: + - STAR_VERSION=2.7.11b + - RSEM_VERSION=1.3.3 + run: | + apt-get update && \ + apt-get clean && \ + wget --no-check-certificate https://github.com/alexdobin/STAR/archive/refs/tags/2.7.11a.zip && \ + unzip 2.7.11a.zip && \ + cp STAR-2.7.11a/bin/Linux_x86_64_static/STAR /usr/local/bin && \ + cd && \ + wget --no-check-certificate https://github.com/deweylab/RSEM/archive/refs/tags/v1.3.3.zip && \ + unzip v1.3.3.zip && \ + cd RSEM-1.3.3 && \ + make && \ + make install + - type: docker + run: | + echo "RSEM: `rsem-calculate-expression --version | sed -e 's/Current version: RSEM v//g'`" > /var/software_versions.txt && \ + echo "STAR: `STAR --version`" >> /var/software_versions.txt && \ + echo "bowtie2: `bowtie2 --version | grep -oP '\d+\.\d+\.\d+'`" >> /var/software_versions.txt && \ + echo "bowtie: `bowtie --version | grep -oP 'bowtie-align-s version \K\d+\.\d+\.\d+'`" >> /var/software_versions.txt && \ + echo "HISAT2: `hisat2 --version | grep -oP 'hisat2-align-s version \K\d+\.\d+\.\d+'`" >> /var/software_versions.txt +runners: + - type: executable + - type: nextflow + + diff --git a/src/rsem/rsem_calculate_expression/help.txt 
b/src/rsem/rsem_calculate_expression/help.txt new file mode 100644 index 00000000..edfa3333 --- /dev/null +++ b/src/rsem/rsem_calculate_expression/help.txt @@ -0,0 +1,1002 @@ +NAME + rsem-calculate-expression - Estimate gene and isoform expression from + RNA-Seq data. + +SYNOPSIS + rsem-calculate-expression [options] upstream_read_file(s) reference_name sample_name + rsem-calculate-expression [options] --paired-end upstream_read_file(s) downstream_read_file(s) reference_name sample_name + rsem-calculate-expression [options] --alignments [--paired-end] input reference_name sample_name + +ARGUMENTS + upstream_read_files(s) + Comma-separated list of files containing single-end reads or + upstream reads for paired-end data. By default, these files are + assumed to be in FASTQ format. If the --no-qualities option is + specified, then FASTA format is expected. + + downstream_read_file(s) + Comma-separated list of files containing downstream reads which are + paired with the upstream reads. By default, these files are assumed + to be in FASTQ format. If the --no-qualities option is specified, + then FASTA format is expected. + + input + SAM/BAM/CRAM formatted input file. If "-" is specified for the + filename, the input is instead assumed to come from standard input. + RSEM requires all alignments of the same read group together. For + paired-end reads, RSEM also requires the two mates of any alignment + be adjacent. In addition, RSEM does not allow the SEQ and QUAL + fields to be empty. See Description section for how to make input + file obey RSEM's requirements. + + reference_name + The name of the reference used. The user must have run + 'rsem-prepare-reference' with this reference_name before running + this program. + + sample_name + The name of the sample analyzed. All output files are prefixed by + this name (e.g., sample_name.genes.results) + +BASIC OPTIONS + --paired-end + Input reads are paired-end reads. 
(Default: off) + + --no-qualities + Input reads do not contain quality scores. (Default: off) + + --strandedness + This option defines the strandedness of the RNA-Seq reads. It + recognizes three values: 'none', 'forward', and 'reverse'. 'none' + refers to non-strand-specific protocols. 'forward' means all + (upstream) reads are derived from the forward strand. 'reverse' + means all (upstream) reads are derived from the reverse strand. If + 'forward'/'reverse' is set, the '--norc'/'--nofw' Bowtie/Bowtie 2 + option will also be enabled to avoid aligning reads to the opposite + strand. For Illumina TruSeq Stranded protocols, please use + 'reverse'. (Default: 'none') + + -p/--num-threads + Number of threads to use. Both Bowtie/Bowtie2, expression estimation + and 'samtools sort' will use this many threads. (Default: 1) + + --alignments + Input file contains alignments in SAM/BAM/CRAM format. The exact + file format will be determined automatically. (Default: off) + + --fai + If the header section of input alignment file does not contain + reference sequence information, this option should be turned on. + is a FAI format file containing each reference sequence's + name and length. Please refer to the SAM official website for the + details of FAI format. (Default: off) + + --bowtie2 + Use Bowtie 2 instead of Bowtie to align reads. Since currently RSEM + does not handle indel, local and discordant alignments, the Bowtie2 + parameters are set in a way to avoid those alignments. In + particular, we use options '--sensitive --dpad 0 --gbar 99999999 + --mp 1,1 --np 1 --score-min L,0,-0.1' by default. The last parameter + of '--score-min', '-0.1', is the negative of maximum mismatch rate. + This rate can be set by option '--bowtie2-mismatch-rate'. If reads + are paired-end, we additionally use options '--no-mixed' and + '--no-discordant'. (Default: off) + + --star + Use STAR to align reads. Alignment parameters are from ENCODE3's + STAR-RSEM pipeline. 
To save computational time and memory resources, + STAR's Output BAM file is unsorted. It is stored in RSEM's temporary + directory with name as 'sample_name.bam'. Each STAR job will have + its own private copy of the genome in memory. (Default: off) + + --hisat2-hca + Use HISAT2 to align reads to the transcriptome according to Human + Cell Atlast SMART-Seq2 pipeline. In particular, we use HISAT + parameters "-k 10 --secondary --rg-id=$sampleToken --rg + SM:$sampleToken --rg LB:$sampleToken --rg PL:ILLUMINA --rg + PU:$sampleToken --new-summary --summary-file $sampleName.log + --met-file $sampleName.hisat2.met.txt --met 5 --mp 1,1 --np 1 + --score-min L,0,-0.1 --rdg 99999999,99999999 --rfg 99999999,99999999 + --no-spliced-alignment --no-softclip --seed 12345". If inputs are + paired-end reads, we additionally use parameters "--no-mixed + --no-discordant". (Default: off) + + --append-names + If gene_name/transcript_name is available, append it to the end of + gene_id/transcript_id (separated by '_') in files + 'sample_name.isoforms.results' and 'sample_name.genes.results'. + (Default: off) + + --seed + Set the seed for the random number generators used in calculating + posterior mean estimates and credibility intervals. The seed must be + a non-negative 32 bit integer. (Default: off) + + --single-cell-prior + By default, RSEM uses Dirichlet(1) as the prior to calculate + posterior mean estimates and credibility intervals. However, much + less genes are expressed in single cell RNA-Seq data. Thus, if you + want to compute posterior mean estimates and/or credibility + intervals and you have single-cell RNA-Seq data, you are recommended + to turn on this option. Then RSEM will use Dirichlet(0.1) as the + prior which encourage the sparsity of the expression levels. + (Default: off) + + --calc-pme + Run RSEM's collapsed Gibbs sampler to calculate posterior mean + estimates. (Default: off) + + --calc-ci + Calculate 95% credibility intervals and posterior mean estimates. 
+ The credibility level can be changed by setting + '--ci-credibility-level'. (Default: off) + + -q/--quiet + Suppress the output of logging information. (Default: off) + + -h/--help + Show help information. + + --version + Show version information. + +OUTPUT OPTIONS + --sort-bam-by-read-name + Sort BAM file aligned under transcript coordidate by read name. + Setting this option on will produce deterministic maximum likelihood + estimations from independent runs. Note that sorting will take long + time and lots of memory. (Default: off) + + --no-bam-output + Do not output any BAM file. (Default: off) + + --sampling-for-bam + When RSEM generates a BAM file, instead of outputting all alignments + a read has with their posterior probabilities, one alignment is + sampled according to the posterior probabilities. The sampling + procedure includes the alignment to the "noise" transcript, which + does not appear in the BAM file. Only the sampled alignment has a + weight of 1. All other alignments have weight 0. If the "noise" + transcript is sampled, all alignments appeared in the BAM file + should have weight 0. (Default: off) + + --output-genome-bam + Generate a BAM file, 'sample_name.genome.bam', with alignments + mapped to genomic coordinates and annotated with their posterior + probabilities. In addition, RSEM will call samtools (included in + RSEM package) to sort and index the bam file. + 'sample_name.genome.sorted.bam' and + 'sample_name.genome.sorted.bam.bai' will be generated. (Default: + off) + + --sort-bam-by-coordinate + Sort RSEM generated transcript and genome BAM files by coordinates + and build associated indices. (Default: off) + + --sort-bam-memory-per-thread + Set the maximum memory per thread that can be used by 'samtools + sort'. represents the memory and accepts suffices 'K/M/G'. + RSEM will pass to the '-m' option of 'samtools sort'. Note + that the default used here is different from the default used by + samtools. 
(Default: 1G) + +ALIGNER OPTIONS + --seed-length + Seed length used by the read aligner. Providing the correct value is + important for RSEM. If RSEM runs Bowtie, it uses this value for + Bowtie's seed length parameter. Any read with its or at least one of + its mates' (for paired-end reads) length less than this value will + be ignored. If the references are not added poly(A) tails, the + minimum allowed value is 5, otherwise, the minimum allowed value is + 25. Note that this script will only check if the value >= 5 and give + a warning message if the value < 25 but >= 5. (Default: 25) + + --phred33-quals + Input quality scores are encoded as Phred+33. This option is used by + Bowtie, Bowtie 2 and HISAT2. (Default: on) + + --phred64-quals + Input quality scores are encoded as Phred+64 (default for GA + Pipeline ver. >= 1.3). This option is used by Bowtie, Bowtie 2 and + HISAT2. (Default: off) + + --solexa-quals + Input quality scores are solexa encoded (from GA Pipeline ver. < + 1.3). This option is used by Bowtie, Bowtie 2 and HISAT2. (Default: + off) + + --bowtie-path + The path to the Bowtie executables. (Default: the path to the Bowtie + executables is assumed to be in the user's PATH environment + variable) + + --bowtie-n + (Bowtie parameter) max # of mismatches in the seed. (Range: 0-3, + Default: 2) + + --bowtie-e + (Bowtie parameter) max sum of mismatch quality scores across the + alignment. (Default: 99999999) + + --bowtie-m + (Bowtie parameter) suppress all alignments for a read if > + valid alignments exist. (Default: 200) + + --bowtie-chunkmbs + (Bowtie parameter) memory allocated for best first alignment + calculation (Default: 0 - use Bowtie's default) + + --bowtie2-path + (Bowtie 2 parameter) The path to the Bowtie 2 executables. (Default: + the path to the Bowtie 2 executables is assumed to be in the user's + PATH environment variable) + + --bowtie2-mismatch-rate + (Bowtie 2 parameter) The maximum mismatch rate allowed. 
(Default: + 0.1) + + --bowtie2-k + (Bowtie 2 parameter) Find up to alignments per read. (Default: + 200) + + --bowtie2-sensitivity-level + (Bowtie 2 parameter) Set Bowtie 2's preset options in --end-to-end + mode. This option controls how hard Bowtie 2 tries to find + alignments. must be one of "very_fast", "fast", "sensitive" + and "very_sensitive". The four candidates correspond to Bowtie 2's + "--very-fast", "--fast", "--sensitive" and "--very-sensitive" + options. (Default: "sensitive" - use Bowtie 2's default) + + --star-path + The path to STAR's executable. (Default: the path to STAR executable + is assumed to be in user's PATH environment variable) + + --star-gzipped-read-file + (STAR parameter) Input read file(s) is compressed by gzip. (Default: + off) + + --star-bzipped-read-file + (STAR parameter) Input read file(s) is compressed by bzip2. + (Default: off) + + --star-output-genome-bam + (STAR parameter) Save the BAM file from STAR alignment under genomic + coordinate to 'sample_name.STAR.genome.bam'. This file is NOT sorted + by genomic coordinate. In this file, according to STAR's manual, + 'paired ends of an alignment are always adjacent, and multiple + alignments of a read are adjacent as well'. (Default: off) + + --hisat2-path + The path to HISAT2's executable. (Default: the path to HISAT2 + executable is assumed to be in user's PATH environment variable) + +ADVANCED OPTIONS + --tag + The name of the optional field used in the SAM input for identifying + a read with too many valid alignments. The field should have the + format :i:, where a bigger than 0 indicates + a read with too many alignments. (Default: "") + + --fragment-length-min + Minimum read/insert length allowed. This is also the value for the + Bowtie/Bowtie2 -I option. (Default: 1) + + --fragment-length-max + Maximum read/insert length allowed. This is also the value for the + Bowtie/Bowtie 2 -X option. 
(Default: 1000) + + --fragment-length-mean + (single-end data only) The mean of the fragment length distribution, + which is assumed to be a Gaussian. (Default: -1, which disables use + of the fragment length distribution) + + --fragment-length-sd + (single-end data only) The standard deviation of the fragment length + distribution, which is assumed to be a Gaussian. (Default: 0, which + assumes that all fragments are of the same length, given by the + rounded value of --fragment-length-mean) + + --estimate-rspd + Set this option if you want to estimate the read start position + distribution (RSPD) from data. Otherwise, RSEM will use a uniform + RSPD. (Default: off) + + --num-rspd-bins + Number of bins in the RSPD. Only relevant when '--estimate-rspd' is + specified. Use of the default setting is recommended. (Default: 20) + + --gibbs-burnin + The number of burn-in rounds for RSEM's Gibbs sampler. Each round + passes over the entire data set once. If RSEM can use multiple + threads, multiple Gibbs samplers will start at the same time and all + samplers share the same burn-in number. (Default: 200) + + --gibbs-number-of-samples + The total number of count vectors RSEM will collect from its Gibbs + samplers. (Default: 1000) + + --gibbs-sampling-gap + The number of rounds between two succinct count vectors RSEM + collects. If the count vector after round N is collected, the count + vector after round N + will also be collected. (Default: 1) + + --ci-credibility-level + The credibility level for credibility intervals. (Default: 0.95) + + --ci-memory + Maximum size (in memory, MB) of the auxiliary buffer used for + computing credibility intervals (CI). (Default: 1024) + + --ci-number-of-samples-per-count-vector + The number of read generating probability vectors sampled per + sampled count vector. The crebility intervals are calculated by + first sampling P(C | D) and then sampling P(Theta | C) for each + sampled count vector. 
This option controls how many Theta vectors + are sampled per sampled count vector. (Default: 50) + + --keep-intermediate-files + Keep temporary files generated by RSEM. RSEM creates a temporary + directory, 'sample_name.temp', into which it puts all intermediate + output files. If this directory already exists, RSEM overwrites all + files generated by previous RSEM runs inside of it. By default, + after RSEM finishes, the temporary directory is deleted. Set this + option to prevent the deletion of this directory and the + intermediate files inside of it. (Default: off) + + --temporary-folder + Set where to put the temporary files generated by RSEM. If the + folder specified does not exist, RSEM will try to create it. + (Default: sample_name.temp) + + --time + Output time consumed by each step of RSEM to 'sample_name.time'. + (Default: off) + +PRIOR-ENHANCED RSEM OPTIONS + --run-pRSEM + Running prior-enhanced RSEM (pRSEM). Prior parameters, i.e. + isoform's initial pseudo-count for RSEM's Gibbs sampling, will be + learned from input RNA-seq data and an external data set. When pRSEM + needs and only needs ChIP-seq peak information to partition isoforms + (e.g. in pRSEM's default partition model), either ChIP-seq peak file + (with the '--chipseq-peak-file' option) or ChIP-seq FASTQ files for + target and input and the path for Bowtie executables are required + (with the '--chipseq-target-read-files ', + '--chipseq-control-read-files ', and '--bowtie-path + options), otherwise, ChIP-seq FASTQ files for target and control and + the path to Bowtie executables are required. (Default: off) + + --chipseq-peak-file + Full path to a ChIP-seq peak file in ENCODE's narrowPeak, i.e. + BED6+4, format. This file is used when running prior-enhanced RSEM + in the default two-partition model. It partitions isoforms by + whether they have ChIP-seq overlapping with their transcription + start site region or not. 
Each partition will have its own prior + parameter learned from a training set. This file can be either + gzipped or ungzipped. (Default: "") + + --chipseq-target-read-files + Comma-separated full path of FASTQ read file(s) for ChIP-seq target. + This option is used when running prior-enhanced RSEM. It provides + information to calculate ChIP-seq peaks and signals. The file(s) can + be either ungzipped or gzipped with a suffix '.gz' or '.gzip'. The + options '--bowtie-path ' and '--chipseq-control-read-files + ' must be defined when this option is specified. (Default: + "") + + --chipseq-control-read-files + Comma-separated full path of FASTQ read file(s) for ChIP-seq conrol. + This option is used when running prior-enhanced RSEM. It provides + information to call ChIP-seq peaks. The file(s) can be either + ungzipped or gzipped with a suffix '.gz' or '.gzip'. The options + '--bowtie-path ' and '--chipseq-target-read-files ' + must be defined when this option is specified. (Default: "") + + --chipseq-read-files-multi-targets + Comma-separated full path of FASTQ read files for multiple ChIP-seq + targets. This option is used when running prior-enhanced RSEM, where + prior is learned from multiple complementary data sets. It provides + information to calculate ChIP-seq signals. All files can be either + ungzipped or gzipped with a suffix '.gz' or '.gzip'. When this + option is specified, the option '--bowtie-path ' must be + defined and the option '--partition-model ' will be set to + 'cmb_lgt' automatically. (Default: "") + + --chipseq-bed-files-multi-targets + Comma-separated full path of BED files for multiple ChIP-seq + targets. This option is used when running prior-enhanced RSEM, where + prior is learned from multiple complementary data sets. It provides + information of ChIP-seq signals and must have at least the first six + BED columns. All files can be either ungzipped or gzipped with a + suffix '.gz' or '.gzip'. 
When this option is specified, the option + '--partition-model ' will be set to 'cmb_lgt' automatically. + (Default: "") + + --cap-stacked-chipseq-reads + Keep a maximum number of ChIP-seq reads that aligned to the same + genomic interval. This option is used when running prior-enhanced + RSEM, where prior is learned from multiple complementary data sets. + This option is only in use when either + '--chipseq-read-files-multi-targets ' or + '--chipseq-bed-files-multi-targets ' is specified. (Default: + off) + + --n-max-stacked-chipseq-reads + The maximum number of stacked ChIP-seq reads to keep. This option is + used when running prior-enhanced RSEM, where prior is learned from + multiple complementary data sets. This option is only in use when + the option '--cap-stacked-chipseq-reads' is set. (Default: 5) + + --partition-model + A keyword to specify the partition model used by prior-enhanced + RSEM. It must be one of the following keywords: + + - pk + Partitioned by whether an isoform has a ChIP-seq peak overlapping + with its transcription start site (TSS) region. The TSS region is + defined as [TSS-500bp, TSS+500bp]. For simplicity, we refer this + type of peak as 'TSS peak' when explaining other keywords. + + - pk_lgtnopk + First partitioned by TSS peak. Then, for isoforms in the 'no TSS + peak' set, a logistic model is employed to further classify them + into two partitions. + + - lm3, lm4, lm5, or lm6 + Based on their ChIP-seq signals, isoforms are classified into 3, + 4, 5, or 6 partitions by a linear regression model. + + - nopk_lm2pk, nopk_lm3pk, nopk_lm4pk, or nopk_lm5pk + First partitioned by TSS peak. Then, for isoforms in the 'with TSS + peak' set, a linear regression model is employed to further + classify them into 2, 3, 4, or 5 partitions. + + - pk_lm2nopk, pk_lm3nopk, pk_lm4nopk, or pk_lm5nopk + First partitioned by TSS peak. 
Then, for isoforms in the 'no TSS + peak' set, a linear regression model is employed to further + classify them into 2, 3, 4, or 5 partitions. + + - cmb_lgt + Using a logistic regression to combine TSS signals from multiple + complementary data sets and partition training set isoform into + 'expressed' and 'not expressed'. This partition model is only in + use when either '--chipseq-read-files-multi-targets ' or + '--chipseq-bed-files-multi-targets is specified. + + Parameters for all the above models are learned from a training set. + For detailed explanations, please see prior-enhanced RSEM's paper. + (Default: 'pk') + +DEPRECATED OPTIONS + The options in this section are deprecated. They are here only for + compatibility reasons and may be removed in future releases. + + --sam + Inputs are alignments in SAM format. (Default: off) + + --bam + Inputs are alignments in BAM format. (Default: off) + + --strand-specific + Equivalent to '--strandedness forward'. (Default: off) + + --forward-prob + Probability of generating a read from the forward strand of a + transcript. Set to 1 for a strand-specific protocol where all + (upstream) reads are derived from the forward strand, 0 for a + strand-specific protocol where all (upstream) read are derived from + the reverse strand, or 0.5 for a non-strand-specific protocol. + (Default: off) + +DESCRIPTION + In its default mode, this program aligns input reads against a reference + transcriptome with Bowtie and calculates expression values using the + alignments. RSEM assumes the data are single-end reads with quality + scores, unless the '--paired-end' or '--no-qualities' options are + specified. Alternatively, users can use STAR to align reads using the + '--star' option. RSEM has provided options in 'rsem-prepare-reference' + to prepare STAR's genome indices. Users may use an alternative aligner + by specifying '--alignments', and providing an alignment file in + SAM/BAM/CRAM format. 
However, users should make sure that they align + against the indices generated by 'rsem-prepare-reference' and the + alignment file satisfies the requirements mentioned in ARGUMENTS + section. + + One simple way to make the alignment file satisfying RSEM's requirements + is to use the 'convert-sam-for-rsem' script. This script accepts + SAM/BAM/CRAM files as input and outputs a BAM file. For example, type + the following command to convert a SAM file, 'input.sam', to a + ready-for-use BAM file, 'input_for_rsem.bam': + + convert-sam-for-rsem input.sam input_for_rsem + + For details, please refer to 'convert-sam-for-rsem's documentation page. + +NOTES + 1. Users must run 'rsem-prepare-reference' with the appropriate + reference before using this program. + + 2. For single-end data, it is strongly recommended that the user provide + the fragment length distribution parameters (--fragment-length-mean and + --fragment-length-sd). For paired-end data, RSEM will automatically + learn a fragment length distribution from the data. + + 3. Some aligner parameters have default values different from their + original settings. + + 4. With the '--calc-pme' option, posterior mean estimates will be + calculated in addition to maximum likelihood estimates. + + 5. With the '--calc-ci' option, 95% credibility intervals and posterior + mean estimates will be calculated in addition to maximum likelihood + estimates. + + 6. The temporary directory and all intermediate files will be removed + when RSEM finishes unless '--keep-intermediate-files' is specified. + + With the '--run-pRSEM' option and associated options (see section + 'PRIOR-ENHANCED RSEM OPTIONS' above for details), prior-enhanced RSEM + will be running. Prior parameters will be learned from supplied external + data set(s) and assigned as initial pseudo-counts for isoforms in the + corresponding partition for Gibbs sampling. + +OUTPUT + sample_name.isoforms.results + File containing isoform level expression estimates. 
The first line + contains column names separated by the tab character. The format of + each line in the rest of this file is: + + transcript_id gene_id length effective_length expected_count TPM + FPKM IsoPct [posterior_mean_count + posterior_standard_deviation_of_count pme_TPM pme_FPKM + IsoPct_from_pme_TPM TPM_ci_lower_bound TPM_ci_upper_bound + TPM_coefficient_of_quartile_variation FPKM_ci_lower_bound + FPKM_ci_upper_bound FPKM_coefficient_of_quartile_variation] + + Fields are separated by the tab character. Fields within "[]" are + optional. They will not be presented if neither '--calc-pme' nor + '--calc-ci' is set. + + 'transcript_id' is the transcript name of this transcript. 'gene_id' + is the gene name of the gene which this transcript belongs to + (denote this gene as its parent gene). If no gene information is + provided, 'gene_id' and 'transcript_id' are the same. + + 'length' is this transcript's sequence length (poly(A) tail is not + counted). 'effective_length' counts only the positions that can + generate a valid fragment. If no poly(A) tail is added, + 'effective_length' is equal to transcript length - mean fragment + length + 1. If one transcript's effective length is less than 1, + this transcript's both effective length and abundance estimates are + set to 0. + + 'expected_count' is the sum of the posterior probability of each + read comes from this transcript over all reads. Because 1) each read + aligning to this transcript has a probability of being generated + from background noise; 2) RSEM may filter some alignable low quality + reads, the sum of expected counts for all transcript are generally + less than the total number of reads aligned. + + 'TPM' stands for Transcripts Per Million. It is a relative measure + of transcript abundance. The sum of all transcripts' TPM is 1 + million. 'FPKM' stands for Fragments Per Kilobase of transcript per + Million mapped reads. It is another relative measure of transcript + abundance. 
If we define l_bar be the mean transcript length in a + sample, which can be calculated as + + l_bar = \sum_i TPM_i / 10^6 * effective_length_i (i goes through + every transcript), + + the following equation is hold: + + FPKM_i = 10^3 / l_bar * TPM_i. + + We can see that the sum of FPKM is not a constant across samples. + + 'IsoPct' stands for isoform percentage. It is the percentage of this + transcript's abandunce over its parent gene's abandunce. If its + parent gene has only one isoform or the gene information is not + provided, this field will be set to 100. + + 'posterior_mean_count', 'pme_TPM', 'pme_FPKM' are posterior mean + estimates calculated by RSEM's Gibbs sampler. + 'posterior_standard_deviation_of_count' is the posterior standard + deviation of counts. 'IsoPct_from_pme_TPM' is the isoform percentage + calculated from 'pme_TPM' values. + + 'TPM_ci_lower_bound', 'TPM_ci_upper_bound', 'FPKM_ci_lower_bound' + and 'FPKM_ci_upper_bound' are lower(l) and upper(u) bounds of 95% + credibility intervals for TPM and FPKM values. The bounds are + inclusive (i.e. [l, u]). + + 'TPM_coefficient_of_quartile_variation' and + 'FPKM_coefficient_of_quartile_variation' are coefficients of + quartile variation (CQV) for TPM and FPKM values. CQV is a robust + way of measuring the ratio between the standard deviation and the + mean. It is defined as + + CQV := (Q3 - Q1) / (Q3 + Q1), + + where Q1 and Q3 are the first and third quartiles. + + sample_name.genes.results + File containing gene level expression estimates. The first line + contains column names separated by the tab character. 
The format of + each line in the rest of this file is: + + gene_id transcript_id(s) length effective_length expected_count TPM + FPKM [posterior_mean_count posterior_standard_deviation_of_count + pme_TPM pme_FPKM TPM_ci_lower_bound TPM_ci_upper_bound + TPM_coefficient_of_quartile_variation FPKM_ci_lower_bound + FPKM_ci_upper_bound FPKM_coefficient_of_quartile_variation] + + Fields are separated by the tab character. Fields within "[]" are + optional. They will not be presented if neither '--calc-pme' nor + '--calc-ci' is set. + + 'transcript_id(s)' is a comma-separated list of transcript_ids + belonging to this gene. If no gene information is provided, + 'gene_id' and 'transcript_id(s)' are identical (the + 'transcript_id'). + + A gene's 'length' and 'effective_length' are defined as the weighted + average of its transcripts' lengths and effective lengths (weighted + by 'IsoPct'). A gene's abundance estimates are just the sum of its + transcripts' abundance estimates. + + sample_name.alleles.results + Only generated when the RSEM references are built with + allele-specific transcripts. + + This file contains allele level expression estimates for + allele-specific expression calculation. The first line contains + column names separated by the tab character. The format of each line + in the rest of this file is: + + allele_id transcript_id gene_id length effective_length + expected_count TPM FPKM AlleleIsoPct AlleleGenePct + [posterior_mean_count posterior_standard_deviation_of_count pme_TPM + pme_FPKM AlleleIsoPct_from_pme_TPM AlleleGenePct_from_pme_TPM + TPM_ci_lower_bound TPM_ci_upper_bound + TPM_coefficient_of_quartile_variation FPKM_ci_lower_bound + FPKM_ci_upper_bound FPKM_coefficient_of_quartile_variation] + + Fields are separated by the tab character. Fields within "[]" are + optional. They will not be presented if neither '--calc-pme' nor + '--calc-ci' is set. + + 'allele_id' is the allele-specific name of this allele-specific + transcript. 
+ + 'AlleleIsoPct' stands for allele-specific percentage on isoform + level. It is the percentage of this allele-specific transcript's + abundance over its parent transcript's abundance. If its parent + transcript has only one allele variant form, this field will be set + to 100. + + 'AlleleGenePct' stands for allele-specific percentage on gene level. + It is the percentage of this allele-specific transcript's abundance + over its parent gene's abundance. + + 'AlleleIsoPct_from_pme_TPM' and 'AlleleGenePct_from_pme_TPM' have + similar meanings. They are calculated based on posterior mean + estimates. + + Please note that if this file is present, the fields 'length' and + 'effective_length' in 'sample_name.isoforms.results' should be + interpreted similarly as the corresponding definitions in + 'sample_name.genes.results'. + + sample_name.transcript.bam + Only generated when --no-bam-output is not specified. + + 'sample_name.transcript.bam' is a BAM-formatted file of read + alignments in transcript coordinates. The MAPQ field of each + alignment is set to min(100, floor(-10 * log10(1.0 - w) + 0.5)), + where w is the posterior probability of that alignment being the + true mapping of a read. In addition, RSEM pads a new tag ZW:f:value, + where value is a single precision floating number representing the + posterior probability. Because this file contains all alignment + lines produced by bowtie or user-specified aligners, it can also be + used as a replacement of the aligner generated BAM/SAM file. + + sample_name.transcript.sorted.bam and + sample_name.transcript.sorted.bam.bai + Only generated when --no-bam-output is not specified and + --sort-bam-by-coordinate is specified. + + 'sample_name.transcript.sorted.bam' and + 'sample_name.transcript.sorted.bam.bai' are the sorted BAM file and + indices generated by samtools (included in RSEM package). + + sample_name.genome.bam + Only generated when --no-bam-output is not specified and + --output-genome-bam is specified. 
+ + 'sample_name.genome.bam' is a BAM-formatted file of read alignments + in genomic coordinates. Alignments of reads that have identical + genomic coordinates (i.e., alignments to different isoforms that + share the same genomic region) are collapsed into one alignment. The + MAPQ field of each alignment is set to min(100, floor(-10 * + log10(1.0 - w) + 0.5)), where w is the posterior probability of that + alignment being the true mapping of a read. In addition, RSEM pads a + new tag ZW:f:value, where value is a single precision floating + number representing the posterior probability. If an alignment is + spliced, a XS:A:value tag is also added, where value is either '+' + or '-' indicating the strand of the transcript it aligns to. + + sample_name.genome.sorted.bam and sample_name.genome.sorted.bam.bai + Only generated when --no-bam-output is not specified, and + --sort-bam-by-coordinate and --output-genome-bam are specified. + + 'sample_name.genome.sorted.bam' and + 'sample_name.genome.sorted.bam.bai' are the sorted BAM file and + indices generated by samtools (included in RSEM package). + + sample_name.time + Only generated when --time is specified. + + It contains time (in seconds) consumed by aligning reads, estimating + expression levels and calculating credibility intervals. + + sample_name.log + Only generated when --alignments is not specified. + + It captures alignment statistics outputted from the user-specified + aligner. + + sample_name.stat + This is a folder instead of a file. All model related statistics are + stored in this folder. Use 'rsem-plot-model' can generate plots + using this folder. + + 'sample_name.stat/sample_name.cnt' contains alignment statistics. + The format and meanings of each field are described in + 'cnt_file_description.txt' under RSEM directory. + + 'sample_name.stat/sample_name.model' stores RNA-Seq model parameters + learned from the data. 
The format and meanings of each filed of this + file are described in 'model_file_description.txt' under RSEM + directory. + + The following four output files will be generated only by + prior-enhanced RSEM + + - 'sample_name.stat/sample_name_prsem.all_tr_features' + It stores isofrom features for deriving and assigning pRSEM prior. + The first line is a header and the rest is one isoform per line. + The description for each column is: + + * trid: transcript ID from input annotation + + * geneid: gene ID from input anntation + + * chrom: isoform's chromosome name + + * strand: isoform's strand name + + * start: isoform's end with the lowest genomic loci + + * end: isoform's end with the highest genomic loci + + * tss_mpp: average mappability of [TSS-500bp, TSS+500bp], where + TSS is isoform's transcription start site, i.e. 5'-end + + * body_mpp: average mappability of (TSS+500bp, TES-500bp), where + TES is isoform's transcription end site, i.e. 3'-end + + * tes_mpp: average mappability of [TES-500bp, TES+500bp] + + * pme_count: isoform's fragment or read count from RSEM's + posterior mean estimates + + * tss: isoform's TSS loci + + * tss_pk: equal to 1 if isoform's [TSS-500bp, TSS+500bp] region + overlaps with a RNA Pol II peak; 0 otherwise + + * is_training: equal to 1 if isoform is in the training set where + Pol II prior is learned; 0 otherwise + + - 'sample_name.stat/sample_name_prsem.all_tr_prior' + It stores prior parameters for every isoform. This file does not + have a header. Each line contains a prior parameter and an + isoform's transcript ID delimited by ` # `. + + - 'sample_name.stat/sample_name_uniform_prior_1.isoforms.results' + RSEM's posterior mean estimates on the isoform level with an + initial pseudo-count of one for every isoform. It is in the same + format as the 'sample_name.isoforms.results'. 
+ + - 'sample_name.stat/sample_name_uniform_prior_1.genes.results' + RSEM's posterior mean estimates on the gene level with an initial + pseudo-count of one for every isoform. It is in the same format as + the 'sample_name.genes.results'. + + When learning prior from multiple external data sets in + prior-enhanced RSEM, two additional output files will be generated. + + - 'sample_name.stat/sample_name.pval_LL' + It stores a p-value and a log-likelihood. The p-value indicates + whether the combination of multiple complementary data sets is + informative for RNA-seq quantification. The log-likelihood shows + how well pRSEM's Dirichlet-multinomial model fits the read counts + of partitioned training set isoforms. + + - 'sample_name.stat/sample_name.lgt_mdl.RData' + It stores an R object named 'glmmdl', which is a logistic + regression model on the training set isoforms and multiple + external data sets. + + In addition, extra columns will be added to + 'sample_name.stat/all_tr_features' + + * is_expr: equal to 1 if isoform has an abundance >= 1 TPM and a + non-zero read count from RSEM's posterior mean estimates; 0 + otherwise + + * "$external_data_set_basename": log10 of external data's signal at + [TSS-500, TSS+500]. Signal is the number of reads aligned within + that interval and normalized to RPKM by read depth and interval + length. It will be set to -4 if no read aligned to that interval. + + There are multiple columns like this one, where each represents an + external data set. + + * prd_expr_prob: predicted probability from logistic regression + model on whether this isoform is expressed or not. A probability + higher than 0.5 is considered as expressed + + * partition: group index, to which this isoforms is partitioned + + * prior: prior parameter for this isoform + +EXAMPLES + Assume the path to the bowtie executables is in the user's PATH + environment variable. Reference files are under '/ref' with name + 'mouse_125'. 
+ + 1) '/data/mmliver.fq', single-end reads with quality scores. Quality + scores are encoded as for 'GA pipeline version >= 1.3'. We want to use 8 + threads and generate a genome BAM file. In addition, we want to append + gene/transcript names to the result files: + + rsem-calculate-expression --phred64-quals \ + -p 8 \ + --append-names \ + --output-genome-bam \ + /data/mmliver.fq \ + /ref/mouse_125 \ + mmliver_single_quals + + 2) '/data/mmliver_1.fq' and '/data/mmliver_2.fq', stranded paired-end + reads with quality scores. Suppose the library is prepared using TruSeq + Stranded Kit, which means the first mate should map to the reverse + strand. Quality scores are in SANGER format. We want to use 8 threads + and do not generate a genome BAM file: + + rsem-calculate-expression -p 8 \ + --paired-end \ + --strandedness reverse \ + /data/mmliver_1.fq \ + /data/mmliver_2.fq \ + /ref/mouse_125 \ + mmliver_paired_end_quals + + 3) '/data/mmliver.fa', single-end reads without quality scores. We want + to use 8 threads: + + rsem-calculate-expression -p 8 \ + --no-qualities \ + /data/mmliver.fa \ + /ref/mouse_125 \ + mmliver_single_without_quals + + 4) Data are the same as 1). This time we assume the bowtie executables + are under '/sw/bowtie'. We want to take a fragment length distribution + into consideration. We set the fragment length mean to 150 and the + standard deviation to 35. In addition to a BAM file, we also want to + generate credibility intervals. We allow RSEM to use 1GB of memory for + CI calculation: + + rsem-calculate-expression --bowtie-path /sw/bowtie \ + --phred64-quals \ + --fragment-length-mean 150.0 \ + --fragment-length-sd 35.0 \ + -p 8 \ + --output-genome-bam \ + --calc-ci \ + --ci-memory 1024 \ + /data/mmliver.fq \ + /ref/mouse_125 \ + mmliver_single_quals + + 5) '/data/mmliver_paired_end_quals.bam', BAM-formatted alignments for + paired-end reads with quality scores. 
We want to use 8 threads: + + rsem-calculate-expression --paired-end \ + --alignments \ + -p 8 \ + /data/mmliver_paired_end_quals.bam \ + /ref/mouse_125 \ + mmliver_paired_end_quals + + 6) '/data/mmliver_1.fq.gz' and '/data/mmliver_2.fq.gz', paired-end reads + with quality scores and read files are compressed by gzip. We want to + use STAR to aligned reads and assume STAR executable is '/sw/STAR'. + Suppose we want to use 8 threads and do not generate a genome BAM file: + + rsem-calculate-expression --paired-end \ + --star \ + --star-path /sw/STAR \ + --gzipped-read-file \ + --paired-end \ + -p 8 \ + /data/mmliver_1.fq.gz \ + /data/mmliver_2.fq.gz \ + /ref/mouse_125 \ + mmliver_paired_end_quals + + 7) In the above example, suppose we want to run prior-enhanced RSEM + instead. Assuming we want to learn priors from a ChIP-seq peak file + '/data/mmlive.narrowPeak.gz': + + rsem-calculate-expression --star \ + --star-path /sw/STAR \ + --gzipped-read-file \ + --paired-end \ + --calc-pme \ + --run-pRSEM \ + --chipseq-peak-file /data/mmliver.narrowPeak.gz \ + -p 8 \ + /data/mmliver_1.fq.gz \ + /data/mmliver_2.fq.gz \ + /ref/mouse_125 \ + mmliver_paired_end_quals + + 8) Similar to the example in 7), suppose we want to use the partition + model 'pk_lm2nopk' (partitioning isoforms by Pol II TSS peak first and + then partitioning 'no TSS peak' isoforms into two bins by a linear + regression model), and we want to partition isoforms by RNA Pol II's + ChIP-seq read files '/data/mmliver_PolIIRep1.fq.gz' and + '/data/mmliver_PolIIRep2.fq.gz', and the control ChIP-seq read files + '/data/mmliver_ChIPseqCtrl.fq.gz'. 
Also, assuming Bowtie's executables + are under '/sw/bowtie/': + + rsem-calculate-expression --star \ + --star-path /sw/STAR \ + --gzipped-read-file \ + --paired-end \ + --calc-pme \ + --run-pRSEM \ + --chipseq-target-read-files /data/mmliver_PolIIRep1.fq.gz,/data/mmliver_PolIIRep2.fq.gz \ + --chipseq-control-read-files /data/mmliver_ChIPseqCtrl.fq.gz \ + --partition-model pk_lm2nopk \ + --bowtie-path /sw/bowtie \ + -p 8 \ + /data/mmliver_1.fq.gz \ + /data/mmliver_2.fq.gz \ + /ref/mouse_125 \ + mmliver_paired_end_quals + + 9) Similar to the example in 8), suppose we want to derive prior from + four histone modification ChIP-seq read data sets: + '/data/H3K27Ac.fastq.gz', '/data/H3K4me1.fastq.gz', + '/data/H3K4me2.fastq.gz', and '/data/H3K4me3.fastq.gz'. Also, assuming + Bowtie's executables are under '/sw/bowtie/': + + rsem-calculate-expression --star \ + --star-path /sw/STAR \ + --gzipped-read-file \ + --paired-end \ + --calc-pme \ + --run-pRSEM \ + --partition-model cmb_lgt \ + --chipseq-read-files-multi-targets /data/H3K27Ac.fastq.gz,/data/H3K4me1.fastq.gz,/data/H3K4me2.fastq.gz,/data/H3K4me3.fastq.gz \ + --bowtie-path /sw/bowtie \ + -p 8 \ + /data/mmliver_1.fq.gz \ + /data/mmliver_2.fq.gz \ + /ref/mouse_125 \ + mmliver_paired_end_quals + diff --git a/src/rsem/rsem_calculate_expression/script.sh b/src/rsem/rsem_calculate_expression/script.sh new file mode 100644 index 00000000..e8c6ce5d --- /dev/null +++ b/src/rsem/rsem_calculate_expression/script.sh @@ -0,0 +1,103 @@ +#!/bin/bash + +## VIASH START +## VIASH END + +set -eo pipefail + +function clean_up { + rm -rf "$tmpdir" +} +trap clean_up EXIT + +tmpdir=$(mktemp -d "$meta_temp_dir/$meta_functionality_name-XXXXXXXX") + +if [ "$par_strandedness" == 'forward' ]; then + strandedness='--strandedness forward' +elif [ "$par_strandedness" == 'reverse' ]; then + strandedness="--strandedness reverse" +else + strandedness='' +fi + +IFS=";" read -ra input <<< $par_input + +INDEX=$(find -L $meta_resources_dir/$par_index 
-name "*.grp" | sed 's/\.grp$//') + +unset_if_false=( par_paired par_quiet par_no_bam_output par_sampling_for_bam par_no_qualities + par_alignments par_bowtie2 par_star par_hisat2_hca par_append_names + par_single_cell_prior par_calc_pme par_calc_ci par_phred64_quals + par_solexa_quals par_star_gzipped_read_file par_star_bzipped_read_file + par_star_output_genome_bam par_estimate_rspd par_keep_intermediate_files + par_time par_run_pRSEM par_cap_stacked_chipseq_reads par_sort_bam_by_read_name ) + +for par in ${unset_if_false[@]}; do + test_val="${!par}" + [[ "$test_val" == "false" ]] && unset $par +done + +rsem-calculate-expression \ + ${par_quiet:+-q} \ + ${par_no_bam_output:+--no-bam-output} \ + ${par_sampling_for_bam:+--sampling-for-bam} \ + ${par_no_qualities:+--no-qualities} \ + ${par_alignments:+--alignments} \ + ${par_bowtie2:+--bowtie2} \ + ${par_star:+--star} \ + ${par_hisat2_hca:+--hisat2-hca} \ + ${par_append_names:+--append-names} \ + ${par_single_cell_prior:+--single-cell-prior} \ + ${par_calc_pme:+--calc-pme} \ + ${par_calc_ci:+--calc-ci} \ + ${par_phred64_quals:+--phred64-quals} \ + ${par_solexa_quals:+--solexa-quals} \ + ${par_star_gzipped_read_file:+--star-gzipped-read-file} \ + ${par_star_bzipped_read_file:+--star-bzipped-read-file} \ + ${par_star_output_genome_bam:+--star-output-genome-bam} \ + ${par_estimate_rspd:+--estimate-rspd} \ + ${par_keep_intermediate_files:+--keep-intermediate-files} \ + ${par_time:+--time} \ + ${par_run_pRSEM:+--run-pRSEM} \ + ${par_cap_stacked_chipseq_reads:+--cap-stacked-chipseq-reads} \ + ${par_sort_bam_by_read_name:+--sort-bam-by-read-name} \ + ${par_counts_gene:+--counts-gene "$par_counts_gene"} \ + ${par_counts_transcripts:+--counts-transcripts "$par_counts_transcripts"} \ + ${par_stat:+--stat "$par_stat"} \ + ${par_bam_star:+--bam-star "$par_bam_star"} \ + ${par_bam_genome:+--bam-genome "$par_bam_genome"} \ + ${par_bam_transcript:+--bam-transcript "$par_bam_transcript"} \ + ${par_fai:+--fai "$par_fai"} \ + 
${par_seed:+--seed "$par_seed"} \ + ${par_seed_length:+--seed-length "$par_seed_length"} \ + ${par_bowtie_n:+--bowtie-n "$par_bowtie_n"} \ + ${par_bowtie_e:+--bowtie-e "$par_bowtie_e"} \ + ${par_bowtie_m:+--bowtie-m "$par_bowtie_m"} \ + ${par_bowtie_chunkmbs:+--bowtie-chunkmbs "$par_bowtie_chunkmbs"} \ + ${par_bowtie2_mismatch_rate:+--bowtie2-mismatch-rate "$par_bowtie2_mismatch_rate"} \ + ${par_bowtie2_k:+--bowtie2-k "$par_bowtie2_k"} \ + ${par_bowtie2_sensitivity_level:+--bowtie2-sensitivity-level "$par_bowtie2_sensitivity_level"} \ + ${par_tag:+--tag "$par_tag"} \ + ${par_fragment_length_min:+--fragment-length-min "$par_fragment_length_min"} \ + ${par_fragment_length_max:+--fragment-length-max "$par_fragment_length_max"} \ + ${par_fragment_length_mean:+--fragment-length-mean "$par_fragment_length_mean"} \ + ${par_fragment_length_sd:+--fragment-length-sd "$par_fragment_length_sd"} \ + ${par_num_rspd_bins:+--num-rspd-bins "$par_num_rspd_bins"} \ + ${par_gibbs_burnin:+--gibbs-burnin "$par_gibbs_burnin"} \ + ${par_gibbs_number_of_samples:+--gibbs-number-of-samples "$par_gibbs_number_of_samples"} \ + ${par_gibbs_sampling_gap:+--gibbs-sampling-gap "$par_gibbs_sampling_gap"} \ + ${par_ci_credibility_level:+--ci-credibility-level "$par_ci_credibility_level"} \ + ${par_ci_number_of_samples_per_count_vector:+--ci-number-of-samples-per-count-vector "$par_ci_number_of_samples_per_count_vector"} \ + ${par_temporary_folder:+--temporary-folder "$par_temporary_folder"} \ + ${par_chipseq_peak_file:+--chipseq-peak-file "$par_chipseq_peak_file"} \ + ${par_chipseq_target_read_files:+--chipseq-target-read-files "$par_chipseq_target_read_files"} \ + ${par_chipseq_control_read_files:+--chipseq-control-read-files "$par_chipseq_control_read_files"} \ + ${par_chipseq_read_files_multi_targets:+--chipseq-read-files-multi-targets "$par_chipseq_read_files_multi_targets"} \ + ${par_chipseq_bed_files_multi_targets:+--chipseq-bed-files-multi-targets "$par_chipseq_bed_files_multi_targets"} \ + 
${par_n_max_stacked_chipseq_reads:+--n-max-stacked-chipseq-reads "$par_n_max_stacked_chipseq_reads"} \ + ${par_partition_model:+--partition-model "$par_partition_model"} \ + $strandedness \ + ${par_paired:+--paired-end} \ + ${input[*]} \ + $INDEX \ + $par_id + diff --git a/src/rsem/rsem_calculate_expression/test.sh b/src/rsem/rsem_calculate_expression/test.sh new file mode 100644 index 00000000..c9ede884 --- /dev/null +++ b/src/rsem/rsem_calculate_expression/test.sh @@ -0,0 +1,116 @@ +#!/bin/bash + +echo ">>> Testing $meta_executable" + +test_dir="${meta_resources_dir}/test_data" + +# wget https://raw.githubusercontent.com/nf-core/test-datasets/rnaseq3/reference/rsem.tar.gz +# gunzip -k rsem.tar.gz +# tar -xf rsem.tar +# mv $test_dir/rsem $meta_resources_dir + +echo "> Prepare test data" + +cat > reads_R1.fastq <<'EOF' +@SEQ_ID1 +ACGCTGCCTCATAAGCCTCACACAT ++ +IIIIIIIIIIIIIIIIIIIIIIIII +@SEQ_ID2 +ACCCGCAAGATTAGGCTCCGTACAC ++ +!!!!!!!!!!!!!!!!!!!!!!!!! +EOF + +cat > reads_R2.fastq <<'EOF' +@SEQ_ID1 +ATGTGTGAGGCTTATGAGGCAGCGT ++ +IIIIIIIIIIIIIIIIIIIIIIIII +@SEQ_ID2 +GTGTACGGAGCCTAATCTTGCAGGG ++ +!!!!!!!!!!!!!!!!!!!!!!!!! +EOF + +cat > genome.fasta <<'EOF' +>chr1 +TGGCATGAGCCAACGAACGCTGCCTCATAAGCCTCACACATCCGCGCCTATGTTGTGACTCTCTGTGAGCGTTCGTGGG +GCTCGTCACCACTATGGTTGGCCGGTTAGTAGTGTGACTCCTGGTTTTCTGGAGCTTCTTTAAACCGTAGTCCAGTCAA +TGCGAATGGCACTTCACGACGGACTGTCCTTAGGTGTGAGGCTTATGAGGCACTCAGGGGA +EOF + +cat > genes.gtf <<'EOF' +chr1 example_source gene 0 50 . + . gene_id "gene1"; transcript_id "transcript1"; +chr1 example_source exon 20 40 . + . gene_id "gene1"; transcript_id "transcript1"; +chr1 example_source gene 100 219 . + . gene_id "gene2"; transcript_id "transcript2"; +chr1 example_source exon 191 210 . + . 
gene_id "gene2"; transcript_id "transcript2"; +EOF + +cat > ref.cnt <<'EOF' +1 0 0 1 +0 0 0 +0 3 +0 1 +Inf 0 +EOF + +cat > ref.genes.results <<'EOF' +gene_id transcript_id(s) length effective_length expected_count TPM FPKM +gene1 transcript1 21.00 21.00 0.00 0.00 0.00 +gene2 transcript2 20.00 20.00 0.00 0.00 0.00 +EOF + +cat > ref.isoforms.results <<'EOF' +transcript_id gene_id length effective_length expected_count TPM FPKM IsoPct +transcript1 gene1 21 21.00 0.00 0.00 0.00 0.00 +transcript2 gene2 20 20.00 0.00 0.00 0.00 0.00 +EOF + + +echo "> Generate index" + +rsem-prepare-reference \ + --gtf "genes.gtf" \ + "genome.fasta" \ + "index" + +mkdir index +mv index.* index/ + +STAR \ + ${meta_cpus:+--runThreadN $meta_cpus} \ + --runMode genomeGenerate \ + --genomeDir "index/" \ + --genomeFastaFiles "genome.fasta" \ + --sjdbGTFfile "genes.gtf" \ + --genomeSAindexNbases 2 + +######################################################################################### + +echo ">>> Test 1: Paired-end reads using STAR to align reads" +"$meta_executable" \ + --star \ + --paired \ + --input "reads_R1.fastq;reads_R2.fastq" \ + --index index \ + --id test \ + --seed 1 \ + --quiet + +echo ">>> Checking whether output exists" +[ ! -f "test.genes.results" ] && echo "Gene level expression counts file does not exist!" && exit 1 +[ ! -s "test.genes.results" ] && echo "Gene level expression counts file is empty!" && exit 1 +[ ! -f "test.isoforms.results" ] && echo "Transcript level expression counts file does not exist!" && exit 1 +[ ! -s "test.isoforms.results" ] && echo "Transcript level expression counts file is empty!" && exit 1 +[ ! -d "test.stat" ] && echo "Stats file does not exist!" 
&& exit 1 + +echo ">>> Check wheter output is correct" +diff ref.genes.results test.genes.results || { echo "Gene level expression counts file is incorrect!"; exit 1; } +diff ref.isoforms.results test.isoforms.results || { echo "Transcript level expression counts file is incorrect!"; exit 1; } +diff ref.cnt test.stat/test.cnt || { echo "Stats file is incorrect!"; exit 1; } + +##################################################################################################### + +echo "All tests succeeded!" +exit 0 From bc9cc0a6ce4e0b87a4ce47561b4812b449e101ca Mon Sep 17 00:00:00 2001 From: Emma Rousseau Date: Thu, 19 Sep 2024 05:48:45 +0200 Subject: [PATCH 25/28] Kallisto quant (#152) * initial commit dedup * Revert "initial commit dedup" This reverts commit 38f586bec0ac9e4312b016e29c3aa0bd53f292b2. * complete component * Update changelog * add help.txt * apply suggested changes (changelog, config) --- CHANGELOG.md | 16 ++- src/kallisto/kallisto_quant/config.vsh.yaml | 105 ++++++++++++++++++ src/kallisto/kallisto_quant/help.txt | 33 ++++++ src/kallisto/kallisto_quant/script.sh | 46 ++++++++ src/kallisto/kallisto_quant/test.sh | 53 +++++++++ .../kallisto_quant/test_data/abundance_1.tsv | 2 + .../kallisto_quant/test_data/abundance_2.tsv | 2 + .../test_data/index/transcriptome.idx | Bin 0 -> 1583 bytes .../kallisto_quant/test_data/reads/A_R1.fastq | 4 + .../kallisto_quant/test_data/reads/A_R2.fastq | 4 + .../kallisto_quant/test_data/script.sh | 11 ++ 11 files changed, 270 insertions(+), 6 deletions(-) create mode 100644 src/kallisto/kallisto_quant/config.vsh.yaml create mode 100644 src/kallisto/kallisto_quant/help.txt create mode 100644 src/kallisto/kallisto_quant/script.sh create mode 100644 src/kallisto/kallisto_quant/test.sh create mode 100644 src/kallisto/kallisto_quant/test_data/abundance_1.tsv create mode 100644 src/kallisto/kallisto_quant/test_data/abundance_2.tsv create mode 100644 src/kallisto/kallisto_quant/test_data/index/transcriptome.idx create mode 
100644 src/kallisto/kallisto_quant/test_data/reads/A_R1.fastq create mode 100644 src/kallisto/kallisto_quant/test_data/reads/A_R2.fastq create mode 100755 src/kallisto/kallisto_quant/test_data/script.sh diff --git a/CHANGELOG.md b/CHANGELOG.md index 9bfb5606..5d380e54 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -65,6 +65,16 @@ * `fastqc`: High throughput sequence quality control analysis tool (PR #92). +* `sortmerna`: Local sequence alignment tool for mapping, clustering, and filtering rRNA from + metatranscriptomic data (PR #146). + +* `fq_subsample`: Sample a subset of records from single or paired FASTQ files (PR #147). + +* `kallisto`: + - `kallisto_index`: Create a kallisto index (PR #149). + - `kallisto_quant`: Quantifying abundances of transcripts from RNA-Seq data, or more generally of target sequences using high-throughput sequencing reads (PR #152). + + ## MINOR CHANGES * `busco` components: update BUSCO to `5.7.1` (PR #72). @@ -161,13 +171,7 @@ - `bedtools_getfasta`: extract sequences from a FASTA file for each of the intervals defined in a BED/GFF/VCF file (PR #59). -* `sortmerna`: Local sequence alignment tool for mapping, clustering, and filtering rRNA from metatranscriptomic - data. (PR #146) - -* `fq_subsample`: Sample a subset of records from single or paired FASTQ files (PR #147). -* `kallisto`: - - `kallisto_index`: Create a kallisto index (PR #149). ## MINOR CHANGES diff --git a/src/kallisto/kallisto_quant/config.vsh.yaml b/src/kallisto/kallisto_quant/config.vsh.yaml new file mode 100644 index 00000000..e92ac6b3 --- /dev/null +++ b/src/kallisto/kallisto_quant/config.vsh.yaml @@ -0,0 +1,105 @@ +name: kallisto_quant +namespace: kallisto +description: | + Quantifying abundances of transcripts from RNA-Seq data, or more generally of target sequences using high-throughput sequencing reads. 
+keywords: [kallisto, quant, pseudoalignment] +links: + homepage: https://pachterlab.github.io/kallisto/about + documentation: https://pachterlab.github.io/kallisto/manual + repository: https://github.com/pachterlab/kallisto + issue_tracker: https://github.com/pachterlab/kallisto/issues +references: + doi: 10.1038/nbt.3519 +license: BSD 2-Clause License + +argument_groups: +- name: "Input" + arguments: + - name: "--input" + type: file + description: List of input FastQ files of size 1 and 2 for single-end and paired-end data, respectively. + direction: "input" + multiple: true + required: true + - name: "--index" + alternatives: ["-i"] + type: file + description: Kallisto genome index. + must_exist: true + required: true + +- name: "Output" + arguments: + - name: "--output_dir" + alternatives: ["-o"] + type: string + description: Directory to write output to. + required: true + +- name: "Options" + arguments: + - name: "--single" + type: boolean_true + description: Single end mode. + - name: "--single_overhang" + type: boolean_true + description: Include reads where unobserved rest of fragment is predicted to lie outside a transcript. + - name: "--fr_stranded" + type: boolean_true + description: Strand specific reads, first read forward. + - name: "--rf_stranded" + type: boolean_true + description: Strand specific reads, first read reverse. + - name: "--fragment_length" + alternatives: ["-l"] + type: double + description: The estimated average fragment length. + - name: "--sd" + alternatives: ["-s"] + type: double + description: | + The estimated standard deviation of the fragment length (default: -l, -s values are estimated + from paired end data, but are required when using --single). + - name: "--plaintext" + type: boolean_true + description: Output plaintext instead of HDF5. + - name: "--bootstrap_samples" + alternatives: ["-b"] + type: integer + description: | + Number of bootstrap samples to draw. 
Default: '0' + example: 0 + - name: "--seed" + type: integer + description: | + Random seed for bootstrap. Default: '42' + example: 42 + + +resources: +- type: bash_script + path: script.sh + +test_resources: +- type: bash_script + path: test.sh +- type: file + path: test_data + +engines: + - type: docker + image: ubuntu:22.04 + setup: + - type: docker + run: | + apt-get update && \ + apt-get install -y --no-install-recommends wget && \ + wget --no-check-certificate https://github.com/pachterlab/kallisto/releases/download/v0.50.1/kallisto_linux-v0.50.1.tar.gz && \ + tar -xzf kallisto_linux-v0.50.1.tar.gz && \ + mv kallisto/kallisto /usr/local/bin/ + - type: docker + run: | + echo "kallisto: $(kallisto version | sed 's/kallisto, version //')" > /var/software_versions.txt +runners: + - type: executable + - type: nextflow diff --git a/src/kallisto/kallisto_quant/help.txt b/src/kallisto/kallisto_quant/help.txt new file mode 100644 index 00000000..7022571b --- /dev/null +++ b/src/kallisto/kallisto_quant/help.txt @@ -0,0 +1,33 @@ +``` +kallisto quant +``` + +kallisto 0.50.1 +Computes equivalence classes for reads and quantifies abundances + +Usage: kallisto quant [arguments] FASTQ-files + +Required arguments: +-i, --index=STRING Filename for the kallisto index to be used for + quantification +-o, --output-dir=STRING Directory to write output to + +Optional arguments: +-b, --bootstrap-samples=INT Number of bootstrap samples (default: 0) + --seed=INT Seed for the bootstrap sampling (default: 42) + --plaintext Output plaintext instead of HDF5 + --single Quantify single-end reads + --single-overhang Include reads where unobserved rest of fragment is + predicted to lie outside a transcript + --fr-stranded Strand specific reads, first read forward + --rf-stranded Strand specific reads, first read reverse +-l, --fragment-length=DOUBLE Estimated average fragment length +-s, --sd=DOUBLE Estimated standard deviation of fragment length + (default: -l, -s values are estimated from 
paired + end data, but are required when using --single) +-p, --priors Priors for the EM algorithm, either as raw counts or as + probabilities. Pseudocounts are added to raw reads to + prevent zero valued priors. Supplied in the same order + as the transcripts in the transcriptome +-t, --threads=INT Number of threads to use (default: 1) + --verbose Print out progress information every 1M proccessed reads \ No newline at end of file diff --git a/src/kallisto/kallisto_quant/script.sh b/src/kallisto/kallisto_quant/script.sh new file mode 100644 index 00000000..a7105cd1 --- /dev/null +++ b/src/kallisto/kallisto_quant/script.sh @@ -0,0 +1,46 @@ +#!/bin/bash + +## VIASH START +## VIASH END + +set -eo pipefail + +unset_if_false=( par_single par_single_overhang par_rf_stranded par_fr_stranded par_plaintext ) + +for var in "${unset_if_false[@]}"; do + temp_var="${!var}" + [[ "$temp_var" == "false" ]] && unset $var +done + +IFS=";" read -ra input <<< $par_input + +# Check if par_single is not set and ensure even number of input files +if [ -z "$par_single" ]; then + if [ $((${#input[@]} % 2)) -ne 0 ]; then + echo "Error: When running in paired-end mode, the number of input files must be even." 
+ echo "Number of input files provided: ${#input[@]}" + exit 1 + fi +fi + + +mkdir -p $par_output_dir + + +kallisto quant \ + ${meta_cpus:+--threads $meta_cpus} \ + -i $par_index \ + ${par_gtf:+--gtf "${par_gtf}"} \ + ${par_single:+--single} \ + ${par_single_overhang:+--single-overhang} \ + ${par_fr_stranded:+--fr-stranded} \ + ${par_rf_stranded:+--rf-stranded} \ + ${par_plaintext:+--plaintext} \ + ${par_bootstrap_samples:+--bootstrap-samples "${par_bootstrap_samples}"} \ + ${par_fragment_length:+--fragment-length "${par_fragment_length}"} \ + ${par_sd:+--sd "${par_sd}"} \ + ${par_seed:+--seed "${par_seed}"} \ + -o $par_output_dir \ + ${input[*]} + + diff --git a/src/kallisto/kallisto_quant/test.sh b/src/kallisto/kallisto_quant/test.sh new file mode 100644 index 00000000..28e2e3ad --- /dev/null +++ b/src/kallisto/kallisto_quant/test.sh @@ -0,0 +1,53 @@ +#!/bin/bash + +echo ">>> Testing $meta_functionality_name" + +echo ">>> Test 1: Testing for paired-end reads" +"$meta_executable" \ + --index "$meta_resources_dir/test_data/index/transcriptome.idx" \ + --rf_stranded \ + --output_dir . \ + --input "$meta_resources_dir/test_data/reads/A_R1.fastq;$meta_resources_dir/test_data/reads/A_R2.fastq" + +echo ">>> Checking whether output exists" +[ ! -f "run_info.json" ] && echo "run_info.json does not exist!" && exit 1 +[ ! -s "run_info.json" ] && echo "run_info.json is empty!" && exit 1 +[ ! -f "abundance.tsv" ] && echo "abundance.tsv does not exist!" && exit 1 +[ ! -s "abundance.tsv" ] && echo "abundance.tsv is empty!" && exit 1 +[ ! -f "abundance.h5" ] && echo "abundance.h5 does not exist!" && exit 1 +[ ! -s "abundance.h5" ] && echo "abundance.h5 is empty!" 
&& exit 1 + +echo ">>> Checking if output is correct" +diff "abundance.tsv" "$meta_resources_dir/test_data/abundance_1.tsv" || { echo "abundance.tsv is not correct"; exit 1; } + +rm -rf abundance.tsv abundance.h5 run_info.json + +################################################################################ + +echo ">>> Test 2: Testing for single-end reads" +"$meta_executable" \ + --index "$meta_resources_dir/test_data/index/transcriptome.idx" \ + --rf_stranded \ + --output_dir . \ + --single \ + --input "$meta_resources_dir/test_data/reads/A_R1.fastq" \ + --fragment_length 101 \ + --sd 50 + +echo ">>> Checking whether output exists" +[ ! -f "run_info.json" ] && echo "run_info.json does not exist!" && exit 1 +[ ! -s "run_info.json" ] && echo "run_info.json is empty!" && exit 1 +[ ! -f "abundance.tsv" ] && echo "abundance.tsv does not exist!" && exit 1 +[ ! -s "abundance.tsv" ] && echo "abundance.tsv is empty!" && exit 1 +[ ! -f "abundance.h5" ] && echo "abundance.h5 does not exist!" && exit 1 +[ ! -s "abundance.h5" ] && echo "abundance.h5 is empty!" && exit 1 + +echo ">>> Checking if output is correct" +diff "abundance.tsv" "$meta_resources_dir/test_data/abundance_2.tsv" || { echo "abundance.tsv is not correct"; exit 1; } + +rm -rf abundance.tsv abundance.h5 run_info.json + +################################################################################ + +echo "All tests succeeded!" 
+exit 0 diff --git a/src/kallisto/kallisto_quant/test_data/abundance_1.tsv b/src/kallisto/kallisto_quant/test_data/abundance_1.tsv new file mode 100644 index 00000000..1de99e54 --- /dev/null +++ b/src/kallisto/kallisto_quant/test_data/abundance_1.tsv @@ -0,0 +1,2 @@ +target_id length eff_length est_counts tpm +Sheila 35 36 0 -nan diff --git a/src/kallisto/kallisto_quant/test_data/abundance_2.tsv b/src/kallisto/kallisto_quant/test_data/abundance_2.tsv new file mode 100644 index 00000000..6b3e9055 --- /dev/null +++ b/src/kallisto/kallisto_quant/test_data/abundance_2.tsv @@ -0,0 +1,2 @@ +target_id length eff_length est_counts tpm +Sheila 35 15.0373 0 -nan diff --git a/src/kallisto/kallisto_quant/test_data/index/transcriptome.idx b/src/kallisto/kallisto_quant/test_data/index/transcriptome.idx new file mode 100644 index 0000000000000000000000000000000000000000..194fec14b9b858e324345e535e00764a36100db7 GIT binary patch literal 1583 zcmd;OfPjTinh{9b$1B#!18H#}2Jt~a8A36bm2ogIJfAt+63T~DAce8EHEN$uybqZC zfg=W{5v~BrV208#c|}Et0E`c#VftWv7=48WCIhA&B!LvnOc?C|Rl)?N*?=^%HkesZ zX$A)<1EwA(4x?e}ahVTO2ct*TLqcLSJR#vQnjS{e1FUQS(dg*`Sq@nqrq10t#1V*{ z9o-$_AjH`nh=7E Date: Mon, 23 Sep 2024 14:46:08 +0200 Subject: [PATCH 26/28] Add trimgalore (#117) * add trimgalore * fix test * make output arguments optional * fix script * fix script and update test * update changelog * apply code review suggestions * separate input fastqc file arguments from other arguments * apply suggested change --------- Co-authored-by: Robrecht Cannoodt --- CHANGELOG.md | 2 + src/trimgalore/config.vsh.yaml | 297 +++++++++++++++++++++++++++ src/trimgalore/help.txt | 355 +++++++++++++++++++++++++++++++++ src/trimgalore/script.sh | 126 ++++++++++++ src/trimgalore/test.sh | 125 ++++++++++++ 5 files changed, 905 insertions(+) create mode 100644 src/trimgalore/config.vsh.yaml create mode 100644 src/trimgalore/help.txt create mode 100755 src/trimgalore/script.sh create mode 100644 src/trimgalore/test.sh diff --git 
a/CHANGELOG.md b/CHANGELOG.md index 5d380e54..0613fa25 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -75,6 +75,8 @@ - `kallisto_quant`: Quantifying abundances of transcripts from RNA-Seq data, or more generally of target sequences using high-throughput sequencing reads (PR #152). +* `trimgalore`: Quality and adapter trimming for fastq files (PR #117). + ## MINOR CHANGES * `busco` components: update BUSCO to `5.7.1` (PR #72). diff --git a/src/trimgalore/config.vsh.yaml b/src/trimgalore/config.vsh.yaml new file mode 100644 index 00000000..ae12fb10 --- /dev/null +++ b/src/trimgalore/config.vsh.yaml @@ -0,0 +1,297 @@ +name: trimgalore +description: | + A wrapper tool around Cutadapt and FastQC to consistently apply quality and adapter trimming to FastQ files. +keywords: ["trimming", "adapters"] +links: + homepage: https://github.com/FelixKrueger/TrimGalore + documentation: https://github.com/FelixKrueger/TrimGalore/blob/master/Docs/Trim_Galore_User_Guide.md + repository: https://github.com/FelixKrueger/TrimGalore +references: + doi: 10.5281/zenodo.7598955 +license: GPL-3.0 +requirements: + commands: [trim_galore] +authors: + - __merge__: /src/_authors/sai_nirmayi_yasa.yaml + roles: [ author, maintainer ] + +argument_groups: + - name: Input + arguments: + - name: "--input" + type: file + description: Input files. Note that paired-end files need to be supplied in a pairwise fashion, e.g. file1_1.fq file1_2.fq SRR2_1.fq.gz SRR2_2.fq.gz + required: true + multiple: true + example: sample1_r1.fq;sample1_r2.fq;sample2_r1.fq;sample2_r2.fq + - name: Trimming options + arguments: + - name: --quality + alternatives: -q + type: integer + description: Trim low-quality ends (below the specified Phred score) from reads in addition to adapter removal. For RRBS samples, quality trimming will be performed first, and adapter trimming is carried in a second round. Other files are quality and adapter trimmed in a single pass. 
The algorithm is the same as the one used by BWA (Subtract INT from all qualities; compute partial sums from all indices to the end of the sequence; cut sequence at the index at which the sum is minimal). + example: 20 + - name: --phred33 + type: boolean_true + description: Instructs Cutadapt to use ASCII+33 quality scores as Phred scores (Sanger/Illumina 1.9+ encoding) for quality trimming. + - name: --phred64 + type: boolean_true + description: Instructs Cutadapt to use ASCII+64 quality scores as Phred scores (Illumina 1.5 encoding) for quality trimming. + - name: --fastqc + type: boolean_true + description: Run FastQC in the default mode on the FastQ file once trimming is complete. + - name: --fastqc_args + type: string + description: Passes extra arguments (excluding files) to FastQC. If more than one argument is to be passed to FastQC they must be in the form "arg1 arg2 ...". Passing extra arguments will automatically invoke FastQC, so --fastqc does not have to be specified separately. + example: "--nogroup --noextract" + - name: --fastqc_contaminants + type: file + description: Specifies a non-default file which contains the list of contaminants for FastQC to screen overrepresented sequences against. The file must contain sets of named contaminants in the form name[tab]sequence. Lines prefixed with a hash will be ignored. + example: "contaminants.txt" + - name: --fastqc_adapters + type: file + description: Specifies a non-default file which contains the list of adapter sequences which FastQC will explicitly search against the library. The file must contain sets of named adapters in the form name[tab]sequence. Lines prefixed with a hash will be ignored. + example: "adapters.txt" + - name: --fastqc_limits + type: file + description: Specifies a non-default file which contains a set of criteria which FastQC will use to determine the warn/error limits for the various modules. 
This file can also be used to selectively remove some modules from the output altogether. The format needs to mirror the default limits.txt file found in the Configuration folder. + example: "limits.txt" + - name: --adapter + alternatives: -a + type: string + description: | + Adapter sequence to be trimmed. If not specified explicitly, Trim Galore will try to auto-detect whether the Illumina universal, Nextera transposase or Illumina small RNA adapter sequence was used. A single base may also be given as e.g. -a A{10}, to be expanded to -a AAAAAAAAAA. + At a special request, multiple adapters can also be specified like so: + -a " AGCTCCCG -a TTTCATTATAT -a TTTATTCGGATTTAT" -a2 " AGCTAGCG -a TCTCTTATAT -a TTTCGGATTTAT", + or so: + -a "file:../multiple_adapters.fa" -a2 "file:../different_adapters.fa" + Potentially in conjunction with the parameter "-n 3" to trim all adapters. + example: AGCTCCCG + - name: --adapter2 + alternatives: -a2 + type: string + description: Optional adapter sequence to be trimmed off read 2 of paired-end files. This option requires '--paired' to be specified as well. If the libraries to be trimmed are smallRNA then a2 will be set to the Illumina small RNA 5' adapter automatically (GATCGTCGGACT). A single base may also be given as e.g. -a2 A{10}, to be expanded to -a2 AAAAAAAAAA. + required: false + example: AGCTCCCG + - name: --illumina + type: boolean_true + description: Adapter sequence to be trimmed is the first 13bp of the Illumina universal adapter 'AGATCGGAAGAGC' instead of the default auto-detection of adapter sequence. + - name: --stranded_illumina + type: boolean_true + description: Adapter sequence to be trimmed is the first 13bp of the Illumina stranded mRNA or Total RNA adapter 'ACTGTCTCTTATA' instead of the default auto-detection of adapter sequence. 
+ - name: --nextera + type: boolean_true + description: Adapter sequence to be trimmed is the first 12bp of the Nextera adapter 'CTGTCTCTTATA' instead of the default auto-detection of adapter sequence. + - name: --small_rna + type: boolean_true + description: Adapter sequence to be trimmed is the first 12bp of the Illumina Small RNA 3' Adapter 'TGGAATTCTCGG' instead of the default auto-detection of adapter sequence. Selecting to trim smallRNA adapters will also lower the --length value to 18bp. If the smallRNA libraries are paired-end then a2 will be set to the Illumina small RNA 5' adapter automatically (GATCGTCGGACT) unless -a2 had been defined explicitly. + - name: --consider_already_trimmed + type: integer + description: During adapter auto-detection, the limit set by this argument allows the user to set a threshold up to which the file is considered already adapter-trimmed. If no adapter sequence exceeds this threshold, no additional adapter trimming will be performed (technically, the adapter is set to '-a X'). Quality trimming is still performed as usual. + required: false + - name: --max_length + type: integer + description: Discard reads that are longer than the specified value after trimming. This is only advised for smallRNA sequencing to remove non-small RNA sequences. + required: false + - name: --stringency + type: integer + description: Overlap with adapter sequence required to trim a sequence. Defaults to a very stringent setting of 1, i.e. even a single bp of overlapping sequence will be trimmed off from the 3' end of any read. + required: false + example: 1 + - name: --error_rate + alternatives: -e + type: double + description: Maximum allowed error rate (no. of errors divided by the length of the matching region) + required: false + example: 0.1 + - name: --gzip + type: boolean_true + description: Compress the output file with GZIP. If the input files are GZIP-compressed the output files will automatically be GZIP compressed as well. As of v0.2.8 the compression will take place on the fly. 
+ - name: --dont_gzip + type: boolean_true + description: Output files won't be compressed with GZIP. This option overrides --gzip. + - name: --length + type: integer + description: Discard reads that became shorter than the specified length because of either quality or adapter trimming. A value of '0' effectively disables this behaviour. For paired-end files, both reads of a read-pair need to be longer than the specified length to be printed out to validated paired-end files. If only one read became too short there is the possibility of keeping such unpaired single-end reads using the --retain_unpaired option. + required: false + example: 20 + - name: --max_n + type: integer + description: The total number of Ns a read may contain before it will be removed altogether.In a paired-end setting, either read exceeding this limit will result in the entire pair being removed from the trimmed output files. If COUNT is a number between 0 and 1, it is interpreted as a fraction of the read length. + required: false + - name: --trim_n + type: boolean_true + description: Removes Ns from either side of the read. This option does currently not work in RRBS mode. + - name: --no_report_file + type: boolean_true + description: If specified no report file will be generated. + - name: --suppress_warn + type: boolean_true + description: If specified any output to STDOUT or STDERR will be suppressed. + - name: --clip_R1 + type: integer + description: Instructs TrimGalore to remove given number of bp from the 5' end of read 1 (or single-end reads). This may be useful if the qualities were very poor, or if there is some sort of unwanted bias at the 5' end. + required: false + - name: --clip_R2 + type: integer + description: Instructs TrimGalore to remove given number bp from the 5' end of read 2 (paired-end reads only). This may be useful if the qualities were very poor, or if there is some sort of unwanted bias at the 5' end. 
For paired-end BS-Seq, it is recommended to remove the first few bp because the end-repair reaction may introduce a bias towards low methylation. + required: false + - name: --three_prime_clip_R1 + type: integer + description: Instructs Trim Galore to remove specified number of bp from the 3' end of read 1 (or single-end reads) AFTER adapter/quality trimming has been performed. This may remove some bias from the 3' end that is not directly related to adapter sequence or basecall quality. + required: false + - name: --three_prime_clip_R2 + type: integer + description: Instructs Trim Galore to remove bp from the 3' end of read 2 AFTER adapter/quality trimming has been performed. This may remove some unwanted bias from the 3' end that is not directly related to adapter sequence or basecall quality. + required: false + - name: --nextseq + type: integer + description: This enables the option '--nextseq-trim=3'CUTOFF' within Cutadapt, which will set a quality cutoff (that is normally given with -q instead), but qualities of G bases are ignored. This trimming is in common for the NextSeq- and NovaSeq-platforms, where basecalls without any signal are called as high-quality G bases. This is mutually exclusive with '-q INT'. + required: false + - name: --basename + type: string + description: Use specified name (PREFERRED_NAME) as the basename for output files, instead of deriving the filenames from the input files. Single-end data would be called PREFERRED_NAME_trimmed.fq(.gz), or PREFERRED_NAME_val_1.fq(.gz) and PREFERRED_NAME_val_2.fq(.gz) for paired-end data. --basename only works when 1 file (single-end) or 2 files (paired-end) are specified, but not for longer lists. + required: false + - name: Specific trimming options without adapter/quality trimming + arguments: + - name: --hardtrim5 + type: integer + description: Instead of performing adapter-/quality trimming, this option will simply hard-trim sequences to bp at the 5'-end. 
Once hard-trimming of files is complete, Trim Galore will exit. Hard-trimmed output files will end in ._5prime.fq(.gz). + required: false + - name: --hardtrim3 + type: integer + description: Instead of performing adapter-/quality trimming, this option will simply hard-trim sequences to bp at the 3'-end. Once hard-trimming of files is complete, Trim Galore will exit. Hard-trimmed output files will end in ._3prime.fq(.gz). + required: false + - name: --clock + type: boolean_true + description: In this mode, reads are trimmed in a specific way that is currently used for the Mouse Epigenetic Clock. + - name: --polyA + type: boolean_true + description: This is a new, still experimental, trimming mode to identify and remove poly-A tails from sequences. When --polyA is selected, Trim Galore attempts to identify from the first supplied sample whether sequences contain more often a stretch of either 'AAAAAAAAAA' or 'TTTTTTTTTT'. This determines if Read 1 of a paired-end end file, or single-end files, are trimmed for PolyA or PolyT. In case of paired-end sequencing, Read2 is trimmed for the complementary base from the start of the reads. The auto-detection uses a default of A{20} for Read1 (3'-end trimming) and T{150} for Read2 (5'-end trimming). These values may be changed manually using the options -a and -a2. In addition to trimming the sequences, white spaces are replaced with _ and it records in the read ID how many bases were trimmed so it can later be used to identify PolyA trimmed sequences. This is currently done by writing tags to both the start ("32:A:") and end ("_PolyA:32") of the reads. The poly-A trimming mode expects that sequences were both adapter and quality before looking for Poly-A tails, and it is the user's responsibility to carry out an initial round of trimming. 
+ - name: --implicon + type: boolean_true + description: | + This is a special mode of operation for paired-end data, such as required for the IMPLICON method, where a UMI sequence is getting transferred from the start of Read 2 to the readID of both reads. Following this, Trim Galore will exit. In it's current implementation, the UMI carrying reads come in the following format + Read 1 5' FFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF 3' + Read 2 3' UUUUUUUUFFFFFFFFFFFFFFFFFFFFFFFFFFFF 5' + Where UUUUUUUU is a random 8-mer unique molecular identifier (UMI) and FFFFFFF... is the actual fragment to be sequenced. The UMI of Read 2 (R2) is written into the read ID of both reads and removed from the actual sequence. + - name: RRBS-specific options + arguments: + - name: --rrbs + type: boolean_true + description: Specifies that the input file was an MspI digested RRBS sample (recognition site is CCGG). Single-end or Read 1 sequences (paired-end) which were adapter-trimmed will have a further 2 bp removed from their 3' end. Sequences which were merely trimmed because of poor quality will not be shortened further. Read 2 of paired-end libraries will in addition have the first 2 bp removed from the 5' end (by setting '--clip_r2 2'). This is to avoid using artificial methylation calls from the filled-in cytosine positions close to the 3' MspI site in sequenced fragments. This option is not recommended for users of the Tecan Ovation RRBS Methyl-Seq with TrueMethyl oxBS 1-16 kit (see below). + - name: --non_directional + type: boolean_true + description: Selecting this option for non-directional RRBS libraries will screen quality-trimmed sequences for 'CAA' or 'CGA' at the start of the read and, if found, removes the first two basepairs. Like with the option '--rrbs' this avoids using cytosine positions that were filled-in during the end-repair step. '--non_directional' requires '--rrbs' to be specified as well. Note that this option does not set '--clip_r2 2' in paired-end mode. 
+ - name: --keep + type: boolean_true + description: Keep the quality trimmed intermediate file. + - name: Paired-end specific options + arguments: + - name: --paired + type: boolean_true + description: This option performs length trimming of quality/adapter/RRBS trimmed reads for paired-end files. To pass the validation test, both sequences of a sequence pair are required to have a certain minimum length which is governed by the option --length (see above). If only one read passes this length threshold the other read can be rescued (see option --retain_unpaired). Using this option lets you discard too short read pairs without disturbing the sequence-by-sequence order of FastQ files which is required by many aligners. Trim Galore expects paired-end files to be supplied in a pairwise fashion, e.g. file1_1.fq file1_2.fq SRR2_1.fq.gz SRR2_2.fq.gz ... . + - name: --retain_unpaired + type: boolean_true + description: If only one of the two paired-end reads became too short, the longer read will be written to either '.unpaired_1.fq' or '.unpaired_2.fq' output files. The length cutoff for unpaired single-end reads is governed by the parameters -r1/--length_1 and -r2/--length_2. + - name: --length_1 + alternatives: -r1 + type: integer + description: Unpaired single-end read length cutoff needed for read 1 to be written to '.unpaired_1.fq' output file. These reads may be mapped in single-end mode. + example: 35 + required: false + - name: --length_2 + alternatives: -r2 + type: integer + description: Unpaired single-end read length cutoff needed for read 2 to be written to '.unpaired_2.fq' output file. These reads may be mapped in single-end mode. + required: false + example: 35 + - name: Output + arguments: + - name: --output_dir + alternatives: -o + type: file + description: If specified all output will be written to this directory instead of the current directory. 
+ direction: output + required: true + default: trimmed_output + - name: --trimmed_r1 + type: file + required: false + description: Output file for read 1. Only works when 1 file (single-end) or 2 files (paired-end) are specified, but not for longer lists. + direction: output + example: read_1.fastq + - name: --trimmed_r2 + type: file + required: false + description: Output file for read 2. Only works when 1 file (single-end) or 2 files (paired-end) are specified, but not for longer lists. + direction: output + example: read_2.fastq + - name: --trimming_report_r1 + type: file + required: false + description: Trimming report for read 1. Only works when 1 file (single-end) or 2 files (paired-end) are specified, but not for longer lists. + direction: output + example: read_1.trimming_report.txt + - name: --trimming_report_r2 + type: file + description: Trimming report for read 2. Only works when 1 file (single-end) or 2 files (paired-end) are specified, but not for longer lists. + direction: output + required: false + example: read_2.trimming_report.txt + - name: --trimmed_fastqc_html_1 + type: file + required: false + description: FastQC report for trimmed (single-end) reads (or read 1 for paired-end). Only works when 1 file (single-end) or 2 files (paired-end) are specified, but not for longer lists. + direction: output + example: read_1.fastqc.html + - name: --trimmed_fastqc_html_2 + type: file + description: FastQC report for trimmed reads (read2 for paired-end). Only works when 1 file (single-end) or 2 files (paired-end) are specified, but not for longer lists. + direction: output + required: false + example: read_2.fastqc.html + - name: --trimmed_fastqc_zip_1 + type: file + required: false + description: FastQC results for trimmed (single-end) reads (or read 1 for paired-end). Only works when 1 file (single-end) or 2 files (paired-end) are specified, but not for longer lists. 
+ direction: output + example: read_1.fastqc.zip + - name: --trimmed_fastqc_zip_2 + type: file + description: FastQC results for trimmed reads (read2 for paired-end). Only works when 1 file (single-end) or 2 files (paired-end) are specified, but not for longer lists. + direction: output + required: false + example: read_2.fastqc.zip + - name: --unpaired_r1 + type: file + required: false + description: Output file for unpired read 1. Only works when 1 file (single-end) or 2 files (paired-end) are specified, but not for longer lists. + direction: output + example: unpaired_read_1.fastq + - name: --unpaired_r2 + type: file + required: false + description: Output file for unpaired read 2. Only works when 1 file (single-end) or 2 files (paired-end) are specified, but not for longer lists. + direction: output + example: unpaired_read_2.fastq + +resources: + - type: bash_script + path: script.sh + +test_resources: + - type: bash_script + path: test.sh + +engines: +- type: docker + image: quay.io/biocontainers/trim-galore:0.6.10--hdfd78af_0 + setup: + - type: docker + run: | + echo "TrimGalore: `trim_galore --version | sed -n 's/.*version\s\+\([0-9]\+\.[0-9]\+\.[0-9]\+\).*/\1/p'`" > /var/software_versions.txt + +runners: + - type: executable + - type: nextflow diff --git a/src/trimgalore/help.txt b/src/trimgalore/help.txt new file mode 100644 index 00000000..4bf38e99 --- /dev/null +++ b/src/trimgalore/help.txt @@ -0,0 +1,355 @@ + + USAGE: + +trim_galore [options] + + +-h/--help Print this help message and exits. + +-v/--version Print the version information and exits. + +-q/--quality Trim low-quality ends from reads in addition to adapter removal. For + RRBS samples, quality trimming will be performed first, and adapter + trimming is carried in a second round. Other files are quality and adapter + trimmed in a single pass. 
The algorithm is the same as the one used by BWA + (Subtract INT from all qualities; compute partial sums from all indices + to the end of the sequence; cut sequence at the index at which the sum is + minimal). Default Phred score: 20. + +--phred33 Instructs Cutadapt to use ASCII+33 quality scores as Phred scores + (Sanger/Illumina 1.9+ encoding) for quality trimming. Default: ON. + +--phred64 Instructs Cutadapt to use ASCII+64 quality scores as Phred scores + (Illumina 1.5 encoding) for quality trimming. + +--fastqc Run FastQC in the default mode on the FastQ file once trimming is complete. + +--fastqc_args "" Passes extra arguments to FastQC. If more than one argument is to be passed + to FastQC they must be in the form "arg1 arg2 etc.". An example would be: + --fastqc_args "--nogroup --outdir /home/". Passing extra arguments will + automatically invoke FastQC, so --fastqc does not have to be specified + separately. + +-a/--adapter Adapter sequence to be trimmed. If not specified explicitly, Trim Galore will + try to auto-detect whether the Illumina universal, Nextera transposase or Illumina + small RNA adapter sequence was used. Also see '--illumina', '--nextera' and + '--small_rna'. If no adapter can be detected within the first 1 million sequences + of the first file specified or if there is a tie between several adapter sequences, + Trim Galore defaults to '--illumina' (as long as the Illumina adapter was one of the + options, else '--nextera' is the default). A single base + may also be given as e.g. -a A{10}, to be expanded to -a AAAAAAAAAA. + + At a special request, multiple adapters can also be specified like so: + -a " AGCTCCCG -a TTTCATTATAT -a TTTATTCGGATTTAT" + -a2 " AGCTAGCG -a TCTCTTATAT -a TTTCGGATTTAT", or so: + -a "file:../multiple_adapters.fa" + -a2 "file:../different_adapters.fa" + Potentially in conjucntion with the parameter "-n 3" to trim all adapters. Please note + that this is NOT needed for standard trimming! 
+ More Information here: https://github.com/FelixKrueger/TrimGalore/issues/86 + +-a2/--adapter2 Optional adapter sequence to be trimmed off read 2 of paired-end files. This + option requires '--paired' to be specified as well. If the libraries to be trimmed + are smallRNA then a2 will be set to the Illumina small RNA 5' adapter automatically + (GATCGTCGGACT). A single base may also be given as e.g. -a2 A{10}, to be expanded + to -a2 AAAAAAAAAA. + +--illumina Adapter sequence to be trimmed is the first 13bp of the Illumina universal adapter + 'AGATCGGAAGAGC' instead of the default auto-detection of adapter sequence. + +--stranded_illumina Adapter sequence to be trimmed is the first 13bp of the Illumina stranded mRNA or Total + RNA adapter 'ACTGTCTCTTATA' instead of the default auto-detection of adapter sequence. + Note that this sequence resembles the Nextera sequence with an additional A from A-tailing. + Please also see https://github.com/FelixKrueger/TrimGalore/issues/127 or + https://support.illumina.com/bulletins/2020/06/trimming-t-overhang-options-for-the-illumina-rna-library-prep-wo.html + for further information. This sequence is currently NOT included in the adapter auto-detection. + +--nextera Adapter sequence to be trimmed is the first 12bp of the Nextera adapter + 'CTGTCTCTTATA' instead of the default auto-detection of adapter sequence. + +--small_rna Adapter sequence to be trimmed is the first 12bp of the Illumina Small RNA 3' Adapter + 'TGGAATTCTCGG' instead of the default auto-detection of adapter sequence. Selecting + to trim smallRNA adapters will also lower the --length value to 18bp. If the smallRNA + libraries are paired-end then a2 will be set to the Illumina small RNA 5' adapter + automatically (GATCGTCGGACT) unless -a 2 had been defined explicitly. + +--consider_already_trimmed During adapter auto-detection, the limit set by allows the user to + set a threshold up to which the file is considered already adapter-trimmed. 
If no adapter + sequence exceeds this threshold, no additional adapter trimming will be performed (technically, + the adapter is set to '-a X'). Quality trimming is still performed as usual. + Default: NOT SELECTED (i.e. normal auto-detection precedence rules apply). + +--max_length Discard reads that are longer than bp after trimming. This is only advised for + smallRNA sequencing to remove non-small RNA sequences. + + +--stringency Overlap with adapter sequence required to trim a sequence. Defaults to a + very stringent setting of 1, i.e. even a single bp of overlapping sequence + will be trimmed off from the 3' end of any read. + +-e Maximum allowed error rate (no. of errors divided by the length of the matching + region) (default: 0.1) + +--gzip Compress the output file with GZIP. If the input files are GZIP-compressed + the output files will automatically be GZIP compressed as well. As of v0.2.8 the + compression will take place on the fly. + +--dont_gzip Output files won't be compressed with GZIP. This option overrides --gzip. + +--length Discard reads that became shorter than length INT because of either + quality or adapter trimming. A value of '0' effectively disables + this behaviour. Default: 20 bp. + + For paired-end files, both reads of a read-pair need to be longer than + bp to be printed out to validated paired-end files (see option --paired). + If only one read became too short there is the possibility of keeping such + unpaired single-end reads (see --retain_unpaired). Default pair-cutoff: 20 bp. + +--max_n COUNT The total number of Ns a read may contain before it will be removed altogether. + In a paired-end setting, either read exceeding this limit will result in the entire + pair being removed from the trimmed output files. If COUNT is a number between 0 and 1, + it is interpreted as a fraction of the read length. + +--trim-n Removes Ns from either side of the read. This option does currently not work in RRBS mode. 
+ +-o/--output_dir If specified all output will be written to this directory instead of the current + directory. If the directory doesn't exist it will be created for you. + +--no_report_file If specified no report file will be generated. + +--suppress_warn If specified any output to STDOUT or STDERR will be suppressed. + +--clip_R1 Instructs Trim Galore to remove bp from the 5' end of read 1 (or single-end + reads). This may be useful if the qualities were very poor, or if there is some + sort of unwanted bias at the 5' end. Default: OFF. + +--clip_R2 Instructs Trim Galore to remove bp from the 5' end of read 2 (paired-end reads + only). This may be useful if the qualities were very poor, or if there is some sort + of unwanted bias at the 5' end. For paired-end BS-Seq, it is recommended to remove + the first few bp because the end-repair reaction may introduce a bias towards low + methylation. Please refer to the M-bias plot section in the Bismark User Guide for + some examples. Default: OFF. + +--three_prime_clip_R1 Instructs Trim Galore to remove bp from the 3' end of read 1 (or single-end + reads) AFTER adapter/quality trimming has been performed. This may remove some unwanted + bias from the 3' end that is not directly related to adapter sequence or basecall quality. + Default: OFF. + +--three_prime_clip_R2 Instructs Trim Galore to remove bp from the 3' end of read 2 AFTER + adapter/quality trimming has been performed. This may remove some unwanted bias from + the 3' end that is not directly related to adapter sequence or basecall quality. + Default: OFF. + +--2colour/--nextseq INT This enables the option '--nextseq-trim=3'CUTOFF' within Cutadapt, which will set a quality + cutoff (that is normally given with -q instead), but qualities of G bases are ignored. + This trimming is in common for the NextSeq- and NovaSeq-platforms, where basecalls without + any signal are called as high-quality G bases. This is mutually exlusive with '-q INT'. 
+ + +--path_to_cutadapt You may use this option to specify a path to the Cutadapt executable, + e.g. /my/home/cutadapt-1.7.1/bin/cutadapt. Else it is assumed that Cutadapt is in + the PATH. + +--basename Use PREFERRED_NAME as the basename for output files, instead of deriving the filenames from + the input files. Single-end data would be called PREFERRED_NAME_trimmed.fq(.gz), or + PREFERRED_NAME_val_1.fq(.gz) and PREFERRED_NAME_val_2.fq(.gz) for paired-end data. --basename + only works when 1 file (single-end) or 2 files (paired-end) are specified, but not for longer lists. + +-j/--cores INT Number of cores to be used for trimming [default: 1]. For Cutadapt to work with multiple cores, it + requires Python 3 as well as parallel gzip (pigz) installed on the system. Trim Galore attempts to detect + the version of Python used by calling Cutadapt. If Python 2 is detected, --cores is set to 1. If the Python + version cannot be detected, Python 3 is assumed and we let Cutadapt handle potential issues itself. + + If pigz cannot be detected on your system, Trim Galore reverts to using gzip compression. Please note + that gzip compression will slow down multi-core processes so much that it is hardly worthwhile, please + see: https://github.com/FelixKrueger/TrimGalore/issues/16#issuecomment-458557103 for more info). + + Actual core usage: It should be mentioned that the actual number of cores used is a little convoluted. + Assuming that Python 3 is used and pigz is installed, --cores 2 would use 2 cores to read the input + (probably not at a high usage though), 2 cores to write to the output (at moderately high usage), and + 2 cores for Cutadapt itself + 2 additional cores for Cutadapt (not sure what they are used for) + 1 core + for Trim Galore itself. So this can be up to 9 cores, even though most of them won't be used at 100% for + most of the time. Paired-end processing uses twice as many cores for the validation (= writing out) step. 
+ --cores 4 would then be: 4 (read) + 4 (write) + 4 (Cutadapt) + 2 (extra Cutadapt) + 1 (Trim Galore) = 15. + + It seems that --cores 4 could be a sweet spot, anything above has diminishing returns. + + + +SPECIFIC TRIMMING - without adapter/quality trimming + +--hardtrim5 Instead of performing adapter-/quality trimming, this option will simply hard-trim sequences + to bp at the 5'-end. Once hard-trimming of files is complete, Trim Galore will exit. + Hard-trimmed output files will end in ._5prime.fq(.gz). Here is an example: + + before: CCTAAGGAAACAAGTACACTCCACACATGCATAAAGGAAATCAAATGTTATTTTTAAGAAAATGGAAAAT + --hardtrim5 20: CCTAAGGAAACAAGTACACT + +--hardtrim3 Instead of performing adapter-/quality trimming, this option will simply hard-trim sequences + to bp at the 3'-end. Once hard-trimming of files is complete, Trim Galore will exit. + Hard-trimmed output files will end in ._3prime.fq(.gz). Here is an example: + + before: CCTAAGGAAACAAGTACACTCCACACATGCATAAAGGAAATCAAATGTTATTTTTAAGAAAATGGAAAAT + --hardtrim3 20: TTTTTAAGAAAATGGAAAAT + +--clock In this mode, reads are trimmed in a specific way that is currently used for the Mouse + Epigenetic Clock (see here: Multi-tissue DNA methylation age predictor in mouse, Stubbs et al., + Genome Biology, 2017 18:68 https://doi.org/10.1186/s13059-017-1203-5). Following this, Trim Galore + will exit. + + In it's current implementation, the dual-UMI RRBS reads come in the following format: + + Read 1 5' UUUUUUUU CAGTA FFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF TACTG UUUUUUUU 3' + Read 2 3' UUUUUUUU GTCAT FFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF ATGAC UUUUUUUU 5' + + Where UUUUUUUU is a random 8-mer unique molecular identifier (UMI), CAGTA is a constant region, + and FFFFFFF... is the actual RRBS-Fragment to be sequenced. The UMIs for Read 1 (R1) and + Read 2 (R2), as well as the fixed sequences (F1 or F2), are written into the read ID and + removed from the actual sequence. 
Here is an example: + + R1: @HWI-D00436:407:CCAETANXX:1:1101:4105:1905 1:N:0: CGATGTTT + ATCTAGTTCAGTACGGTGTTTTCGAATTAGAAAAATATGTATAGAGGAAATAGATATAAAGGCGTATTCGTTATTG + R2: @HWI-D00436:407:CCAETANXX:1:1101:4105:1905 3:N:0: CGATGTTT + CAATTTTGCAGTACAAAAATAATACCTCCTCTATTTATCCAAAATCACAAAAAACCACCCACTTAACTTTCCCTAA + + R1: @HWI-D00436:407:CCAETANXX:1:1101:4105:1905 1:N:0: CGATGTTT:R1:ATCTAGTT:R2:CAATTTTG:F1:CAGT:F2:CAGT + CGGTGTTTTCGAATTAGAAAAATATGTATAGAGGAAATAGATATAAAGGCGTATTCGTTATTG + R2: @HWI-D00436:407:CCAETANXX:1:1101:4105:1905 3:N:0: CGATGTTT:R1:ATCTAGTT:R2:CAATTTTG:F1:CAGT:F2:CAGT + CAAAAATAATACCTCCTCTATTTATCCAAAATCACAAAAAACCACCCACTTAACTTTCCCTAA + + Following clock trimming, the resulting files (.clock_UMI.R1.fq(.gz) and .clock_UMI.R2.fq(.gz)) + should be adapter- and quality trimmed with Trim Galore as usual. In addition, reads need to be trimmed + by 15bp from their 3' end to get rid of potential UMI and fixed sequences. The command is: + + trim_galore --paired --three_prime_clip_R1 15 --three_prime_clip_R2 15 *.clock_UMI.R1.fq.gz *.clock_UMI.R2.fq.gz + + Following this, reads should be aligned with Bismark and deduplicated with UmiBam + in '--dual_index' mode (see here: https://github.com/FelixKrueger/Umi-Grinder). UmiBam recognises + the UMIs within this pattern: R1:(ATCTAGTT):R2:(CAATTTTG): as (UMI R1) and (UMI R2). + +--polyA This is a new, still experimental, trimming mode to identify and remove poly-A tails from sequences. + When --polyA is selected, Trim Galore attempts to identify from the first supplied sample whether + sequences contain more often a stretch of either 'AAAAAAAAAA' or 'TTTTTTTTTT'. This determines + if Read 1 of a paired-end end file, or single-end files, are trimmed for PolyA or PolyT. In case of + paired-end sequencing, Read2 is trimmed for the complementary base from the start of the reads. The + auto-detection uses a default of A{20} for Read1 (3'-end trimming) and T{150} for Read2 (5'-end trimming). 
+ These values may be changed manually using the options -a and -a2. + + In addition to trimming the sequences, white spaces are replaced with _ and it records in the read ID + how many bases were trimmed so it can later be used to identify PolyA trimmed sequences. This is currently done + by writing tags to both the start ("32:A:") and end ("_PolyA:32") of the reads in the following example: + + @READ-ID:1:1102:22039:36996 1:N:0:CCTAATCC + GCCTAAGGAAACAAGTACACTCCACACATGCATAAAGGAAATCAAATGTTATTTTTAAGAAAATGGAAAATAAAAACTTTATAAACACCAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAA + + @32:A:READ-ID:1:1102:22039:36996_1:N:0:CCTAATCC_PolyA:32 + GCCTAAGGAAACAAGTACACTCCACACATGCATAAAGGAAATCAAATGTTATTTTTAAGAAAATGGAAAATAAAAACTTTATAAACACC + + PLEASE NOTE: The poly-A trimming mode expects that sequences were both adapter and quality trimmed + before looking for Poly-A tails, and it is the user's responsibility to carry out an initial round of + trimming. The following sequence: + + 1) trim_galore file.fastq.gz + 2) trim_galore --polyA file_trimmed.fq.gz + 3) zcat file_trimmed_trimmed.fq.gz | grep -A 3 PolyA | grep -v ^-- > PolyA_trimmed.fastq + + Will 1) trim qualities and Illumina adapter contamination, 2) find and remove PolyA contamination. + Finally, if desired, 3) will specifically find PolyA trimmed sequences to a specific FastQ file of your choice. + +--implicon This is a special mode of operation for paired-end data, such as required for the IMPLICON method, where a UMI sequence + is getting transferred from the start of Read 2 to the readID of both reads. Following this, Trim Galore will exit. + + In it's current implementation, the UMI carrying reads come in the following format: + + Read 1 5' FFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF 3' + Read 2 3' UUUUUUUUFFFFFFFFFFFFFFFFFFFFFFFFFFFF 5' + + Where UUUUUUUU is a random 8-mer unique molecular identifier (UMI) and FFFFFFF... is the actual fragment to be + sequenced. 
The UMI of Read 2 (R2) is written into the read ID of both reads and removed from the actual sequence. + Here is an example: + + R1: @HWI-D00436:407:CCAETANXX:1:1101:4105:1905 1:N:0: CGATGTTT + ATCTAGTTCAGTACGGTGTTTTCGAATTAGAAAAATATGTATAGAGGAAATAGATATAAAGGCGTATTCGTTATTG + R2: @HWI-D00436:407:CCAETANXX:1:1101:4105:1905 3:N:0: CGATGTTT + CAATTTTGCAGTACAAAAATAATACCTCCTCTATTTATCCAAAATCACAAAAAACCACCCACTTAACTTTCCCTAA + + After --implicon trimming: + R1: @HWI-D00436:407:CCAETANXX:1:1101:4105:1905 1:N:0: CGATGTTT:CAATTTTG + ATCTAGTTCAGTACGGTGTTTTCGAATTAGAAAAATATGTATAGAGGAAATAGATATAAAGGCGTATTCGTTATTG + R2: @HWI-D00436:407:CCAETANXX:1:1101:4105:1905 3:N:0: CGATGTTT:CAATTTTG + CAGTACAAAAATAATACCTCCTCTATTTATCCAAAATCACAAAAAACCACCCACTTAACTTTCCCTAA + +RRBS-specific options (MspI digested material): + +--rrbs Specifies that the input file was an MspI digested RRBS sample (recognition + site: CCGG). Single-end or Read 1 sequences (paired-end) which were adapter-trimmed + will have a further 2 bp removed from their 3' end. Sequences which were merely + trimmed because of poor quality will not be shortened further. Read 2 of paired-end + libraries will in addition have the first 2 bp removed from the 5' end (by setting + '--clip_r2 2'). This is to avoid using artificial methylation calls from the filled-in + cytosine positions close to the 3' MspI site in sequenced fragments. + This option is not recommended for users of the Tecan Ovation RRBS Methyl-Seq with TrueMethyl + oxBS 1-16 kit (see below). + +--non_directional Selecting this option for non-directional RRBS libraries will screen + quality-trimmed sequences for 'CAA' or 'CGA' at the start of the read + and, if found, removes the first two basepairs. Like with the option + '--rrbs' this avoids using cytosine positions that were filled-in + during the end-repair step. '--non_directional' requires '--rrbs' to + be specified as well. Note that this option does not set '--clip_r2 2' in + paired-end mode. 
+ +--keep Keep the quality trimmed intermediate file. Default: off, which means + the temporary file is being deleted after adapter trimming. Only has + an effect for RRBS samples since other FastQ files are not trimmed + for poor qualities separately. + + +Note for RRBS using the Tecan Ovation RRBS Methyl-Seq with TrueMethyl oxBS 1-16 kit: + +Owing to the fact that the Tecan Ovation RRBS kit attaches a varying number of nucleotides (0-3) after each MspI +site Trim Galore should be run WITHOUT the option --rrbs. This trimming is accomplished in a subsequent +diversity trimming step afterwards (see their manual). + + + +Note for RRBS using MseI: + +If your DNA material was digested with MseI (recognition motif: TTAA) instead of MspI it is NOT necessary +to specify --rrbs or --non_directional since virtually all reads should start with the sequence +'TAA', and this holds true for both directional and non-directional libraries. As the end-repair of 'TAA' +restricted sites does not involve any cytosines it does not need to be treated especially. Instead, simply +run Trim Galore! in the standard (i.e. non-RRBS) mode. + + + + +Paired-end specific options: + +--paired This option performs length trimming of quality/adapter/RRBS trimmed reads for + paired-end files. To pass the validation test, both sequences of a sequence pair + are required to have a certain minimum length which is governed by the option + --length (see above). If only one read passes this length threshold the + other read can be rescued (see option --retain_unpaired). Using this option lets + you discard too short read pairs without disturbing the sequence-by-sequence order + of FastQ files which is required by many aligners. + + Trim Galore! expects paired-end files to be supplied in a pairwise fashion, e.g. + file1_1.fq file1_2.fq SRR2_1.fq.gz SRR2_2.fq.gz ... . 
+ + +--retain_unpaired If only one of the two paired-end reads became too short, the longer + read will be written to either '.unpaired_1.fq' or '.unpaired_2.fq' + output files. The length cutoff for unpaired single-end reads is + governed by the parameters -r1/--length_1 and -r2/--length_2. Default: OFF. + +-r1/--length_1 Unpaired single-end read length cutoff needed for read 1 to be written to + '.unpaired_1.fq' output file. These reads may be mapped in single-end mode. + Default: 35 bp. + +-r2/--length_2 Unpaired single-end read length cutoff needed for read 2 to be written to + '.unpaired_2.fq' output file. These reads may be mapped in single-end mode. + Default: 35 bp. + +Last modified on 02 02 2023. + diff --git a/src/trimgalore/script.sh b/src/trimgalore/script.sh new file mode 100755 index 00000000..1cceea4b --- /dev/null +++ b/src/trimgalore/script.sh @@ -0,0 +1,126 @@ +#!/bin/bash + +set -eo pipefail + +[[ ! -d "$par_output_dir" ]] && mkdir -p "$par_output_dir" + +IFS=";" read -ra input <<< $par_input + +unset_if_false=( + par_phred33 + par_phred64 + par_fastqc + par_illumina + par_stranded_illumina + par_nextera + par_small_rna + par_gzip + par_dont_gzip + par_trim_n + par_no_report_file + par_suppress_warn + par_clock + par_polyA + par_implicon + par_rrbs + par_non_directional + par_keep + par_paired + par_retain_unpaired +) + +for par in ${unset_if_false[@]}; do + test_val="${!par}" + [[ "$test_val" == "false" ]] && unset $par +done + +# Add FastQC file arguments to fastqc_args +fastqc_args="${par_fastqc_args}" +if [ -f "$par_fastqc_contaminants" ]; then + fastqc_args+=" --contaminants $par_fastqc_contaminants" +fi +if [ -f "$par_fastqc_adapters" ]; then + fastqc_args+=" --adapters $par_fastqc_adapters" +fi +if [ -f "$par_fastqc_limits" ]; then + fastqc_args+=" --limits $par_fastqc_limits" +fi + +trim_galore \ + ${par_quality:+-q "${par_quality}"} \ + ${par_phred33:+--phred33} \ + ${par_phred64:+--phred64 } \ + ${par_fastqc:+--fastqc } \ + 
${fastqc_args:+--fastqc_args "${fastqc_args}"} \ + ${par_adapter:+-a "${par_adapter}"} \ + ${par_adapter2:+-a2 "${par_adapter2}"} \ + ${par_illumina:+--illumina} \ + ${par_stranded_illumina:+--stranded_illumina} \ + ${par_nextera:+--nextera} \ + ${par_small_rna:+--small_rna} \ + ${par_consider_already_trimmed:+--consider_already_trimmed "${par_consider_already_trimmed}"} \ + ${par_max_length:+--max_length "${par_max_length}"} \ + ${par_stringency:+--stringency "${par_stringency}"} \ + ${par_error_rate:+-e "${par_error_rate}"} \ + ${par_gzip:+--gzip} \ + ${par_dont_gzip:+--dont_gzip} \ + ${par_length:+--length "${par_length}"} \ + ${par_max_n:+--max_n "${par_max_n}"} \ + ${par_trim_n:+--trim-n "${par_trim_n}"} \ + ${par_no_report_file:+--no_report_file} \ + ${par_suppress_warn:+--suppress_warn} \ + ${par_clip_R1:+--clip_R1 "${par_clip_R1}"} \ + ${par_clip_R2:+--clip_R2 "${par_clip_R2}"} \ + ${par_three_prime_clip_R1:+--three_prime_clip_R1 "${par_three_prime_clip_R1}"} \ + ${par_three_prime_clip_R2:+--three_prime_clip_R2 "${par_three_prime_clip_R2}"} \ + ${par_nextseq:+--nextseq "${par_nextseq}"} \ + ${par_basename:+-basename "${par_basename}"} \ + ${par_hardtrim5:+--hardtrim5 "${par_hardtrim5}"} \ + ${par_hardtrim3:+--hardtrim3 "${par_hardtrim3}"} \ + ${par_clock:+--clock} \ + ${par_polyA:+--polyA} \ + ${par_implicon:+--implicon "${par_implicon}"} \ + ${par_rrbs:+--rrbs} \ + ${par_non_directional:+--non_directional} \ + ${par_keep:+--keep} \ + ${par_paired:+--paired} \ + ${par_retain_unpaired:+--retain_unpaired} \ + ${par_length_1:+-r1 "${par_length_1}"} \ + ${par_length_2:+-r2 "${par_length_2}"} \ + ${meta_cpus:+-j "${meta_cpus}"} \ + -o $par_output_dir \ + ${input[*]} + +if [ $par_paired == "true" ]; then + + input_r1=$(basename -- "${input[0]}") + input_r2=$(basename -- "${input[1]}") + [[ ! -z "$par_trimmed_r1" ]] && mv $par_output_dir/*val_1.f*q* $par_trimmed_r1 + [[ ! -z "$par_trimmed_r2" ]] && mv $par_output_dir/*val_2.f*q* $par_trimmed_r2 + [[ ! 
-z "$par_trimming_report_r1" ]] && mv $par_output_dir/${input_r1}_trimming_report.txt $par_trimming_report_r1 + [[ ! -z "$par_trimming_report_r2" ]] && mv $par_output_dir/${input_r2}_trimming_report.txt $par_trimming_report_r2 + + if [ "$par_fastqc" == "true" ]; then + [[ ! -z "$par_trimmed_fastqc_html_1" ]] && mv $par_output_dir/*val_1_fastqc.html $par_trimmed_fastqc_html_1 + [[ ! -z "$par_trimmed_fastqc_html_2" ]] && mv $par_output_dir/*val_2_fastqc.html $par_trimmed_fastqc_html_2 + [[ ! -z "$par_trimmed_fastqc_zip_1" ]] && mv $par_output_dir/*val_1_fastqc.zip $par_trimmed_fastqc_zip_1 + [[ ! -z "$par_trimmed_fastqc_zip_2" ]] && mv $par_output_dir/*val_2_fastqc.zip $par_trimmed_fastqc_zip_2 + fi + + if [ "$par_retain_unpaired" == "true" ]; then + [[ ! -z "$par_unpaired_r1" ]] && mv $par_output_dir/*.unpaired_1.f*q* $par_unpaired_r1 + [[ ! -z "$par_unpaired_r2" ]] && mv $par_output_dir/*.unpaired_2.f*q* $par_unpaired_r2 + fi + +else + + input_r1=$(basename -- "${input[0]}") + [[ ! -z "$par_trimmed_r1" ]] && mv $par_output_dir/*_trimmed.fq* $par_trimmed_r1 + [[ ! -z "$par_trimming_report_r1" ]] && mv $par_output_dir/${input_r1}_trimming_report.txt $par_trimming_report_r1 + + if [ "$par_fastqc" == "true" ]; then + [[ ! -z "$par_trimmed_fastqc_html_1" ]] && mv $par_output_dir/*_trimmed_fastqc.html $par_trimmed_fastqc_html_1 + [[ ! -z "$par_trimmed_fastqc_zip_1" ]] && mv $par_output_dir/*_trimmed_fastqc.zip $par_trimmed_fastqc_zip_1 + fi + +fi \ No newline at end of file diff --git a/src/trimgalore/test.sh b/src/trimgalore/test.sh new file mode 100644 index 00000000..8cb3ccdb --- /dev/null +++ b/src/trimgalore/test.sh @@ -0,0 +1,125 @@ +#!/bin/bash + +set -eo pipefail + +# helper functions +assert_file_exists() { + [ -f "$1" ] || { echo "File '$1' does not exist" && exit 1; } +} +assert_file_doesnt_exist() { + [ ! -f "$1" ] || { echo "File '$1' exists but shouldn't" && exit 1; } +} +assert_file_empty() { + [ ! 
-s "$1" ] || { echo "File '$1' is not empty but should be" && exit 1; } +} +assert_file_not_empty() { + [ -s "$1" ] || { echo "File '$1' is empty but shouldn't be" && exit 1; } +} +assert_file_contains() { + grep -q "$2" "$1" || { echo "File '$1' does not contain '$2'" && exit 1; } +} +assert_file_not_contains() { + grep -q "$2" "$1" && { echo "File '$1' contains '$2' but shouldn't" && exit 1; } +} + +################################################################# + +echo ">>> Prepare test data" + +cat > example_R1.fastq <<'EOF' +@SRR6357071.22842410 22842410/1 kraken:taxid|4932 +CAAGTTTTCATCTTCAACAGCTGATTGACTTCTTTGTGGTATGCCTCGATATATTTTTCTTTTTCTTTAATATCTTTATTATAGGTGATTGCCTCATCGTA ++ +BBBBBFFFFFFFFFFFFFFF/BFFFFFFFFFFFFFFFFBFFBFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFBFFFFFFFFFFFFFFFFFFFFBF< +@SRR6357071.52260105 52260105/1 kraken:taxid|4932 +TAGACTTACCAGTACCCTTTTCGACGGCGGAAACATTCAAAATACCGTTAGAGTCGACATCGAAAGTGACTTCAATTTGTGGGACACCTCTTGGAGCTGGT ++ +BBBBBFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF/FFFFFFFFFFFFFFFF +EOF + +cat > example_R2.fastq <<'EOF' +@SRR6357071.22842410 22842410/2 kraken:taxid|4932 +CCGAGATCGAAGAAACGAATTCACCTGATTGCAGCTGTAAAAGCAGTAAAATCAATCAAACCAATACGGACAACCTTACGATACGATGAGGCAATCACCTA ++ +BBBBBFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF +@SRR6357071.52260105 52260105/2 kraken:taxid|4932 +GTTGATTCCAAGAAACTCTACCATTCCAACTAAGAAATCCGAAGTTTTCTCTACTTATGCTGACAACCAACCAGGTGTCTTGATTCAAGTCTTTGAAGGTG ++ +BBBBBFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFBFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF +EOF + +################################################################# + +echo ">>> Testing for single-end reads" +"$meta_executable" \ + --input "example_R1.fastq" \ + --trimmed_fastqc_html_1 output_se_test/example.trimmed.html \ + --trimmed_fastqc_zip_1 output_se_test/example.trimmed.zip \ + --trimmed_r1 output_se_test/example.trimmed.fastq \ + --trimming_report_r1 
output_se_test/example.trimming_report.txt \ + --fastqc \ + --output_dir output_se_test + +echo ">> Checking output" +assert_file_exists "output_se_test/example.trimmed.html" +assert_file_exists "output_se_test/example.trimmed.zip" +assert_file_exists "output_se_test/example.trimmed.fastq" +assert_file_exists "output_se_test/example.trimming_report.txt" + +echo ">> Check if output is empty" +assert_file_not_empty "output_se_test/example.trimmed.html" +assert_file_not_empty "output_se_test/example.trimmed.zip" +assert_file_not_empty "output_se_test/example.trimmed.fastq" +assert_file_not_empty "output_se_test/example.trimming_report.txt" + +echo ">> Check contents" +assert_file_contains "output_se_test/example.trimmed.fastq" "@SRR6357071.22842410 22842410/1" +assert_file_contains "output_se_test/example.trimming_report.txt" "Sequences removed because they became shorter than the length cutoff" + +################################################################# + +echo ">>> Testing for paired-end reads" +"$meta_executable" \ + --paired \ + --input "example_R1.fastq;example_R2.fastq" \ + --trimmed_fastqc_html_1 output_pe_test/example_R1.trimmed.html \ + --trimmed_fastqc_html_2 output_pe_test/example_R2.trimmed.html \ + --trimmed_fastqc_zip_1 output_pe_test/example_R1.trimmed.zip \ + --trimmed_fastqc_zip_2 output_pe_test/example_R2.trimmed.zip \ + --trimmed_r1 output_pe_test/example_R1.trimmed.fastq \ + --trimmed_r2 output_pe_test/example_R2.trimmed.fastq \ + --trimming_report_r1 output_pe_test/example_R1.trimming_report.txt \ + --trimming_report_r2 output_pe_test/example_R2.trimming_report.txt \ + --fastqc \ + --output_dir output_pe_test + +echo ">> Checking output" +assert_file_exists "output_pe_test/example_R1.trimmed.html" +assert_file_exists "output_pe_test/example_R2.trimmed.html" +assert_file_exists "output_pe_test/example_R1.trimmed.zip" +assert_file_exists "output_pe_test/example_R2.trimmed.zip" +assert_file_exists "output_pe_test/example_R1.trimmed.fastq" 
+assert_file_exists "output_pe_test/example_R2.trimmed.fastq" +assert_file_exists "output_pe_test/example_R1.trimming_report.txt" +assert_file_exists "output_pe_test/example_R2.trimming_report.txt" + +echo ">> Check if output is empty" +assert_file_not_empty "output_pe_test/example_R1.trimmed.html" +assert_file_not_empty "output_pe_test/example_R2.trimmed.html" +assert_file_not_empty "output_pe_test/example_R1.trimmed.zip" +assert_file_not_empty "output_pe_test/example_R2.trimmed.zip" +assert_file_not_empty "output_pe_test/example_R1.trimmed.fastq" +assert_file_not_empty "output_pe_test/example_R2.trimmed.fastq" +assert_file_not_empty "output_pe_test/example_R1.trimming_report.txt" +assert_file_not_empty "output_pe_test/example_R2.trimming_report.txt" + +echo ">> Check contents" +assert_file_contains "output_pe_test/example_R1.trimmed.fastq" "@SRR6357071.22842410 22842410/1" +assert_file_contains "output_pe_test/example_R2.trimmed.fastq" "@SRR6357071.22842410 22842410/2" +assert_file_contains "output_pe_test/example_R1.trimming_report.txt" "sequences processed in total" +assert_file_contains "output_pe_test/example_R2.trimming_report.txt" "Number of sequence pairs removed because at least one read was shorter than the length cutoff" + +################################################################# + +echo ">>> Test finished successfully" +exit 0 From 237a2e3a229ee589d1ebbc282526f87398e26f58 Mon Sep 17 00:00:00 2001 From: Hendrik Cannoodt Date: Fri, 27 Sep 2024 11:52:08 +0200 Subject: [PATCH 27/28] Fixes the typo raised in issue #132 (#157) * Fixes the typo raised in issue #132 * Add changelog entry * fix typo, modify script --------- Co-authored-by: jakubmajercik --- CHANGELOG.md | 4 ++++ src/falco/config.vsh.yaml | 2 +- src/falco/script.sh | 4 ++-- 3 files changed, 7 insertions(+), 3 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 0613fa25..a2aa5387 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -9,6 +9,10 @@ * `rsem/rsem_calculate_expression`: 
Calculate expression levels (PR #93). +## BREAKING CHANGES + +* `falco`: Fix a typo in the `--reverse_complement` argument (PR #157). + ## MINOR CHANGES * Upgrade to Viash 0.9.0. diff --git a/src/falco/config.vsh.yaml b/src/falco/config.vsh.yaml index de9906ef..a161e252 100644 --- a/src/falco/config.vsh.yaml +++ b/src/falco/config.vsh.yaml @@ -86,7 +86,7 @@ argument_groups: bisulfite sequencing, and more Ts and fewer Cs are therefore expected and will be accounted for in base content. - - name: --reverse_complliment + - name: --reverse_complement alternatives: [-r] type: boolean_true description: | diff --git a/src/falco/script.sh b/src/falco/script.sh index 43f5efe5..13e2eab4 100644 --- a/src/falco/script.sh +++ b/src/falco/script.sh @@ -4,7 +4,7 @@ set -eo pipefail [[ "$par_nogroup" == "false" ]] && unset par_nogroup [[ "$par_bisulfite" == "false" ]] && unset par_bisulfite -[[ "$par_reverse_compliment" == "false" ]] && unset par_reverse_compliment +[[ "$par_reverse_complement" == "false" ]] && unset par_reverse_complement IFS=";" read -ra input <<< $par_input @@ -15,7 +15,7 @@ $(which falco) \ ${par_limits:+--limits "$par_limits"} \ ${par_subsample:+-subsample $par_subsample} \ ${par_bisulfite:+-bisulfite} \ - ${par_reverse_compliment:+-reverse-compliment} \ + ${par_reverse_complement:+-reverse-complement} \ ${par_outdir:+--outdir "$par_outdir"} \ ${par_format:+--format "$par_format"} \ ${par_data_filename:+-data-filename "$par_data_filename"} \ From 0a0edcacb5368517d249210022363bd9265f1bf5 Mon Sep 17 00:00:00 2001 From: Dries Schaumont <5946712+DriesSchaumont@users.noreply.github.com> Date: Thu, 3 Oct 2024 14:46:57 +0200 Subject: [PATCH 28/28] Cutadapt: fix non-functional action parameter (#161) * Cutadapt: fix non-functional action parameter * Add PR number --- CHANGELOG.md | 4 ++++ src/cutadapt/script.sh | 2 +- 2 files changed, 5 insertions(+), 1 deletion(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index a2aa5387..d1654375 100644 --- a/CHANGELOG.md +++ 
b/CHANGELOG.md @@ -13,6 +13,10 @@ * `falco`: Fix a typo in the `--reverse_complement` argument (PR #157). +## BUG FIXES + +* `cutadapt`: fix the the non-functional `action` parameter (PR #161). + ## MINOR CHANGES * Upgrade to Viash 0.9.0. diff --git a/src/cutadapt/script.sh b/src/cutadapt/script.sh index 20c92724..d181e2b0 100644 --- a/src/cutadapt/script.sh +++ b/src/cutadapt/script.sh @@ -108,7 +108,7 @@ input_args=$(echo \ ${par_overlap:+--overlap "${par_overlap}"} \ ${par_match_read_wildcards:+--match-read-wildcards} \ ${par_no_match_adapter_wildcards:+--no-match-adapter-wildcards} \ - ${par_action:+--action "${par_action}"} \ + ${par_action:+--action="${par_action}"} \ ${par_revcomp:+--revcomp} \ ) debug "Arguments to cutadapt:"