From 80aaf333dc1245b8bbec9aa7b628d85c5e413f4b Mon Sep 17 00:00:00 2001 From: Emma Rousseau Date: Fri, 13 Sep 2024 09:10:30 +0200 Subject: [PATCH] Add Kallisto index (#149) --- CHANGELOG.md | 6 +- src/kallisto/kallisto_index/Kallisto | Bin 0 -> 2439 bytes src/kallisto/kallisto_index/config.vsh.yaml | 94 ++++++++++++++++++ src/kallisto/kallisto_index/help.txt | 21 ++++ src/kallisto/kallisto_index/script.sh | 34 +++++++ src/kallisto/kallisto_index/test.sh | 35 +++++++ .../kallisto_index/test_data/d_list.fasta | 5 + .../test_data/transcriptome.fasta | 23 +++++ 8 files changed, 217 insertions(+), 1 deletion(-) create mode 100644 src/kallisto/kallisto_index/Kallisto create mode 100644 src/kallisto/kallisto_index/config.vsh.yaml create mode 100644 src/kallisto/kallisto_index/help.txt create mode 100644 src/kallisto/kallisto_index/script.sh create mode 100644 src/kallisto/kallisto_index/test.sh create mode 100644 src/kallisto/kallisto_index/test_data/d_list.fasta create mode 100644 src/kallisto/kallisto_index/test_data/transcriptome.fasta diff --git a/CHANGELOG.md b/CHANGELOG.md index d88d0996..846007d8 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -149,7 +149,11 @@ * `sortmerna`: Local sequence alignment tool for mapping, clustering, and filtering rRNA from metatranscriptomic data. (PR #146) -* `fq_subsample`: Sample a subset of records from single or paired FASTQ files (PR #147). +* `fq_subsample`: Sample a subset of records from single or paired FASTQ files (PR #147). + +* `kallisto`: + - `kallisto_index`: Create a kallisto index (PR #149). + ## MINOR CHANGES diff --git a/src/kallisto/kallisto_index/Kallisto b/src/kallisto/kallisto_index/Kallisto new file mode 100644 index 0000000000000000000000000000000000000000..3c7b5b2bff962965d99ca3f9a4a6b6af6da1f3f0 GIT binary patch literal 2439 zcmeHJTTBx{6rFCj(iW7(R4PH@f{iwcTCEBOksY_VO)N&jMkQb@MkU0z5b%pZJItq>6y3i5QJC02jb;7mKSNlWm{Po|xmnK|d)x%cj5cE^Hf zo5Ms=gP>q-=Dx`Y&8Tam%b*(*sAYc)aw*sH`%Qmb#irI!wJd#g^%N{jJ942Y*B;7v zR8myiWp1c$UHLL&=A`AC^o$*g(yPKxE9&k2TL(+#tPc&Aci!%O@N%%KUe}u^mgT!# zi%Y`WO!oXjbAkQw#f0xZP+kZo&F8eXH{r?boldZcY~7r&DBNCpW#D;g<%>%V>88S* zn(fP&_poKg&FeY4ul8ovgjcuEYhCLuEry9tsbgo; zfcbNEMVB@;wf0@jF3-`vqIs3Vln>G5z~*-Q*|O7R4?+L_8<&^;{;|Iv=yw^0x?1Fq zKKH{t-pk+KZ7B%++|bB;H`mVIyPQk@FmSG}@6usnuL2HvP5$1gzh1RfdlA=oM8O*p9pU%UxMoXtZa4MiI;e6lN9)Zj9SsczoL4F-nxW7gjS;lL#ITXLZTe7Tm{6y<^@7*BFcmyOu`fe-N>GMM_MNm8n3s15~-A-du zs>S(M$mE>7gVQsBMnsW@M&}f>b(D#qkcNO}MG=t0Wgt?+7*11VW2Q|{QbnyveDt-4O(8?_>}HQ+|nsHK}JD> zEI}_&#snCR!wU`=ImWmY2!IuUp@YzBuGIbjA?Q;}NCbS6Ej{38{Q(Ed(9~7CHlh~@ z(zu?LfHTHk>o~Hk>ib8~11>`F@%qmr>8X$)4UE=ZAnP=qIJp|ns6M_j(fMdSqjeZP zKmX@E#Gf*HzUVyzWeLinBtr;M7o$H(mPA>GqA1d9hM{`wuLpH{uWG16io*!3&Oq!~ zY>JwOK3Z%+$HT0KEnpYTsK*f41@5@T5O@KlCdndB3}+_eA<9%le@sX@NTS*o#e2qq z{mZk6y~(I%5^AViqJ%~m(IWP&+TTT!n9y(~NA!$eA9;u!LWn;?@K-_t>bR9cmu5}*J8 literal 0 HcmV?d00001 diff --git a/src/kallisto/kallisto_index/config.vsh.yaml b/src/kallisto/kallisto_index/config.vsh.yaml new file mode 100644 index 00000000..2c4f65c7 --- /dev/null +++ b/src/kallisto/kallisto_index/config.vsh.yaml @@ -0,0 +1,94 @@ +name: kallisto_index +namespace: kallisto +description: | + Build a Kallisto index for the transcriptome to use Kallisto in the mapping-based mode. +keywords: [kallisto, index] +links: + homepage: https://pachterlab.github.io/kallisto/about + documentation: https://pachterlab.github.io/kallisto/manual + repository: https://github.com/pachterlab/kallisto + issue_tracker: https://github.com/pachterlab/kallisto/issues +references: + doi: https://doi.org/10.1038/nbt.3519 +license: BSD 2-Clause License + +argument_groups: +- name: "Input" + arguments: + - name: "--input" + type: file + description: | + Path to a FASTA-file containing the transcriptome sequences, either in plain text or + compressed (.gz) format. + required: true + - name: "--d_list" + type: file + description: | + Path to a FASTA-file containing sequences to mask from quantification. + +- name: "Output" + arguments: + - name: "--index" + type: file + direction: output + example: Kallisto_index + +- name: "Options" + arguments: + - name: "--kmer_size" + type: integer + description: | + Kmer length passed to indexing step of pseudoaligners (default: '31'). + example: 31 + - name: "--make_unique" + type: boolean_true + description: | + Replace repeated target names with unique names. + - name: "--aa" + type: boolean_true + description: | + Generate index from a FASTA-file containing amino acid sequences. + - name: "--distiguish" + type: boolean_true + description: | + Generate index where sequences are distinguished by the sequence names. + - name: "--min_size" + alternatives: ["-m"] + type: integer + description: | + Length of minimizers (default: automatically chosen). + - name: "--ec_max_size" + alternatives: ["-e"] + type: integer + description: | + Maximum number of targets in an equivalence class (default: no maximum). + - name: "--tmp" + alternatives: ["-T"] + type: string + description: | + Path to a directory for temporary files. + example: "tmp" + +resources: + - type: bash_script + path: script.sh + +test_resources: + - type: bash_script + path: test.sh + - path: test_data + +engines: + - type: docker + image: ubuntu:22.04 + setup: + - type: docker + run: | + apt-get update && \ + apt-get install -y --no-install-recommends wget && \ + wget --no-check-certificate https://github.com/pachterlab/kallisto/releases/download/v0.50.1/kallisto_linux-v0.50.1.tar.gz && \ + tar -xzf kallisto_linux-v0.50.1.tar.gz && \ + mv kallisto/kallisto /usr/local/bin/ +runners: + - type: executable + - type: nextflow diff --git a/src/kallisto/kallisto_index/help.txt b/src/kallisto/kallisto_index/help.txt new file mode 100644 index 00000000..28778ac0 --- /dev/null +++ b/src/kallisto/kallisto_index/help.txt @@ -0,0 +1,21 @@ +``` +kallisto index +``` +kallisto 0.50.1 +Builds a kallisto index + +Usage: kallisto index [arguments] FASTA-files + +Required argument: +-i, --index=STRING Filename for the kallisto index to be constructed + +Optional argument: +-k, --kmer-size=INT k-mer (odd) length (default: 31, max value: 31) +-t, --threads=INT Number of threads to use (default: 1) +-d, --d-list=STRING Path to a FASTA-file containing sequences to mask from quantification + --make-unique Replace repeated target names with unique names + --aa Generate index from a FASTA-file containing amino acid sequences + --distinguish Generate index where sequences are distinguished by the sequence name +-T, --tmp=STRING Temporary directory (default: tmp) +-m, --min-size=INT Length of minimizers (default: automatically chosen) +-e, --ec-max-size=INT Maximum number of targets in an equivalence class (default: no maximum) diff --git a/src/kallisto/kallisto_index/script.sh b/src/kallisto/kallisto_index/script.sh new file mode 100644 index 00000000..59a5d3de --- /dev/null +++ b/src/kallisto/kallisto_index/script.sh @@ -0,0 +1,34 @@ +#!/bin/bash + +## VIASH START +## VIASH END + +set -eo pipefail + +unset_if_false=( par_make_unique par_aa par_distinguish ) + +for var in "${unset_if_false[@]}"; do + temp_var="${!var}" + [[ "$temp_var" == "false" ]] && unset $var +done + +if [ -n "$par_kmer_size" ]; then + if [[ "$par_kmer_size" -lt 1 || "$par_kmer_size" -gt 31 || $(( par_kmer_size % 2 )) -eq 0 ]]; then + echo "Error: Kmer size must be an odd number between 1 and 31." + exit 1 + fi +fi + +kallisto index \ + -i "${par_index}" \ + ${par_kmer_size:+--kmer-size "${par_kmer_size}"} \ + ${par_make_unique:+--make-unique} \ + ${par_aa:+--aa} \ + ${par_distinguish:+--distinguish} \ + ${par_min_size:+--min-size "${par_min_size}"} \ + ${par_ec_max_size:+--ec-max-size "${par_ec_max_size}"} \ + ${par_d_list:+--d-list "${par_d_list}"} \ + ${meta_cpus:+--cpu "${meta_cpus}"} \ + ${par_tmp:+--tmp "${par_tmp}"} \ + "${par_input}" + diff --git a/src/kallisto/kallisto_index/test.sh b/src/kallisto/kallisto_index/test.sh new file mode 100644 index 00000000..2646dcd8 --- /dev/null +++ b/src/kallisto/kallisto_index/test.sh @@ -0,0 +1,35 @@ +#!/bin/bash + +echo ">>>Test 1: Testing $meta_functionality_name with non-default k-mer size" + +"$meta_executable" \ + --input "$meta_resources_dir/test_data/transcriptome.fasta" \ + --index Kallisto \ + --kmer_size 21 + + +echo ">>> Checking whether output exists and is correct" +[ ! -f "Kallisto" ] && echo "Kallisto index does not exist!" && exit 1 +[ ! -s "Kallisto" ] && echo "Kallisto index is empty!" && exit 1 + +kallisto inspect Kallisto 2> test.txt +grep "number of k-mers: 989" test.txt || { echo "The content of the index seems to be incorrect." && exit 1; } + +################################################################################ + +echo ">>>Test 2: Testing $meta_functionality_name with d_list argument" + +"$meta_executable" \ + --input "$meta_resources_dir/test_data/transcriptome.fasta" \ + --index Kallisto \ + --d_list "$meta_resources_dir/test_data/d_list.fasta" + +echo ">>> Checking whether output exists and is correct" +[ ! -f "Kallisto" ] && echo "Kallisto index does not exist!" && exit 1 +[ ! -s "Kallisto" ] && echo "Kallisto index is empty!" && exit 1 + +kallisto inspect Kallisto 2> test.txt +grep "number of k-mers: 959" test.txt || { echo "The content of the index seems to be incorrect." && exit 1; } + +echo "All tests succeeded!" +exit 0 diff --git a/src/kallisto/kallisto_index/test_data/d_list.fasta b/src/kallisto/kallisto_index/test_data/d_list.fasta new file mode 100644 index 00000000..ad5e05bf --- /dev/null +++ b/src/kallisto/kallisto_index/test_data/d_list.fasta @@ -0,0 +1,5 @@ +>YAL067W-A CDS=1-228 +ATGCCAATTATAGGGGTGCCGAGGTGCCTTATAAAACCCTTTTCTGTGCCTGTGACATTTCCTTTTTCGG +TCAAAAAGAATATCCGAATTTTAGATTTGGACCCTCGTACAGAAGCTTATTGTCTAAGCCTGAATTCAGT +CTGCTTTAAACGGCTTCCGCGGAGGAAATATTTCCATCTCTTGAATTCGTACAACATTAAACGTGTGTTG +GGAGTCGTATACTGTTAG diff --git a/src/kallisto/kallisto_index/test_data/transcriptome.fasta b/src/kallisto/kallisto_index/test_data/transcriptome.fasta new file mode 100644 index 00000000..94c06163 --- /dev/null +++ b/src/kallisto/kallisto_index/test_data/transcriptome.fasta @@ -0,0 +1,23 @@ +>YAL069W CDS=1-315 +ATGATCGTAAATAACACACACGTGCTTACCCTACCACTTTATACCACCACCACATGCCATACTCACCCTC +ACTTGTATACTGATTTTACGTACGCACACGGATGCTACAGTATATACCATCTCAAACTTACCCTACTCTC +AGATTCCACTTCACTCCATGGCCCATCTCTCACTGAATCAGTACCAAATGCACTCACATCATTATGCACG +GCACTTGCCTCAGCGGTCTATACCCTGTGCCATTTACCCATAACGCCCATCATTATCCACATTTTGATAT +CTATATCTCATTCGGCGGTCCCAAATATTGTATAA +>YAL068W-A CDS=1-255 +ATGCACGGCACTTGCCTCAGCGGTCTATACCCTGTGCCATTTACCCATAACGCCCATCATTATCCACATT +TTGATATCTATATCTCATTCGGCGGTCCCAAATATTGTATAACTGCCCTTAATACATACGTTATACCACT +TTTGCACCATATACTTACCACTCCATTTATATACACTTATGTCAATATTACAGAAAAATCCCCACAAAAA +TCACCTAAACATAAAAATATTCTACTTTTCAACAATAATACATAA +>YAL068C CDS=1-363 +ATGGTCAAATTAACTTCAATCGCCGCTGGTGTCGCTGCCATCGCTGCTACTGCTTCTGCAACCACCACTC +TAGCTCAATCTGACGAAAGAGTCAACTTGGTGGAATTGGGTGTCTACGTCTCTGATATCAGAGCTCACTT +AGCCCAATACTACATGTTCCAAGCCGCCCACCCAACTGAAACCTACCCAGTCGAAGTTGCTGAAGCCGTT +TTCAACTACGGTGACTTCACCACCATGTTGACCGGTATTGCTCCAGACCAAGTGACCAGAATGATCACCG +GTGTTCCATGGTACTCCAGCAGATTAAAGCCAGCCATCTCCAGTGCTCTATCCAAGGACGGTATCTACAC +TATCGCAAACTAG +>YAL067W-A CDS=1-228 +ATGCCAATTATAGGGGTGCCGAGGTGCCTTATAAAACCCTTTTCTGTGCCTGTGACATTTCCTTTTTCGG +TCAAAAAGAATATCCGAATTTTAGATTTGGACCCTCGTACAGAAGCTTATTGTCTAAGCCTGAATTCAGT +CTGCTTTAAACGGCTTCCGCGGAGGAAATATTTCCATCTCTTGAATTCGTACAACATTAAACGTGTGTTG +GGAGTCGTATACTGTTAG \ No newline at end of file