From 6643f70b424b2afcded3553c546b9ef7dd6a581f Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Tommi=20M=C3=A4klin?= Date: Thu, 19 Mar 2020 14:29:16 +0200 Subject: [PATCH 1/7] add a tutorial with example commands for running single steps of the pipeline --- docs/TUTORIAL.md | 153 +++++++++++++++++++++++++++++++++++++++ docs/scripts/get_read.sh | 25 +++++++ 2 files changed, 178 insertions(+) create mode 100644 docs/TUTORIAL.md create mode 100755 docs/scripts/get_read.sh diff --git a/docs/TUTORIAL.md b/docs/TUTORIAL.md new file mode 100644 index 0000000..dcedd3d --- /dev/null +++ b/docs/TUTORIAL.md @@ -0,0 +1,153 @@ +# Genomic epidemiology with mixed samples: a tutorial +This tutorial constains instructions on how to repproduce the results +of the three main synthetic experiments from *Genomic epidemiology +with mixed samples*, Mäklin et al. 2020, in preparation. + +## Requirements +### mGEMS pipeline +- [Themisto](https://github.com/jnalanko/Themisto) +- [mSWEEP](https://github.com/probic/mSWEEP) +- [mGEMS](https://github.com/probic/mGEMS) +- [shovill](https://github.com/tseemann/shovill/) + +### Phylogenetic analysis +- [snippy](https://github.com/tseemann/snippy/) +- [RAxML-NG](https://github.com/amkozlov/raxml-ng) + +## Tutorial +### Table of Contents + +- [Reference data](#referencedata) +- [Synthetic mixed samples](#mixedsamples) +- [Indexing](#indexing) +- [Pseudoalignment](#pseudoalignment) +- [Abundance estimation](#estimation) +- [Binning](#binning) +- [Assembly](#assembly) +- [SNP calling](#snpcalling) +- [Phylogenetic inference](#phylogenetics) + + +### Reference data +Alternatively, download the reference data from **figshare (TODO:: ADD)** +- [*Escherichia coli*]() +- [*Enterococcus faecalis*]() +- [*Staphylococcus aureus*]() + +### Synthetic mixed samples +Download the reads from the ENA using the 'get_read.sh' script in the +scripts folder +``` +scripts/get_read.sh ERR434377 +scripts/get_read.sh ERR435484 +scripts/get_read.sh ERR905810 +``` +Concatenate the read files together +``` +zcat ERR434377_1.fastq.gz >> ecoli-1_1.fastq +zcat ERR434377_2.fastq.gz >> ecoli-1_2.fastq +zcat ERR435484_1.fastq.gz >> ecoli-1_1.fastq +zcat ERR435484_2.fastq.gz >> ecoli-1_2.fastq +zcat ERR905810_1.fastq.gz >> ecoli-1_1.fastq +zcat ERR905810_2.fastq.gz >> ecoli-1_2.fastq +``` +Compress the mixed sample +``` +gzip ecoli-1_1.fastq +gzip ecoli-1_2.fastq +``` +Remove the original files +``` +rm ERR434377_1.fastq.gz +rm ERR434377_2.fastq.gz +rm ERR435484_1.fastq.gz +rm ERR435484_2.fastq.gz +rm ERR905810_1.fastq.gz +rm ERR905810_2.fastq.gz +``` + +### Indexing +Create a *31*-mer pseudoalignment index with Themisto using four +threads and maximum 8192 megabytes of RAM. +``` +mkdir mGEMS-ecoli-reference +mkdir mGEMS-ecoli-reference/tmp +build_index --k 31 --input-file mGEMS-ecoli-reference-v1.0.0.fasta --auto-colors --index-dir mGEMS-ecoli-reference --temp-dir mGEMS-ecoli-reference/tmp --mem-megas 8192 --n-threads 4 +``` + +### Pseudoalignment +Align the mixed sample files against the index using four threads +``` +pseudoalign --query-file ecoli-1_1.fastq.gz --outfile ecoli-1_1.aln --index-dir mGEMS-ecoli-reference --temp-dir mGEMS-ecoli-reference/tmp -n-threads 4 +pseudoalign --query-file ecoli-1_2.fastq.gz --outfile ecoli-1_2.aln --index-dir mGEMS-ecoli-reference --temp-dir mGEMS-ecoli-reference/tmp --n-threads 4 +``` + +### Abundance estimation +Estimate the relative abundances with mSWEEP and write the results and posterior +probabilities using four threads +``` +mSWEEP --themisto-1 ecoli-1_1.aln --themisto-2 ecoli-1_2.aln --themisto-index mGEMS-ecoli-reference -i mGEMS-ecoli-reference-v1.0.0.grouping -o ecoli-1 --write-probs --gzip-probs -t 4 +``` + +### Binning +Bin the reads with mGEMS and write the binned samples to the +'ecoli-1' folder. +``` +mkdir ecoli-1 +mGEMS -r ecoli-1_1.fastq.gz,ecoli-1_2.fastq.gz --themisto-alns ecoli-1_1.aln,ecoli-1_2.aln -o ecoli-1 --probs ecoli-1_probs.csv.gz -a ecoli-1_abundances.txt --index mGEMS-ecoli-reference +``` +Note this will create bins for **all** reference lineages. If know +which lineages the samples originate from (in our case Escherichia +coli ST131 A, B, and C2), use the '--groups' option to only create +those bins +``` +mGEMS --groups Escherichia_coli_ST131-A,Escherichia_coli_ST131-B,Escherichia_coli_ST131-C2 -r ecoli-1_1.fastq.gz,ecoli-1_2.fastq.gz --themisto-alns ecoli-1_1.aln,ecoli-1_2.aln -o ecoli-1 --probs ecoli-1_probs.csv.gz -a ecoli-1_abundances.txt --index mGEMS-ecoli-reference +``` + +### Assembly +Assemble the sequences with shovill using 4 threads and maximum of +8192 megabytes of RAM +``` +shovill --outdir ecoli-1/Escherichia_coli_ST131-A --R1 ecoli-1/Escherichia_coli_ST131-A_1.fastq.gz --R2 ecoli-1/Escherichia_coli_ST131-A_2.fastq.gz --cpus 4 --ram 8 +shovill --outdir ecoli-1/Escherichia_coli_ST131-B --R1 ecoli-1/Escherichia_coli_ST131-B_1.fastq.gz --R2 ecoli-1/Escherichia_coli_ST131-B_2.fastq.gz --cpus 4 --ram 8 +shovill --outdir ecoli-1/Escherichia_coli_ST131-C2 --R1 ecoli-1/Escherichia_coli_ST131-C2_1.fastq.gz --R2 ecoli-1/Escherichia_coli_ST131-C2_2.fastq.gz --cpus 4 --ram 8 +``` + +### SNP calling +Download the reference sequence 'NCTC13441' from the ENA +``` +wget -O NCTC13441.fasta.gz http://ftp.ebi.ac.uk/pub/databases/ena/wgs/public/uf/UFZF01.fasta.gz +``` +Call SNPs in the genome with snippy +``` +mkdir snippy-tmp +snippy --outdir ecoli-1-Escherichia_coli_ST131-A --ctgs ecoli-1/Escherichia_coli_ST131-A/contigs.fa --ref NCTC13441.fasta.gz --cpus 4 --ram 8 --tmpdir snippy-tmp +snippy --outdir ecoli-1-Escherichia_coli_ST131-B --ctgs ecoli-1/Escherichia_coli_ST131-B/contigs.fa --ref NCTC13441.fasta.gz --cpus 4 --ram 8 --tmpdir snippy-tmp +snippy --outdir ecoli-1-Escherichia_coli_ST131-C2 --ctgs ecoli-1/Escherichia_coli_ST131-C@/contigs.fa --ref NCTC13441.fasta.gz --cpus 4 --ram 8 --tmpdir snippy-tmp +``` +Build the core SNP alignment with snippy +``` +snippy-core --ref ecoli-1-Escherichia_coli_ST131-A/ref.fa ecoli-1-Escherichia_coli_ST131-A ecoli-1-Escherichia_coli_ST131-B ecoli-1-Escherichia_coli_ST131-C2 +``` + +### Phylogenetic inference +Use RAxML-NG to infer a maximum likelihood phylogeny from 100 starting +parsimony and random trees under the GTR+G4 model +``` +raxml-ng --search --msa core.full.aln --prefix CT --threads 4 --tree rand{100},pars{100} --model GTR+G4 +``` +Calculate bootstrap support values with 1000 replicates +``` +raxml-ng --bootstrap --msa core.full.aln.raxml.rba --bs-trees 1000 --prefix CB --threads 4 +``` +Perform a bootstrap convergence check +``` +raxml-ng --bsconverge --bs-trees CB.raxml.bootstraps --prefix CS --seed 2 --threads 4 +``` +Add the bootstrap support values to the maximum likelihood tree with +the best likelihood +``` +raxml-ng --support --tree CT.raxml.bestTree --bs-trees CB.raxml.bootstraps --prefix CS --threads 4 +``` +The best tree with the bootstrap values will be written in the +'CS.support' file. diff --git a/docs/scripts/get_read.sh b/docs/scripts/get_read.sh new file mode 100755 index 0000000..c93d83d --- /dev/null +++ b/docs/scripts/get_read.sh @@ -0,0 +1,25 @@ +ftppath="ftp://ftp.sra.ebi.ac.uk/vol1/fastq" + +strlen=$((${#1})) +dir1="/"${1:0:6} + +if [ ${strlen} -lt 10 ] +then +dir2="/" +elif [ ${strlen} -lt 11 ] +then +dir2="/00"${1: -1}"/" +elif [ ${strlen} -lt 12 ] +then +dir2="/0"${1: -2}"/" +elif [ ${strlen} -lt 13 ] +then +dir2="/"${1: -3}"/" +else +echo "check accession number" +fi + +dlpath=$ftppath$dir1$dir2$1"/" +echo "downloading reads from path: "$dlpath + +wget $dlpath"*" From 1c75b801b462ff05f3a4e2d3b85cbcd7aab5555b Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Tommi=20M=C3=A4klin?= Date: Sun, 22 Mar 2020 10:59:46 +0200 Subject: [PATCH 2/7] change tutorial to show how to reproduce the paper trees and samples --- docs/TUTORIAL.md | 199 +++++++++++++------ docs/scripts/{get_read.sh => get_forward.sh} | 3 +- docs/scripts/get_reverse.sh | 24 +++ 3 files changed, 160 insertions(+), 66 deletions(-) rename docs/scripts/{get_read.sh => get_forward.sh} (85%) create mode 100755 docs/scripts/get_reverse.sh diff --git a/docs/TUTORIAL.md b/docs/TUTORIAL.md index dcedd3d..c325e5f 100644 --- a/docs/TUTORIAL.md +++ b/docs/TUTORIAL.md @@ -3,6 +3,14 @@ This tutorial constains instructions on how to repproduce the results of the three main synthetic experiments from *Genomic epidemiology with mixed samples*, Mäklin et al. 2020, in preparation. +The tutorial will focus on reproducing the *Escherichia coli* +experiment but contains instructions on how to adapt the scripts to +the *Enterococcus faecalis* and *Staphylococcus aureus* experiments. + +For quick instructions on how to run the pipeline in a general +setting, please refer to the README.md file in the root of this +repository. + ## Requirements ### mGEMS pipeline - [Themisto](https://github.com/jnalanko/Themisto) @@ -14,9 +22,27 @@ with mixed samples*, Mäklin et al. 2020, in preparation. - [snippy](https://github.com/tseemann/snippy/) - [RAxML-NG](https://github.com/amkozlov/raxml-ng) +### Extra (macOS only) +#### GNU coreutils +On a macOS system, you'll also need to install GNU coreutils from +homebrew and alias the macOS zcat command to the GNU zcat command for +the duration of the session +``` +brew install coreutils +alias zcat=gzcat +ulimit -n 2048 +``` +#### Concurrent file connections limit +macOS also limits the number of concurrent file connections, which +will have to be increased to run Themisto and shovill +``` +ulimit -n 2048 +``` + ## Tutorial ### Table of Contents +- [Select a species](#selectspecies) - [Reference data](#referencedata) - [Synthetic mixed samples](#mixedsamples) - [Indexing](#indexing) @@ -28,89 +54,120 @@ with mixed samples*, Mäklin et al. 2020, in preparation. - [Phylogenetic inference](#phylogenetics) -### Reference data -Alternatively, download the reference data from **figshare (TODO:: ADD)** -- [*Escherichia coli*]() -- [*Enterococcus faecalis*]() -- [*Staphylococcus aureus*]() - -### Synthetic mixed samples -Download the reads from the ENA using the 'get_read.sh' script in the -scripts folder -``` -scripts/get_read.sh ERR434377 -scripts/get_read.sh ERR435484 -scripts/get_read.sh ERR905810 +### Select a species +Download the supplementary table from the mGEMS manucsript which +contains the relevant information ``` -Concatenate the read files together +## ADD ``` -zcat ERR434377_1.fastq.gz >> ecoli-1_1.fastq -zcat ERR434377_2.fastq.gz >> ecoli-1_2.fastq -zcat ERR435484_1.fastq.gz >> ecoli-1_1.fastq -zcat ERR435484_2.fastq.gz >> ecoli-1_2.fastq -zcat ERR905810_1.fastq.gz >> ecoli-1_1.fastq -zcat ERR905810_2.fastq.gz >> ecoli-1_2.fastq +Filter the table to contain only the *E. coli* (ecoli) experiments ``` -Compress the mixed sample +grep "ecoli" Supplementary_table_mixed_samples.tsv" > mixed_samples.tsv ``` -gzip ecoli-1_1.fastq -gzip ecoli-1_2.fastq -``` -Remove the original files +If you want to reproduce the *E. faecalis* experiments, change 'ecoli' +to 'efaec'. For *S. aureus*, change 'ecoli' to 'saur'. Running these +other two experiments may require resources beyond the typical laptop or +desktop computer. + +### Reference data +Download the relevant reference data from **figshare (TODO:: ADD)** +- [*E. coli*]() +- [*E. faecalis*]() +- [*S. aureus*]() + +by running ``` -rm ERR434377_1.fastq.gz -rm ERR434377_2.fastq.gz -rm ERR435484_1.fastq.gz -rm ERR435484_2.fastq.gz -rm ERR905810_1.fastq.gz -rm ERR905810_2.fastq.gz +## ADD ``` ### Indexing -Create a *31*-mer pseudoalignment index with Themisto using four +Create a *31*-mer pseudoalignment index with Themisto using two threads and maximum 8192 megabytes of RAM. ``` mkdir mGEMS-ecoli-reference mkdir mGEMS-ecoli-reference/tmp -build_index --k 31 --input-file mGEMS-ecoli-reference-v1.0.0.fasta --auto-colors --index-dir mGEMS-ecoli-reference --temp-dir mGEMS-ecoli-reference/tmp --mem-megas 8192 --n-threads 4 +build_index --k 31 --input-file mGEMS-ecoli-reference-v1.0.0.fasta --auto-colors --index-dir mGEMS-ecoli-reference --temp-dir mGEMS-ecoli-reference/tmp --mem-megas 8192 --n-threads 2 +``` +change 'ecoli' to 'efaec' or 'saur' if you are trying to reproduce the +other experiments. + +### Synthetic mixed samples +Download the isolate sequencing data and create the synthetic mixed +samples by concatenating the isolate files +``` +## Download the sequencing data and create the samples +oldid="" +while read line; do + id=$(echo $line | cut -f3 -d' ') + sample=$(echo $line | cut -f1 -d' ') + scripts/get_forward.sh $sample | gunzip -c >> $id""_1.fastq + scripts/get_reverse.sh $sample | gunzip -c >> $id""_2.fastq + if [[ "$id" != "$oldid" ]]; then + if [ ! -z "$oldid" -a "$oldid" != "" ]; then + gzip $oldid""_1.fastq + gzip $oldid""_2.fastq + fi + fi + oldid=$id +done < mixed_samples.tsv +gzip $oldid""_1.fastq +gzip $oldid""_2.fastq ``` ### Pseudoalignment -Align the mixed sample files against the index using four threads +Align the mixed sample files against the index using two threads ``` -pseudoalign --query-file ecoli-1_1.fastq.gz --outfile ecoli-1_1.aln --index-dir mGEMS-ecoli-reference --temp-dir mGEMS-ecoli-reference/tmp -n-threads 4 -pseudoalign --query-file ecoli-1_2.fastq.gz --outfile ecoli-1_2.aln --index-dir mGEMS-ecoli-reference --temp-dir mGEMS-ecoli-reference/tmp --n-threads 4 +for f1 in *_1.fastq.gz; do + f=${f1%_1.fastq.gz} + f2=$f""_2.fastq.gz + pseudoalign --query-file $f1 --outfile $f""_1.aln --index-dir mGEMS-ecoli-reference --temp-dir mGEMS-ecoli-reference/tmp --n-threads 2 --rc + pseudoalign --query-file $f2 --outfile $f""_2.aln --index-dir mGEMS-ecoli-reference --temp-dir mGEMS-ecoli-reference/tmp --n-threads 2 --rc + gzip $f""_1.aln + gzip $f""_2.aln +done ``` ### Abundance estimation Estimate the relative abundances with mSWEEP and write the results and posterior -probabilities using four threads +probabilities using two threads ``` -mSWEEP --themisto-1 ecoli-1_1.aln --themisto-2 ecoli-1_2.aln --themisto-index mGEMS-ecoli-reference -i mGEMS-ecoli-reference-v1.0.0.grouping -o ecoli-1 --write-probs --gzip-probs -t 4 +for f1 in *_1.fastq.gz; do + f=${f1%_1.fastq.gz} + mkdir $f + mSWEEP --themisto-1 $f""_1.aln.gz --themisto-2 $f""_2.aln.gz --themisto-index mGEMS-ecoli-reference -i mGEMS-ecoli-reference-v1.0.0.grouping -o $f/$f --write-probs --gzip-probs -t 2 +done ``` ### Binning Bin the reads with mGEMS and write the binned samples to the 'ecoli-1' folder. ``` -mkdir ecoli-1 -mGEMS -r ecoli-1_1.fastq.gz,ecoli-1_2.fastq.gz --themisto-alns ecoli-1_1.aln,ecoli-1_2.aln -o ecoli-1 --probs ecoli-1_probs.csv.gz -a ecoli-1_abundances.txt --index mGEMS-ecoli-reference -``` -Note this will create bins for **all** reference lineages. If know -which lineages the samples originate from (in our case Escherichia -coli ST131 A, B, and C2), use the '--groups' option to only create -those bins -``` -mGEMS --groups Escherichia_coli_ST131-A,Escherichia_coli_ST131-B,Escherichia_coli_ST131-C2 -r ecoli-1_1.fastq.gz,ecoli-1_2.fastq.gz --themisto-alns ecoli-1_1.aln,ecoli-1_2.aln -o ecoli-1 --probs ecoli-1_probs.csv.gz -a ecoli-1_abundances.txt --index mGEMS-ecoli-reference -``` +while read line; do + id=$(echo $line | cut -f3 -d' ') + cluster=$(echo $line | cut -f2 -d' ') + mGEMS --groups $cluster -r $id""_1.fastq.gz,$id""_2.fastq.gz --themisto-alns $id""_1.aln.gz,$id""_2.aln.gz -o $id --probs $id/$id""_probs.csv.gz -a $id/$id""_abundances.txt --index mGEMS-ecoli-reference +done < mixed_samples.tsv +``` +Note that by default mGEMS creates bins for **all** reference lineages. If know +which lineages the samples originate from (in our case these are +supplied in the mixed_samples.tsv in the second column), the +'--groups' option enables you to only create those bins. Multiple +groups can be binned in the single run by supplying them as a +comma-separated list. ### Assembly -Assemble the sequences with shovill using 4 threads and maximum of +Assemble the sequences with shovill using 2 threads and maximum of 8192 megabytes of RAM ``` -shovill --outdir ecoli-1/Escherichia_coli_ST131-A --R1 ecoli-1/Escherichia_coli_ST131-A_1.fastq.gz --R2 ecoli-1/Escherichia_coli_ST131-A_2.fastq.gz --cpus 4 --ram 8 -shovill --outdir ecoli-1/Escherichia_coli_ST131-B --R1 ecoli-1/Escherichia_coli_ST131-B_1.fastq.gz --R2 ecoli-1/Escherichia_coli_ST131-B_2.fastq.gz --cpus 4 --ram 8 -shovill --outdir ecoli-1/Escherichia_coli_ST131-C2 --R1 ecoli-1/Escherichia_coli_ST131-C2_1.fastq.gz --R2 ecoli-1/Escherichia_coli_ST131-C2_2.fastq.gz --cpus 4 --ram 8 +while read line; do + id=$(echo $line | cut -f3 -d' ') + cluster=$(echo $line | cut -f2 -d' ') + shovill --outdir $id/$cluster --R1 $id/$cluster""_1.fastq.gz --R2 $id/$cluster""_2.fastq.gz --cpus 2 --ram 8 + mv $id/$cluster/contigs.fa $id/ + rm -rf $id/$cluster + mkdir $id/$cluster + mv $id/contigs.fa $id/$cluster/ +done < mixed_samples.tsv ``` ### SNP calling @@ -121,33 +178,47 @@ wget -O NCTC13441.fasta.gz http://ftp.ebi.ac.uk/pub/databases/ena/wgs/public/uf/ Call SNPs in the genome with snippy ``` mkdir snippy-tmp -snippy --outdir ecoli-1-Escherichia_coli_ST131-A --ctgs ecoli-1/Escherichia_coli_ST131-A/contigs.fa --ref NCTC13441.fasta.gz --cpus 4 --ram 8 --tmpdir snippy-tmp -snippy --outdir ecoli-1-Escherichia_coli_ST131-B --ctgs ecoli-1/Escherichia_coli_ST131-B/contigs.fa --ref NCTC13441.fasta.gz --cpus 4 --ram 8 --tmpdir snippy-tmp -snippy --outdir ecoli-1-Escherichia_coli_ST131-C2 --ctgs ecoli-1/Escherichia_coli_ST131-C@/contigs.fa --ref NCTC13441.fasta.gz --cpus 4 --ram 8 --tmpdir snippy-tmp +gunzip NCTC13441.fasta.gz +while read line; do + id=$(echo $line | cut -f3 -d' ') + sample=$(echo $line | cut -f1 -d' ') + cluster=$(echo $line | cut -f2 -d' ') + snippy --outdir $id/$cluster/$sample --ctgs $id/$cluster/contigs.fa --ref NCTC13441.fasta --cpus 2 --ram 8 --tmpdir snippy-tmp +done < mixed_samples.tsv +gzip NCTC13441.fasta ``` Build the core SNP alignment with snippy ``` -snippy-core --ref ecoli-1-Escherichia_coli_ST131-A/ref.fa ecoli-1-Escherichia_coli_ST131-A ecoli-1-Escherichia_coli_ST131-B ecoli-1-Escherichia_coli_ST131-C2 -``` +snippys="" +while read line; do + id=$(echo $line | cut -f3 -d' ') + sample=$(echo $line | cut -f1 -d' ') + cluster=$(echo $line | cut -f2 -d' ') + snippys=$snippys""$id/$cluster/$sample" " + last=$id/$cluster/$sample +done < mixed_samples.tsv +snippy-core --ref $last/ref.fa $snippys +``` +the alignment will be stored in the 'core.full.aln' file. ### Phylogenetic inference -Use RAxML-NG to infer a maximum likelihood phylogeny from 100 starting +Use RAxML-NG to infer a maximum likelihood phylogeny from 10 starting parsimony and random trees under the GTR+G4 model ``` -raxml-ng --search --msa core.full.aln --prefix CT --threads 4 --tree rand{100},pars{100} --model GTR+G4 +raxml-ng --search --msa core.full.aln --prefix CT --threads 2 --tree rand{10},pars{10} --model GTR+G4 ``` -Calculate bootstrap support values with 1000 replicates +Calculate bootstrap support values with 100 replicates ``` -raxml-ng --bootstrap --msa core.full.aln.raxml.rba --bs-trees 1000 --prefix CB --threads 4 +raxml-ng --bootstrap --msa core.full.aln --bs-trees 100 --prefix CB --threads 2 --model GTR+G4 ``` Perform a bootstrap convergence check ``` -raxml-ng --bsconverge --bs-trees CB.raxml.bootstraps --prefix CS --seed 2 --threads 4 +raxml-ng --bsconverge --bs-trees CB.raxml.bootstraps --prefix CS --seed 2 --threads 2 ``` Add the bootstrap support values to the maximum likelihood tree with the best likelihood ``` -raxml-ng --support --tree CT.raxml.bestTree --bs-trees CB.raxml.bootstraps --prefix CS --threads 4 +raxml-ng --support --tree CT.raxml.bestTree --bs-trees CB.raxml.bootstraps --prefix CS --threads 2 ``` The best tree with the bootstrap values will be written in the -'CS.support' file. +'CS.raxml.support' file. diff --git a/docs/scripts/get_read.sh b/docs/scripts/get_forward.sh similarity index 85% rename from docs/scripts/get_read.sh rename to docs/scripts/get_forward.sh index c93d83d..ef82db6 100755 --- a/docs/scripts/get_read.sh +++ b/docs/scripts/get_forward.sh @@ -20,6 +20,5 @@ echo "check accession number" fi dlpath=$ftppath$dir1$dir2$1"/" -echo "downloading reads from path: "$dlpath -wget $dlpath"*" +wget -qO- $dlpath""$1_1.fastq.gz diff --git a/docs/scripts/get_reverse.sh b/docs/scripts/get_reverse.sh new file mode 100755 index 0000000..e694bef --- /dev/null +++ b/docs/scripts/get_reverse.sh @@ -0,0 +1,24 @@ +ftppath="ftp://ftp.sra.ebi.ac.uk/vol1/fastq" + +strlen=$((${#1})) +dir1="/"${1:0:6} + +if [ ${strlen} -lt 10 ] +then +dir2="/" +elif [ ${strlen} -lt 11 ] +then +dir2="/00"${1: -1}"/" +elif [ ${strlen} -lt 12 ] +then +dir2="/0"${1: -2}"/" +elif [ ${strlen} -lt 13 ] +then +dir2="/"${1: -3}"/" +else +echo "check accession number" +fi + +dlpath=$ftppath$dir1$dir2$1"/" + +wget -qO- $dlpath""$1_2.fastq.gz From 27af5856ceff819937cddd53793442ca6d250004 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Tommi=20M=C3=A4klin?= Date: Mon, 23 Mar 2020 12:50:44 +0200 Subject: [PATCH 3/7] add links to the mixed sample information and reference datasets --- docs/TUTORIAL.md | 21 ++++++++++++--------- 1 file changed, 12 insertions(+), 9 deletions(-) diff --git a/docs/TUTORIAL.md b/docs/TUTORIAL.md index c325e5f..28a400d 100644 --- a/docs/TUTORIAL.md +++ b/docs/TUTORIAL.md @@ -58,11 +58,11 @@ ulimit -n 2048 Download the supplementary table from the mGEMS manucsript which contains the relevant information ``` -## ADD +https://zenodo.org/record/3724144/files/mGEMS_Supplementary_Table_mixed_samples.tsv ``` Filter the table to contain only the *E. coli* (ecoli) experiments ``` -grep "ecoli" Supplementary_table_mixed_samples.tsv" > mixed_samples.tsv +grep "ecoli" mGEMS_Supplementary_Table_mixed_samples.tsv" > mixed_samples.tsv ``` If you want to reproduce the *E. faecalis* experiments, change 'ecoli' to 'efaec'. For *S. aureus*, change 'ecoli' to 'saur'. Running these @@ -70,15 +70,18 @@ other two experiments may require resources beyond the typical laptop or desktop computer. ### Reference data -Download the relevant reference data from **figshare (TODO:: ADD)** -- [*E. coli*]() -- [*E. faecalis*]() -- [*S. aureus*]() +Download and extract the relevant reference data from zenodo +- [*E. coli*](https://zenodo.org/record/3724112) +- [*E. faecalis*](https://zenodo.org/record/3724101) +- [*S. aureus*](https://zenodo.org/record/3724135) by running ``` -## ADD +wget https://zenodo.org/record/3724112/files/mGEMS-ecoli-reference-v1.0.0.tar.gz +tar -zxvf mGEMS-ecoli-reference-v1.0.0.tar.gz ``` +Construction of the reference dataset(s) is describe in more detail in +Mäklin et al. 2020. ### Indexing Create a *31*-mer pseudoalignment index with Themisto using two @@ -86,7 +89,7 @@ threads and maximum 8192 megabytes of RAM. ``` mkdir mGEMS-ecoli-reference mkdir mGEMS-ecoli-reference/tmp -build_index --k 31 --input-file mGEMS-ecoli-reference-v1.0.0.fasta --auto-colors --index-dir mGEMS-ecoli-reference --temp-dir mGEMS-ecoli-reference/tmp --mem-megas 8192 --n-threads 2 +build_index --k 31 --input-file mGEMS-ecoli-reference-sequences-v1.0.0.fasta.gz --auto-colors --index-dir mGEMS-ecoli-reference --temp-dir mGEMS-ecoli-reference/tmp --mem-megas 8192 --n-threads 2 ``` change 'ecoli' to 'efaec' or 'saur' if you are trying to reproduce the other experiments. @@ -134,7 +137,7 @@ probabilities using two threads for f1 in *_1.fastq.gz; do f=${f1%_1.fastq.gz} mkdir $f - mSWEEP --themisto-1 $f""_1.aln.gz --themisto-2 $f""_2.aln.gz --themisto-index mGEMS-ecoli-reference -i mGEMS-ecoli-reference-v1.0.0.grouping -o $f/$f --write-probs --gzip-probs -t 2 + mSWEEP --themisto-1 $f""_1.aln.gz --themisto-2 $f""_2.aln.gz --themisto-index mGEMS-ecoli-reference -i mGEMS-ecoli-reference-grouping-v1.0.0.txt -o $f/$f --write-probs --gzip-probs -t 2 done ``` From 5ea2b053dd96301f0ac2853aaef62852c94315a4 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Tommi=20M=C3=A4klin?= Date: Mon, 23 Mar 2020 12:54:29 +0200 Subject: [PATCH 4/7] clarify some things about the reference data --- docs/TUTORIAL.md | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/docs/TUTORIAL.md b/docs/TUTORIAL.md index 28a400d..9472e75 100644 --- a/docs/TUTORIAL.md +++ b/docs/TUTORIAL.md @@ -58,7 +58,7 @@ ulimit -n 2048 Download the supplementary table from the mGEMS manucsript which contains the relevant information ``` -https://zenodo.org/record/3724144/files/mGEMS_Supplementary_Table_mixed_samples.tsv +wget https://zenodo.org/record/3724144/files/mGEMS_Supplementary_Table_mixed_samples.tsv ``` Filter the table to contain only the *E. coli* (ecoli) experiments ``` @@ -70,18 +70,18 @@ other two experiments may require resources beyond the typical laptop or desktop computer. ### Reference data -Download and extract the relevant reference data from zenodo +The reference data from Mäklin et al. is available from zenodo - [*E. coli*](https://zenodo.org/record/3724112) - [*E. faecalis*](https://zenodo.org/record/3724101) - [*S. aureus*](https://zenodo.org/record/3724135) +Construction of the reference dataset(s) is describe in more detail in +Mäklin et al. 2020. -by running +Download and extract the *E. coli* dataset by running ``` wget https://zenodo.org/record/3724112/files/mGEMS-ecoli-reference-v1.0.0.tar.gz tar -zxvf mGEMS-ecoli-reference-v1.0.0.tar.gz ``` -Construction of the reference dataset(s) is describe in more detail in -Mäklin et al. 2020. ### Indexing Create a *31*-mer pseudoalignment index with Themisto using two From 2d19d82d33440bf4140788aeda1f64c40c333898 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Tommi=20M=C3=A4klin?= Date: Mon, 23 Mar 2020 12:55:04 +0200 Subject: [PATCH 5/7] fix a typo --- docs/TUTORIAL.md | 1 + 1 file changed, 1 insertion(+) diff --git a/docs/TUTORIAL.md b/docs/TUTORIAL.md index 9472e75..1a9cced 100644 --- a/docs/TUTORIAL.md +++ b/docs/TUTORIAL.md @@ -74,6 +74,7 @@ The reference data from Mäklin et al. is available from zenodo - [*E. coli*](https://zenodo.org/record/3724112) - [*E. faecalis*](https://zenodo.org/record/3724101) - [*S. aureus*](https://zenodo.org/record/3724135) + Construction of the reference dataset(s) is describe in more detail in Mäklin et al. 2020. From 060152a6a45c87ae887b9e3c1c60305ed0bd2920 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Tommi=20M=C3=A4klin?= Date: Mon, 23 Mar 2020 12:59:36 +0200 Subject: [PATCH 6/7] add a note about the tutorial --- README.md | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/README.md b/README.md index 708338d..481112c 100644 --- a/README.md +++ b/README.md @@ -40,7 +40,12 @@ mGEMS bin and mGEMS extract, which bin the reads in the input pseudoalignment (mGEMS bin) and extract the binned reads from the original mixed samples (mGEMS extract). -### (Pseudo)tutorial — Full pipeline with Themisto and mSWEEP +### Tutorial — E. coli ST131 sublineages +A tutorial for reproducing the *E. coli* ST131 sublineage phylogenetic +tree presented in Mäklin et al. 2020 using mGEMS is available in the +[docs folder of this repository](docs/TUTORIAL.md). + +### Quickstart — full pipeline Build a [Themisto](https://github.com/jnalanko/themisto) index to align against. ``` From 3aec6d9dd63555b88d951e8807c55c84bc807cd7 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Tommi=20M=C3=A4klin?= Date: Mon, 30 Mar 2020 13:13:49 +0300 Subject: [PATCH 7/7] add option to filter target groups by minimum abundance threshold --- README.md | 6 +++--- src/main.cpp | 15 +++++++++++++++ 2 files changed, 18 insertions(+), 3 deletions(-) diff --git a/README.md b/README.md index 55fdcd3..a99fbbf 100644 --- a/README.md +++ b/README.md @@ -107,9 +107,9 @@ mGEMS accepts the following input flags -a Relative abundance estimates from mSWEEP (tab-separated, 1st column has the group names and 2nd column the estimates). --index Themisto pseudoalignment index directory. - --groups (Optional) which groups to extract from the input reads. - --compress (Optional) Toggle compressing the output files - (default: compress) + --groups (Optional) Which groups to extract from the input reads. + --min-abundance (Optional) Extract only groups that have a relative abundance higher than this value. + --compress (Optional) Toggle compressing the output files (default: compress) ``` diff --git a/src/main.cpp b/src/main.cpp index eb53541..9245a0d 100644 --- a/src/main.cpp +++ b/src/main.cpp @@ -39,9 +39,11 @@ void ParseBin(int argc, char* argv[], cxxargs::Arguments &args) { args.add_long_argument("probs", "Posterior probabilities from mSWEEP."); args.add_long_argument("merge-mode", "How to merge paired-end alignments from Themisto (default: intersection).", "intersection"); args.add_long_argument>("groups", "Which reference groups to bin reads to (default: all)."); + args.add_long_argument("min-abundance", "Bin only the groups that have a relative abundance higher than this value (optional)."); args.add_short_argument('q', "Tuning parameter for the binning thresholds (default: 1.0).", (long double)1); args.add_long_argument("index", "Themisto pseudoalignment index directory."); args.set_not_required("groups"); + args.set_not_required("min-abundance"); args.parse(argc, argv); } @@ -91,6 +93,15 @@ void ReadAndExtract(cxxargs::Arguments &args) { Extract(bins, target_groups, args); } +void FilterTargetGroups(const std::vector &group_names, const std::vector &abundances, const long double min_abundance, std::vector *target_groups) { + uint32_t n_groups = group_names.size(); + for (uint32_t i = 0; i < n_groups; ++i) { + if (abundances[i] < min_abundance && std::find(target_groups->begin(), target_groups->end(), group_names[i]) != target_groups->end()) { + target_groups->erase(std::find(target_groups->begin(), target_groups->end(), group_names[i])); + } + } +} + void Bin(const cxxargs::Arguments &args, bool extract_bins) { DIR* dir = opendir(args.value('o').c_str()); if (dir) { @@ -128,6 +139,10 @@ void Bin(const cxxargs::Arguments &args, bool extract_bins) { } else { target_groups = groups; } + if (args.is_initialized("min-abundance")) { + FilterTargetGroups(groups, abundances, args.value("min-abundance"), &target_groups); + } + const std::vector> &bins = mGEMS::Bin(aln, args.value('q'), abundances, groups, probs_file.stream(), &target_groups); if (!extract_bins) { uint32_t n_bins = bins.size();