From 6643f70b424b2afcded3553c546b9ef7dd6a581f Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Tommi=20M=C3=A4klin?= <tommi@maklin.fi>
Date: Thu, 19 Mar 2020 14:29:16 +0200
Subject: [PATCH 1/7] add a tutorial with example commands for running single
 steps of the pipeline

---
 docs/TUTORIAL.md         | 153 +++++++++++++++++++++++++++++++++++++++
 docs/scripts/get_read.sh |  25 +++++++
 2 files changed, 178 insertions(+)
 create mode 100644 docs/TUTORIAL.md
 create mode 100755 docs/scripts/get_read.sh
diff --git a/docs/TUTORIAL.md b/docs/TUTORIAL.md
new file mode 100644
index 0000000..dcedd3d
--- /dev/null
+++ b/docs/TUTORIAL.md
@@ -0,0 +1,153 @@
+# Genomic epidemiology with mixed samples: a tutorial
+This tutorial constains instructions on how to repproduce the results
+of the three main synthetic experiments from *Genomic epidemiology
+with mixed samples*, Mäklin et al. 2020, in preparation.
+
+## Requirements
+### mGEMS pipeline
+- [Themisto](https://github.com/jnalanko/Themisto)
+- [mSWEEP](https://github.com/probic/mSWEEP)
+- [mGEMS](https://github.com/probic/mGEMS)
+- [shovill](https://github.com/tseemann/shovill/)
+
+### Phylogenetic analysis
+- [snippy](https://github.com/tseemann/snippy/)
+- [RAxML-NG](https://github.com/amkozlov/raxml-ng)
+
+## Tutorial
+### Table of Contents
+
+- [Reference data](#referencedata)
+- [Synthetic mixed samples](#mixedsamples)
+- [Indexing](#indexing)
+- [Pseudoalignment](#pseudoalignment)
+- [Abundance estimation](#estimation)
+- [Binning](#binning)
+- [Assembly](#assembly)
+- [SNP calling](#snpcalling)
+- [Phylogenetic inference](#phylogenetics)
+
+
+### <a name="referencedata"></a>Reference data
+Alternatively, download the reference data from **figshare (TODO:: ADD)**
+- [*Escherichia coli*]()
+- [*Enterococcus faecalis*]()
+- [*Staphylococcus aureus*]()
+
+### <a name="mixedsamples"></a>Synthetic mixed samples
+Download the reads from the ENA using the 'get_read.sh' script in the
+scripts folder
+```
+scripts/get_read.sh ERR434377
+scripts/get_read.sh ERR435484
+scripts/get_read.sh ERR905810
+```
+Concatenate the read files together
+```
+zcat ERR434377_1.fastq.gz >> ecoli-1_1.fastq
+zcat ERR434377_2.fastq.gz >> ecoli-1_2.fastq
+zcat ERR435484_1.fastq.gz >> ecoli-1_1.fastq
+zcat ERR435484_2.fastq.gz >> ecoli-1_2.fastq
+zcat ERR905810_1.fastq.gz >> ecoli-1_1.fastq
+zcat ERR905810_2.fastq.gz >> ecoli-1_2.fastq
+```
+Compress the mixed sample
+```
+gzip ecoli-1_1.fastq
+gzip ecoli-1_2.fastq
+```
+Remove the original files
+```
+rm ERR434377_1.fastq.gz
+rm ERR434377_2.fastq.gz
+rm ERR435484_1.fastq.gz
+rm ERR435484_2.fastq.gz
+rm ERR905810_1.fastq.gz
+rm ERR905810_2.fastq.gz
+```
+
+### <a name="indexing"></a>Indexing
+Create a *31*-mer pseudoalignment index with Themisto using four
+threads and maximum 8192 megabytes of RAM.
+```
+mkdir mGEMS-ecoli-reference
+mkdir mGEMS-ecoli-reference/tmp
+build_index --k 31 --input-file mGEMS-ecoli-reference-v1.0.0.fasta --auto-colors --index-dir mGEMS-ecoli-reference --temp-dir mGEMS-ecoli-reference/tmp --mem-megas 8192 --n-threads 4
+```
+
+### <a name="pseudoalignment"></a>Pseudoalignment
+Align the mixed sample files against the index using four threads
+```
+pseudoalign --query-file ecoli-1_1.fastq.gz --outfile ecoli-1_1.aln --index-dir mGEMS-ecoli-reference --temp-dir mGEMS-ecoli-reference/tmp -n-threads 4
+pseudoalign --query-file ecoli-1_2.fastq.gz --outfile ecoli-1_2.aln --index-dir mGEMS-ecoli-reference --temp-dir mGEMS-ecoli-reference/tmp --n-threads 4 
+```
+
+### <a name="estimation"></a>Abundance estimation
+Estimate the relative abundances with mSWEEP and write the results and posterior
+probabilities using four threads
+```
+mSWEEP --themisto-1 ecoli-1_1.aln --themisto-2 ecoli-1_2.aln --themisto-index mGEMS-ecoli-reference -i mGEMS-ecoli-reference-v1.0.0.grouping -o ecoli-1 --write-probs --gzip-probs -t 4
+```
+
+### <a name="binning"></a>Binning
+Bin the reads with mGEMS and write the binned samples to the
+'ecoli-1' folder.
+```
+mkdir ecoli-1
+mGEMS -r ecoli-1_1.fastq.gz,ecoli-1_2.fastq.gz --themisto-alns ecoli-1_1.aln,ecoli-1_2.aln -o ecoli-1 --probs ecoli-1_probs.csv.gz -a ecoli-1_abundances.txt --index mGEMS-ecoli-reference
+```
+Note this will create bins for **all** reference lineages. If know
+which lineages the samples originate from (in our case Escherichia
+coli ST131 A, B, and C2), use the '--groups' option to only create
+those bins
+```
+mGEMS --groups Escherichia_coli_ST131-A,Escherichia_coli_ST131-B,Escherichia_coli_ST131-C2 -r ecoli-1_1.fastq.gz,ecoli-1_2.fastq.gz --themisto-alns ecoli-1_1.aln,ecoli-1_2.aln -o ecoli-1 --probs ecoli-1_probs.csv.gz -a ecoli-1_abundances.txt --index mGEMS-ecoli-reference
+```
+
+### <a name="assembly"></a>Assembly
+Assemble the sequences with shovill using 4 threads and maximum of
+8192 megabytes of RAM
+```
+shovill --outdir ecoli-1/Escherichia_coli_ST131-A --R1 ecoli-1/Escherichia_coli_ST131-A_1.fastq.gz --R2 ecoli-1/Escherichia_coli_ST131-A_2.fastq.gz --cpus 4 --ram 8
+shovill --outdir ecoli-1/Escherichia_coli_ST131-B --R1 ecoli-1/Escherichia_coli_ST131-B_1.fastq.gz --R2 ecoli-1/Escherichia_coli_ST131-B_2.fastq.gz --cpus 4 --ram 8
+shovill --outdir ecoli-1/Escherichia_coli_ST131-C2 --R1 ecoli-1/Escherichia_coli_ST131-C2_1.fastq.gz --R2 ecoli-1/Escherichia_coli_ST131-C2_2.fastq.gz --cpus 4 --ram 8
+```
+
+### <a name="snpcalling"></a>SNP calling
+Download the reference sequence 'NCTC13441' from the ENA
+```
+wget -O NCTC13441.fasta.gz http://ftp.ebi.ac.uk/pub/databases/ena/wgs/public/uf/UFZF01.fasta.gz
+```
+Call SNPs in the genome with snippy
+```
+mkdir snippy-tmp
+snippy --outdir ecoli-1-Escherichia_coli_ST131-A --ctgs ecoli-1/Escherichia_coli_ST131-A/contigs.fa --ref NCTC13441.fasta.gz --cpus 4 --ram 8 --tmpdir snippy-tmp
+snippy --outdir ecoli-1-Escherichia_coli_ST131-B --ctgs ecoli-1/Escherichia_coli_ST131-B/contigs.fa --ref NCTC13441.fasta.gz --cpus 4 --ram 8 --tmpdir snippy-tmp
+snippy --outdir ecoli-1-Escherichia_coli_ST131-C2 --ctgs ecoli-1/Escherichia_coli_ST131-C@/contigs.fa --ref NCTC13441.fasta.gz --cpus 4 --ram 8 --tmpdir snippy-tmp
+```
+Build the core SNP alignment with snippy
+```
+snippy-core --ref ecoli-1-Escherichia_coli_ST131-A/ref.fa ecoli-1-Escherichia_coli_ST131-A ecoli-1-Escherichia_coli_ST131-B ecoli-1-Escherichia_coli_ST131-C2
+```
+
+### <a name="phylogenetics"></a>Phylogenetic inference
+Use RAxML-NG to infer a maximum likelihood phylogeny from 100 starting
+parsimony and random trees under the GTR+G4 model
+```
+raxml-ng --search --msa core.full.aln --prefix CT --threads 4 --tree rand{100},pars{100} --model GTR+G4
+```
+Calculate bootstrap support values with 1000 replicates
+```
+raxml-ng --bootstrap --msa core.full.aln.raxml.rba --bs-trees 1000 --prefix CB --threads 4
+```
+Perform a bootstrap convergence check
+```
+raxml-ng --bsconverge --bs-trees CB.raxml.bootstraps --prefix CS --seed 2 --threads 4
+```
+Add the bootstrap support values to the maximum likelihood tree with
+the best likelihood
+```
+raxml-ng --support --tree CT.raxml.bestTree --bs-trees CB.raxml.bootstraps --prefix CS --threads 4
+```
+The best tree with the bootstrap values will be written in the
+'CS.support' file.
diff --git a/docs/scripts/get_read.sh b/docs/scripts/get_read.sh
new file mode 100755
index 0000000..c93d83d
--- /dev/null
+++ b/docs/scripts/get_read.sh
@@ -0,0 +1,25 @@
+ftppath="ftp://ftp.sra.ebi.ac.uk/vol1/fastq"
+
+strlen=$((${#1}))
+dir1="/"${1:0:6}
+
+if [ ${strlen} -lt 10 ]
+then
+dir2="/"
+elif [ ${strlen} -lt 11 ]
+then
+dir2="/00"${1: -1}"/"
+elif [ ${strlen} -lt 12 ]
+then
+dir2="/0"${1: -2}"/"
+elif [ ${strlen} -lt 13 ]
+then
+dir2="/"${1: -3}"/"
+else
+echo "check accession number"
+fi
+
+dlpath=$ftppath$dir1$dir2$1"/"
+echo "downloading reads from path: "$dlpath
+
+wget $dlpath"*"

From 1c75b801b462ff05f3a4e2d3b85cbcd7aab5555b Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Tommi=20M=C3=A4klin?= <tommi@maklin.fi>
Date: Sun, 22 Mar 2020 10:59:46 +0200
Subject: [PATCH 2/7] change tutorial to show how to reproduce the paper trees
 and samples

---
 docs/TUTORIAL.md                             | 199 +++++++++++++------
 docs/scripts/{get_read.sh => get_forward.sh} |   3 +-
 docs/scripts/get_reverse.sh                  |  24 +++
 3 files changed, 160 insertions(+), 66 deletions(-)
 rename docs/scripts/{get_read.sh => get_forward.sh} (85%)
 create mode 100755 docs/scripts/get_reverse.sh

diff --git a/docs/TUTORIAL.md b/docs/TUTORIAL.md
index dcedd3d..c325e5f 100644
--- a/docs/TUTORIAL.md
+++ b/docs/TUTORIAL.md
@@ -3,6 +3,14 @@ This tutorial constains instructions on how to repproduce the results
 of the three main synthetic experiments from *Genomic epidemiology
 with mixed samples*, Mäklin et al. 2020, in preparation.
 
+The tutorial will focus on reproducing the *Escherichia coli*
+experiment but contains instructions on how to adapt the scripts to
+the *Enterococcus faecalis* and *Staphylococcus aureus* experiments.
+
+For quick instructions on how to run the pipeline in a general
+setting, please refer to the README.md file in the root of this
+repository.
+
 ## Requirements
 ### mGEMS pipeline
 - [Themisto](https://github.com/jnalanko/Themisto)
@@ -14,9 +22,27 @@ with mixed samples*, Mäklin et al. 2020, in preparation.
 - [snippy](https://github.com/tseemann/snippy/)
 - [RAxML-NG](https://github.com/amkozlov/raxml-ng)
 
+### Extra (macOS only)
+#### GNU coreutils
+On a macOS system, you'll also need to install GNU coreutils from
+homebrew and alias the macOS zcat command to the GNU zcat command for
+the duration of the session
+```
+brew install coreutils
+alias zcat=gzcat
+ulimit -n 2048
+```
+#### Concurrent file connections limit
+macOS also limits the number of concurrent file connections, which
+will have to be increased to run Themisto and shovill
+```
+ulimit -n 2048
+```
+
 ## Tutorial
 ### Table of Contents
 
+- [Select a species](#selectspecies)
 - [Reference data](#referencedata)
 - [Synthetic mixed samples](#mixedsamples)
 - [Indexing](#indexing)
@@ -28,89 +54,120 @@ with mixed samples*, Mäklin et al. 2020, in preparation.
 - [Phylogenetic inference](#phylogenetics)
 
 
-### <a name="referencedata"></a>Reference data
-Alternatively, download the reference data from **figshare (TODO:: ADD)**
-- [*Escherichia coli*]()
-- [*Enterococcus faecalis*]()
-- [*Staphylococcus aureus*]()
-
-### <a name="mixedsamples"></a>Synthetic mixed samples
-Download the reads from the ENA using the 'get_read.sh' script in the
-scripts folder
-```
-scripts/get_read.sh ERR434377
-scripts/get_read.sh ERR435484
-scripts/get_read.sh ERR905810
+### <a name="selectspecies"></a>Select a species
+Download the supplementary table from the mGEMS manucsript which
+contains the relevant information
 ```
-Concatenate the read files together
+## ADD
 ```
-zcat ERR434377_1.fastq.gz >> ecoli-1_1.fastq
-zcat ERR434377_2.fastq.gz >> ecoli-1_2.fastq
-zcat ERR435484_1.fastq.gz >> ecoli-1_1.fastq
-zcat ERR435484_2.fastq.gz >> ecoli-1_2.fastq
-zcat ERR905810_1.fastq.gz >> ecoli-1_1.fastq
-zcat ERR905810_2.fastq.gz >> ecoli-1_2.fastq
+Filter the table to contain only the *E. coli* (ecoli) experiments
 ```
-Compress the mixed sample
+grep "ecoli" Supplementary_table_mixed_samples.tsv" > mixed_samples.tsv
 ```
-gzip ecoli-1_1.fastq
-gzip ecoli-1_2.fastq
-```
-Remove the original files
+If you want to reproduce the *E. faecalis* experiments, change 'ecoli'
+to 'efaec'. For *S. aureus*, change 'ecoli' to 'saur'. Running these
+other two experiments may require resources beyond the typical laptop or
+desktop computer.
+
+### <a name="referencedata"></a>Reference data
+Download the relevant reference data from **figshare (TODO:: ADD)**
+- [*E. coli*]()
+- [*E. faecalis*]()
+- [*S. aureus*]()
+
+by running
 ```
-rm ERR434377_1.fastq.gz
-rm ERR434377_2.fastq.gz
-rm ERR435484_1.fastq.gz
-rm ERR435484_2.fastq.gz
-rm ERR905810_1.fastq.gz
-rm ERR905810_2.fastq.gz
+## ADD
 ```
 
 ### <a name="indexing"></a>Indexing
-Create a *31*-mer pseudoalignment index with Themisto using four
+Create a *31*-mer pseudoalignment index with Themisto using two
 threads and maximum 8192 megabytes of RAM.
 ```
 mkdir mGEMS-ecoli-reference
 mkdir mGEMS-ecoli-reference/tmp
-build_index --k 31 --input-file mGEMS-ecoli-reference-v1.0.0.fasta --auto-colors --index-dir mGEMS-ecoli-reference --temp-dir mGEMS-ecoli-reference/tmp --mem-megas 8192 --n-threads 4
+build_index --k 31 --input-file mGEMS-ecoli-reference-v1.0.0.fasta --auto-colors --index-dir mGEMS-ecoli-reference --temp-dir mGEMS-ecoli-reference/tmp --mem-megas 8192 --n-threads 2
+```
+change 'ecoli' to 'efaec' or 'saur' if you are trying to reproduce the
+other experiments.
+
+### <a name="mixedsamples"></a>Synthetic mixed samples
+Download the isolate sequencing data and create the synthetic mixed
+samples by concatenating the isolate files
+```
+## Download the sequencing data and create the samples
+oldid=""
+while read line; do
+	id=$(echo $line | cut -f3 -d' ')
+	sample=$(echo $line | cut -f1 -d' ')
+	scripts/get_forward.sh $sample | gunzip -c >> $id""_1.fastq
+	scripts/get_reverse.sh $sample | gunzip -c >> $id""_2.fastq
+	if [[ "$id" != "$oldid" ]]; then
+		if [ ! -z "$oldid" -a "$oldid" != "" ]; then
+			gzip $oldid""_1.fastq
+			gzip $oldid""_2.fastq
+		fi
+	fi
+	oldid=$id
+done < mixed_samples.tsv
+gzip $oldid""_1.fastq
+gzip $oldid""_2.fastq
 ```
 
 ### <a name="pseudoalignment"></a>Pseudoalignment
-Align the mixed sample files against the index using four threads
+Align the mixed sample files against the index using two threads
 ```
-pseudoalign --query-file ecoli-1_1.fastq.gz --outfile ecoli-1_1.aln --index-dir mGEMS-ecoli-reference --temp-dir mGEMS-ecoli-reference/tmp -n-threads 4
-pseudoalign --query-file ecoli-1_2.fastq.gz --outfile ecoli-1_2.aln --index-dir mGEMS-ecoli-reference --temp-dir mGEMS-ecoli-reference/tmp --n-threads 4 
+for f1 in *_1.fastq.gz; do
+	f=${f1%_1.fastq.gz}
+	f2=$f""_2.fastq.gz
+	pseudoalign --query-file $f1 --outfile $f""_1.aln --index-dir mGEMS-ecoli-reference --temp-dir mGEMS-ecoli-reference/tmp --n-threads 2 --rc
+	pseudoalign --query-file $f2 --outfile $f""_2.aln --index-dir mGEMS-ecoli-reference --temp-dir mGEMS-ecoli-reference/tmp --n-threads 2 --rc
+	gzip $f""_1.aln
+	gzip $f""_2.aln
+done
 ```
 
 ### <a name="estimation"></a>Abundance estimation
 Estimate the relative abundances with mSWEEP and write the results and posterior
-probabilities using four threads
+probabilities using two threads
 ```
-mSWEEP --themisto-1 ecoli-1_1.aln --themisto-2 ecoli-1_2.aln --themisto-index mGEMS-ecoli-reference -i mGEMS-ecoli-reference-v1.0.0.grouping -o ecoli-1 --write-probs --gzip-probs -t 4
+for f1 in *_1.fastq.gz; do
+	f=${f1%_1.fastq.gz}
+	mkdir $f
+	mSWEEP --themisto-1 $f""_1.aln.gz --themisto-2 $f""_2.aln.gz --themisto-index mGEMS-ecoli-reference -i mGEMS-ecoli-reference-v1.0.0.grouping -o $f/$f --write-probs --gzip-probs -t 2
+done
 ```
 
 ### <a name="binning"></a>Binning
 Bin the reads with mGEMS and write the binned samples to the
 'ecoli-1' folder.
 ```
-mkdir ecoli-1
-mGEMS -r ecoli-1_1.fastq.gz,ecoli-1_2.fastq.gz --themisto-alns ecoli-1_1.aln,ecoli-1_2.aln -o ecoli-1 --probs ecoli-1_probs.csv.gz -a ecoli-1_abundances.txt --index mGEMS-ecoli-reference
-```
-Note this will create bins for **all** reference lineages. If know
-which lineages the samples originate from (in our case Escherichia
-coli ST131 A, B, and C2), use the '--groups' option to only create
-those bins
-```
-mGEMS --groups Escherichia_coli_ST131-A,Escherichia_coli_ST131-B,Escherichia_coli_ST131-C2 -r ecoli-1_1.fastq.gz,ecoli-1_2.fastq.gz --themisto-alns ecoli-1_1.aln,ecoli-1_2.aln -o ecoli-1 --probs ecoli-1_probs.csv.gz -a ecoli-1_abundances.txt --index mGEMS-ecoli-reference
-```
+while read line; do
+	id=$(echo $line | cut -f3 -d' ')
+	cluster=$(echo $line | cut -f2 -d' ')
+	mGEMS --groups $cluster -r $id""_1.fastq.gz,$id""_2.fastq.gz --themisto-alns $id""_1.aln.gz,$id""_2.aln.gz -o $id --probs $id/$id""_probs.csv.gz -a $id/$id""_abundances.txt --index mGEMS-ecoli-reference
+done < mixed_samples.tsv
+```
+Note that by default mGEMS creates bins for **all** reference lineages. If know
+which lineages the samples originate from (in our case these are
+supplied in the mixed_samples.tsv in the second column), the
+'--groups' option enables you to only create those bins. Multiple
+groups can be binned in the single run by supplying them as a
+comma-separated list.
 
 ### <a name="assembly"></a>Assembly
-Assemble the sequences with shovill using 4 threads and maximum of
+Assemble the sequences with shovill using 2 threads and maximum of
 8192 megabytes of RAM
 ```
-shovill --outdir ecoli-1/Escherichia_coli_ST131-A --R1 ecoli-1/Escherichia_coli_ST131-A_1.fastq.gz --R2 ecoli-1/Escherichia_coli_ST131-A_2.fastq.gz --cpus 4 --ram 8
-shovill --outdir ecoli-1/Escherichia_coli_ST131-B --R1 ecoli-1/Escherichia_coli_ST131-B_1.fastq.gz --R2 ecoli-1/Escherichia_coli_ST131-B_2.fastq.gz --cpus 4 --ram 8
-shovill --outdir ecoli-1/Escherichia_coli_ST131-C2 --R1 ecoli-1/Escherichia_coli_ST131-C2_1.fastq.gz --R2 ecoli-1/Escherichia_coli_ST131-C2_2.fastq.gz --cpus 4 --ram 8
+while read line; do
+	id=$(echo $line | cut -f3 -d' ')
+	cluster=$(echo $line | cut -f2 -d' ')
+	shovill --outdir $id/$cluster --R1 $id/$cluster""_1.fastq.gz --R2 $id/$cluster""_2.fastq.gz --cpus 2 --ram 8
+	mv $id/$cluster/contigs.fa $id/
+	rm -rf $id/$cluster
+	mkdir $id/$cluster
+	mv $id/contigs.fa $id/$cluster/
+done < mixed_samples.tsv
 ```
 
 ### <a name="snpcalling"></a>SNP calling
@@ -121,33 +178,47 @@ wget -O NCTC13441.fasta.gz http://ftp.ebi.ac.uk/pub/databases/ena/wgs/public/uf/
 Call SNPs in the genome with snippy
 ```
 mkdir snippy-tmp
-snippy --outdir ecoli-1-Escherichia_coli_ST131-A --ctgs ecoli-1/Escherichia_coli_ST131-A/contigs.fa --ref NCTC13441.fasta.gz --cpus 4 --ram 8 --tmpdir snippy-tmp
-snippy --outdir ecoli-1-Escherichia_coli_ST131-B --ctgs ecoli-1/Escherichia_coli_ST131-B/contigs.fa --ref NCTC13441.fasta.gz --cpus 4 --ram 8 --tmpdir snippy-tmp
-snippy --outdir ecoli-1-Escherichia_coli_ST131-C2 --ctgs ecoli-1/Escherichia_coli_ST131-C@/contigs.fa --ref NCTC13441.fasta.gz --cpus 4 --ram 8 --tmpdir snippy-tmp
+gunzip NCTC13441.fasta.gz
+while read line; do
+	id=$(echo $line | cut -f3 -d' ')
+	sample=$(echo $line | cut -f1 -d' ')
+	cluster=$(echo $line | cut -f2 -d' ')
+	snippy --outdir $id/$cluster/$sample --ctgs $id/$cluster/contigs.fa --ref NCTC13441.fasta --cpus 2 --ram 8 --tmpdir snippy-tmp
+done < mixed_samples.tsv
+gzip NCTC13441.fasta
 ```
 Build the core SNP alignment with snippy
 ```
-snippy-core --ref ecoli-1-Escherichia_coli_ST131-A/ref.fa ecoli-1-Escherichia_coli_ST131-A ecoli-1-Escherichia_coli_ST131-B ecoli-1-Escherichia_coli_ST131-C2
-```
+snippys=""
+while read line; do
+	id=$(echo $line | cut -f3 -d' ')
+	sample=$(echo $line | cut -f1 -d' ')
+	cluster=$(echo $line | cut -f2 -d' ')
+	snippys=$snippys""$id/$cluster/$sample" "
+	last=$id/$cluster/$sample
+done < mixed_samples.tsv
+snippy-core --ref $last/ref.fa $snippys
+```
+the alignment will be stored in the 'core.full.aln' file.
 
 ### <a name="phylogenetics"></a>Phylogenetic inference
-Use RAxML-NG to infer a maximum likelihood phylogeny from 100 starting
+Use RAxML-NG to infer a maximum likelihood phylogeny from 10 starting
 parsimony and random trees under the GTR+G4 model
 ```
-raxml-ng --search --msa core.full.aln --prefix CT --threads 4 --tree rand{100},pars{100} --model GTR+G4
+raxml-ng --search --msa core.full.aln --prefix CT --threads 2 --tree rand{10},pars{10} --model GTR+G4
 ```
-Calculate bootstrap support values with 1000 replicates
+Calculate bootstrap support values with 100 replicates
 ```
-raxml-ng --bootstrap --msa core.full.aln.raxml.rba --bs-trees 1000 --prefix CB --threads 4
+raxml-ng --bootstrap --msa core.full.aln --bs-trees 100 --prefix CB --threads 2 --model GTR+G4
 ```
 Perform a bootstrap convergence check
 ```
-raxml-ng --bsconverge --bs-trees CB.raxml.bootstraps --prefix CS --seed 2 --threads 4
+raxml-ng --bsconverge --bs-trees CB.raxml.bootstraps --prefix CS --seed 2 --threads 2
 ```
 Add the bootstrap support values to the maximum likelihood tree with
 the best likelihood
 ```
-raxml-ng --support --tree CT.raxml.bestTree --bs-trees CB.raxml.bootstraps --prefix CS --threads 4
+raxml-ng --support --tree CT.raxml.bestTree --bs-trees CB.raxml.bootstraps --prefix CS --threads 2
 ```
 The best tree with the bootstrap values will be written in the
-'CS.support' file.
+'CS.raxml.support' file.
diff --git a/docs/scripts/get_read.sh b/docs/scripts/get_forward.sh
similarity index 85%
rename from docs/scripts/get_read.sh
rename to docs/scripts/get_forward.sh
index c93d83d..ef82db6 100755
--- a/docs/scripts/get_read.sh
+++ b/docs/scripts/get_forward.sh
@@ -20,6 +20,5 @@ echo "check accession number"
 fi
 
 dlpath=$ftppath$dir1$dir2$1"/"
-echo "downloading reads from path: "$dlpath
 
-wget $dlpath"*"
+wget -qO- $dlpath""$1_1.fastq.gz
diff --git a/docs/scripts/get_reverse.sh b/docs/scripts/get_reverse.sh
new file mode 100755
index 0000000..e694bef
--- /dev/null
+++ b/docs/scripts/get_reverse.sh
@@ -0,0 +1,24 @@
+ftppath="ftp://ftp.sra.ebi.ac.uk/vol1/fastq"
+
+strlen=$((${#1}))
+dir1="/"${1:0:6}
+
+if [ ${strlen} -lt 10 ]
+then
+dir2="/"
+elif [ ${strlen} -lt 11 ]
+then
+dir2="/00"${1: -1}"/"
+elif [ ${strlen} -lt 12 ]
+then
+dir2="/0"${1: -2}"/"
+elif [ ${strlen} -lt 13 ]
+then
+dir2="/"${1: -3}"/"
+else
+echo "check accession number"
+fi
+
+dlpath=$ftppath$dir1$dir2$1"/"
+
+wget -qO- $dlpath""$1_2.fastq.gz

From 27af5856ceff819937cddd53793442ca6d250004 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Tommi=20M=C3=A4klin?= <tommi@maklin.fi>
Date: Mon, 23 Mar 2020 12:50:44 +0200
Subject: [PATCH 3/7] add links to the mixed sample information and reference
 datasets

---
 docs/TUTORIAL.md | 21 ++++++++++++---------
 1 file changed, 12 insertions(+), 9 deletions(-)

diff --git a/docs/TUTORIAL.md b/docs/TUTORIAL.md
index c325e5f..28a400d 100644
--- a/docs/TUTORIAL.md
+++ b/docs/TUTORIAL.md
@@ -58,11 +58,11 @@ ulimit -n 2048
 Download the supplementary table from the mGEMS manucsript which
 contains the relevant information
 ```
-## ADD
+https://zenodo.org/record/3724144/files/mGEMS_Supplementary_Table_mixed_samples.tsv
 ```
 Filter the table to contain only the *E. coli* (ecoli) experiments
 ```
-grep "ecoli" Supplementary_table_mixed_samples.tsv" > mixed_samples.tsv
+grep "ecoli" mGEMS_Supplementary_Table_mixed_samples.tsv" > mixed_samples.tsv
 ```
 If you want to reproduce the *E. faecalis* experiments, change 'ecoli'
 to 'efaec'. For *S. aureus*, change 'ecoli' to 'saur'. Running these
@@ -70,15 +70,18 @@ other two experiments may require resources beyond the typical laptop or
 desktop computer.
 
 ### <a name="referencedata"></a>Reference data
-Download the relevant reference data from **figshare (TODO:: ADD)**
-- [*E. coli*]()
-- [*E. faecalis*]()
-- [*S. aureus*]()
+Download and extract the relevant reference data from zenodo
+- [*E. coli*](https://zenodo.org/record/3724112)
+- [*E. faecalis*](https://zenodo.org/record/3724101)
+- [*S. aureus*](https://zenodo.org/record/3724135)
 
 by running
 ```
-## ADD
+wget https://zenodo.org/record/3724112/files/mGEMS-ecoli-reference-v1.0.0.tar.gz
+tar -zxvf mGEMS-ecoli-reference-v1.0.0.tar.gz
 ```
+Construction of the reference dataset(s) is describe in more detail in
+Mäklin et al. 2020.
 
 ### <a name="indexing"></a>Indexing
 Create a *31*-mer pseudoalignment index with Themisto using two
@@ -86,7 +89,7 @@ threads and maximum 8192 megabytes of RAM.
 ```
 mkdir mGEMS-ecoli-reference
 mkdir mGEMS-ecoli-reference/tmp
-build_index --k 31 --input-file mGEMS-ecoli-reference-v1.0.0.fasta --auto-colors --index-dir mGEMS-ecoli-reference --temp-dir mGEMS-ecoli-reference/tmp --mem-megas 8192 --n-threads 2
+build_index --k 31 --input-file mGEMS-ecoli-reference-sequences-v1.0.0.fasta.gz --auto-colors --index-dir mGEMS-ecoli-reference --temp-dir mGEMS-ecoli-reference/tmp --mem-megas 8192 --n-threads 2
 ```
 change 'ecoli' to 'efaec' or 'saur' if you are trying to reproduce the
 other experiments.
@@ -134,7 +137,7 @@ probabilities using two threads
 for f1 in *_1.fastq.gz; do
 	f=${f1%_1.fastq.gz}
 	mkdir $f
-	mSWEEP --themisto-1 $f""_1.aln.gz --themisto-2 $f""_2.aln.gz --themisto-index mGEMS-ecoli-reference -i mGEMS-ecoli-reference-v1.0.0.grouping -o $f/$f --write-probs --gzip-probs -t 2
+	mSWEEP --themisto-1 $f""_1.aln.gz --themisto-2 $f""_2.aln.gz --themisto-index mGEMS-ecoli-reference -i mGEMS-ecoli-reference-grouping-v1.0.0.txt -o $f/$f --write-probs --gzip-probs -t 2
 done
 ```
 

From 5ea2b053dd96301f0ac2853aaef62852c94315a4 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Tommi=20M=C3=A4klin?= <tommi@maklin.fi>
Date: Mon, 23 Mar 2020 12:54:29 +0200
Subject: [PATCH 4/7] clarify some things about the reference data

---
 docs/TUTORIAL.md | 10 +++++-----
 1 file changed, 5 insertions(+), 5 deletions(-)

diff --git a/docs/TUTORIAL.md b/docs/TUTORIAL.md
index 28a400d..9472e75 100644
--- a/docs/TUTORIAL.md
+++ b/docs/TUTORIAL.md
@@ -58,7 +58,7 @@ ulimit -n 2048
 Download the supplementary table from the mGEMS manucsript which
 contains the relevant information
 ```
-https://zenodo.org/record/3724144/files/mGEMS_Supplementary_Table_mixed_samples.tsv
+wget https://zenodo.org/record/3724144/files/mGEMS_Supplementary_Table_mixed_samples.tsv
 ```
 Filter the table to contain only the *E. coli* (ecoli) experiments
 ```
@@ -70,18 +70,18 @@ other two experiments may require resources beyond the typical laptop or
 desktop computer.
 
 ### <a name="referencedata"></a>Reference data
-Download and extract the relevant reference data from zenodo
+The reference data from Mäklin et al. is available from zenodo
 - [*E. coli*](https://zenodo.org/record/3724112)
 - [*E. faecalis*](https://zenodo.org/record/3724101)
 - [*S. aureus*](https://zenodo.org/record/3724135)
+Construction of the reference dataset(s) is describe in more detail in
+Mäklin et al. 2020.
 
-by running
+Download and extract the *E. coli* dataset by running
 ```
 wget https://zenodo.org/record/3724112/files/mGEMS-ecoli-reference-v1.0.0.tar.gz
 tar -zxvf mGEMS-ecoli-reference-v1.0.0.tar.gz
 ```
-Construction of the reference dataset(s) is describe in more detail in
-Mäklin et al. 2020.
 
 ### <a name="indexing"></a>Indexing
 Create a *31*-mer pseudoalignment index with Themisto using two

From 2d19d82d33440bf4140788aeda1f64c40c333898 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Tommi=20M=C3=A4klin?= <tommi@maklin.fi>
Date: Mon, 23 Mar 2020 12:55:04 +0200
Subject: [PATCH 5/7] fix a typo

---
 docs/TUTORIAL.md | 1 +
 1 file changed, 1 insertion(+)

diff --git a/docs/TUTORIAL.md b/docs/TUTORIAL.md
index 9472e75..1a9cced 100644
--- a/docs/TUTORIAL.md
+++ b/docs/TUTORIAL.md
@@ -74,6 +74,7 @@ The reference data from Mäklin et al. is available from zenodo
 - [*E. coli*](https://zenodo.org/record/3724112)
 - [*E. faecalis*](https://zenodo.org/record/3724101)
 - [*S. aureus*](https://zenodo.org/record/3724135)
+
 Construction of the reference dataset(s) is describe in more detail in
 Mäklin et al. 2020.
 

From 060152a6a45c87ae887b9e3c1c60305ed0bd2920 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Tommi=20M=C3=A4klin?= <tommi@maklin.fi>
Date: Mon, 23 Mar 2020 12:59:36 +0200
Subject: [PATCH 6/7] add a note about the tutorial

---
 README.md | 7 ++++++-
 1 file changed, 6 insertions(+), 1 deletion(-)

diff --git a/README.md b/README.md
index 708338d..481112c 100644
--- a/README.md
+++ b/README.md
@@ -40,7 +40,12 @@ mGEMS bin and mGEMS extract, which bin the reads in the input
 pseudoalignment (mGEMS bin) and extract the binned reads from the
 original mixed samples (mGEMS extract).
 
-### (Pseudo)tutorial — Full pipeline with Themisto and mSWEEP
+### Tutorial — E. coli ST131 sublineages
+A tutorial for reproducing the *E. coli* ST131 sublineage phylogenetic
+tree presented in Mäklin et al. 2020 using mGEMS is available in the
+[docs folder of this repository](docs/TUTORIAL.md).
+
+### Quickstart — full pipeline
 Build a [Themisto](https://github.com/jnalanko/themisto) index to
 align against.
 ```

From 3aec6d9dd63555b88d951e8807c55c84bc807cd7 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Tommi=20M=C3=A4klin?= <tommi@maklin.fi>
Date: Mon, 30 Mar 2020 13:13:49 +0300
Subject: [PATCH 7/7] add option to filter target groups by minimum abundance
 threshold

---
 README.md    |  6 +++---
 src/main.cpp | 15 +++++++++++++++
 2 files changed, 18 insertions(+), 3 deletions(-)

diff --git a/README.md b/README.md
index 55fdcd3..a99fbbf 100644
--- a/README.md
+++ b/README.md
@@ -107,9 +107,9 @@ mGEMS accepts the following input flags
 	-a                 Relative abundance estimates from mSWEEP (tab-separated, 1st
 	                   column has the group names and 2nd column the estimates).
 	--index            Themisto pseudoalignment index directory.
-	--groups           (Optional) which groups to extract from the input reads.
-	--compress         (Optional) Toggle compressing the output files
-                       (default: compress)
+	--groups           (Optional) Which groups to extract from the input reads.
+	--min-abundance    (Optional) Extract only groups that have a relative abundance higher than this value.
+	--compress         (Optional) Toggle compressing the output files (default: compress)
 ```
 
 
diff --git a/src/main.cpp b/src/main.cpp
index eb53541..9245a0d 100644
--- a/src/main.cpp
+++ b/src/main.cpp
@@ -39,9 +39,11 @@ void ParseBin(int argc, char* argv[], cxxargs::Arguments &args) {
   args.add_long_argument<std::string>("probs", "Posterior probabilities from mSWEEP.");
   args.add_long_argument<std::string>("merge-mode", "How to merge paired-end alignments from Themisto (default: intersection).", "intersection");
   args.add_long_argument<std::vector<std::string>>("groups", "Which reference groups to bin reads to (default: all).");
+  args.add_long_argument<long double>("min-abundance", "Bin only the groups that have a relative abundance higher than this value (optional).");
   args.add_short_argument<long double>('q', "Tuning parameter for the binning thresholds (default: 1.0).", (long double)1);
   args.add_long_argument<std::string>("index", "Themisto pseudoalignment index directory.");
   args.set_not_required("groups");
+  args.set_not_required("min-abundance");
 
   args.parse(argc, argv);
 }
@@ -91,6 +93,15 @@ void ReadAndExtract(cxxargs::Arguments &args) {
   Extract(bins, target_groups, args);
 }
 
+void FilterTargetGroups(const std::vector<std::string> &group_names, const std::vector<long double> &abundances, const long double min_abundance, std::vector<std::string> *target_groups) {
+  uint32_t n_groups = group_names.size();
+  for (uint32_t i = 0; i < n_groups; ++i) {
+    if (abundances[i] < min_abundance && std::find(target_groups->begin(), target_groups->end(), group_names[i]) != target_groups->end()) {
+      target_groups->erase(std::find(target_groups->begin(), target_groups->end(), group_names[i]));
+    }
+  }
+}
+
 void Bin(const cxxargs::Arguments &args, bool extract_bins) {
   DIR* dir = opendir(args.value<std::string>('o').c_str());
   if (dir) {
@@ -128,6 +139,10 @@ void Bin(const cxxargs::Arguments &args, bool extract_bins) {
   } else {
     target_groups = groups;
   }
+  if (args.is_initialized("min-abundance")) {
+    FilterTargetGroups(groups, abundances, args.value<long double>("min-abundance"), &target_groups);
+  }
+
   const std::vector<std::vector<uint32_t>> &bins = mGEMS::Bin(aln, args.value<long double>('q'), abundances, groups, probs_file.stream(), &target_groups);
   if (!extract_bins) {
     uint32_t n_bins = bins.size();