Skip to content

Commit

Permalink
Merge pull request #5 from PROBIC/dev
Browse files Browse the repository at this point in the history
mGEMS-v0.2.0 (8 March 2020)
  • Loading branch information
tmaklin authored Mar 8, 2020
2 parents 1690b35 + 629c7ad commit 69c7895
Show file tree
Hide file tree
Showing 16 changed files with 235 additions and 178 deletions.
9 changes: 0 additions & 9 deletions .gitmodules

This file was deleted.

50 changes: 26 additions & 24 deletions CMakeLists.txt
Original file line number Diff line number Diff line change
@@ -1,6 +1,5 @@
cmake_minimum_required (VERSION 2.8.12)
project (msweep-assembly)
include(ExternalProject)
project (mGEMS)

set(LIBRARY_OUTPUT_PATH ${CMAKE_BINARY_DIR}/lib)
set(CMAKE_RUNTIME_OUTPUT_DIRECTORY ${CMAKE_BINARY_DIR}/bin)
Expand All @@ -14,17 +13,6 @@ elseif(CMAKE_BUILD_TYPE MATCHES Debug)
set(CMAKE_SHARED_LINKER_FLAGS "${CMAKE_SHARED_LINKER_FLAGS} -g -Wall -Wextra -Wpedantic")
endif()

if(CMAKE_BUILD_TYPE MATCHES Release)
set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -D_GLIBCXX_PARALLEL -flto")
set(CMAKE_EXE_LINKER_FLAGS "${CMAKE_EXE_LINKER_FLAGS} -D_GLIBCXX_PARALLEL -flto")
set(CMAKE_SHARED_LINKER_FLAGS "${CMAKE_SHARED_LINKER_FLAGS} -D_GLIBCXX_PARALLEL -flto")
if(CMAKE_CXX_COMPILER STREQUAL "GNU")
set(CMAKE_AR "gcc-ar")
set(CMAKE_C_ARCHIVE_CREATE "<CMAKE_AR> qcs <TARGET> <LINK_FLAGS> <OBJECTS>")
set(CMAKE_C_ARCHIVE_FINISH true)
endif()
endif()

## Set C++11 support depending on cmake version
if (${CMAKE_MAJOR_VERSION} GREATER 2 AND ${CMAKE_MINOR_VERSION} GREATER 0)
set (CMAKE_CXX_STANDARD 11)
Expand All @@ -33,24 +21,34 @@ else()
add_compile_options(-std=c++11)
endif()

## mGEMS executable
add_executable(mGEMS ${CMAKE_CURRENT_SOURCE_DIR}/src/main.cpp)

## Check supported compression types
find_package(BZip2)
include_directories(${BZIP2_INCLUDE_DIRS})

if (BZIP2_FOUND)
include_directories(${BZIP2_INCLUDE_DIRS})
target_link_libraries(mGEMS ${BZIP2_LIBRARIES})
endif()
find_package(LibLZMA)
include_directories(${LIBLZMA_INCLUDE_DIRS})

if (LIBLZMA_FOUND)
include_directories(${LIBLZMA_INCLUDE_DIRS})
target_link_libraries(mGEMS ${LIBLZMA_LIBRARIES})
endif()
find_package(ZLIB)
include_directories(${ZLIB_INCLUDE_DIRS})

add_executable(mGEMS ${CMAKE_CURRENT_SOURCE_DIR}/src/main.cpp)
if (ZLIB_FOUND)
include_directories(${ZLIB_INCLUDE_DIRS})
target_link_libraries(mGEMS ${ZLIB_LIBRARIES})
endif()

## Check dependencies and download them if not given
## telescope
if (DEFINED CMAKE_TELESCOPE_LIBRARY AND DEFINED CMAKE_TELESCOPE_HEADERS)
find_library(TELESCOPE NAMES telescope HINTS ${CMAKE_TELESCOPE_LIBRARY})
target_link_libraries(mGEMS ${TELESCOPE})
include_directories("${CMAKE_TELESCOPE_HEADERS}")
else()
configure_file(CMakeLists-telescope.txt.in ${CMAKE_BINARY_DIR}/external/telescope-download/CMakeLists.txt)
configure_file(${CMAKE_CURRENT_SOURCE_DIR}/config/CMakeLists-telescope.txt.in ${CMAKE_BINARY_DIR}/external/telescope-download/CMakeLists.txt)
execute_process(COMMAND ${CMAKE_COMMAND} -G "${CMAKE_GENERATOR}" .
RESULT_VARIABLE result
WORKING_DIRECTORY ${CMAKE_BINARY_DIR}/external/telescope-download )
Expand All @@ -70,10 +68,11 @@ else()
target_link_libraries(mGEMS libtelescope)
endif()

## bxzstr
if (DEFINED CMAKE_BXZSTR_HEADERS)
include_directories("${CMAKE_BXZSTR_HEADERS}")
else()
configure_file(CMakeLists-bxzstr.txt.in ${CMAKE_BINARY_DIR}/external/bxzstr-download/CMakeLists.txt)
configure_file(${CMAKE_CURRENT_SOURCE_DIR}/config/CMakeLists-bxzstr.txt.in ${CMAKE_BINARY_DIR}/external/bxzstr-download/CMakeLists.txt)
execute_process(COMMAND ${CMAKE_COMMAND} -G "${CMAKE_GENERATOR}" .
RESULT_VARIABLE result
WORKING_DIRECTORY ${CMAKE_BINARY_DIR}/external/bxzstr-download )
Expand All @@ -89,10 +88,11 @@ else()
include_directories(${CMAKE_CURRENT_BINARY_DIR}/external/bxzstr/include)
endif()

## cxxargs
if (DEFINED CMAKE_CXXARGS_HEADERS)
include_directories("${CMAKE_CXXARGS_HEADERS}")
else()
configure_file(CMakeLists-cxxargs.txt.in ${CMAKE_BINARY_DIR}/external/cxxargs-download/CMakeLists.txt)
configure_file(${CMAKE_CURRENT_SOURCE_DIR}/config/CMakeLists-cxxargs.txt.in ${CMAKE_BINARY_DIR}/external/cxxargs-download/CMakeLists.txt)
execute_process(COMMAND ${CMAKE_COMMAND} -G "${CMAKE_GENERATOR}" .
RESULT_VARIABLE result
WORKING_DIRECTORY ${CMAKE_BINARY_DIR}/external/cxxargs-download )
Expand Down Expand Up @@ -128,13 +128,15 @@ string(TIMESTAMP _BUILD_TIMESTAMP)
## Generate a version.h file containing build version and timestamp
configure_file(${CMAKE_SOURCE_DIR}/include/version.h.in ${CMAKE_BINARY_DIR}/include/version.h @ONLY)

## external/include has the version info
include_directories(include ${CMAKE_SOURCE_DIR}/external)

## mGEMS library
add_library(libmgems ${CMAKE_CURRENT_SOURCE_DIR}/src/mGEMS.cpp
${CMAKE_CURRENT_SOURCE_DIR}/src/bin_reads.cpp
${CMAKE_CURRENT_SOURCE_DIR}/src/extract_bin.cpp
${CMAKE_CURRENT_SOURCE_DIR}/src/mGEMS.cpp)
set_target_properties(libmgems PROPERTIES OUTPUT_NAME mgems)

# Link libraries
target_link_libraries(mGEMS libmgems ${ZLIB_LIBRARIES} ${LIBLZMA_LIBRARIES} ${BZIP2_LIBRARIES})
target_link_libraries(mGEMS libmgems)
112 changes: 58 additions & 54 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -2,8 +2,8 @@

Bacterial sequencing data binning on strain-level based on probabilistic taxonomic classification.

# Installation
## Dependencies
## Installation
### Dependencies
To run the binning + assembly pipeline, you will need a program that
does pseudoalignment and another program that estimates an assignment
probability matrix for the reads to the alignment targets.
Expand All @@ -13,27 +13,34 @@ We recommend to use [Themisto](https://github.com/jnalanko/themisto)
[mSWEEP](https://github.com/probic/msweep-assembly) (v1.3.2 or newer)
for estimating the probability matrix.

## Compiling from source
### Requirements
### Compiling from source
#### Requirements
- C++11 compliant compiler.
- cmake

### Compilation
Clone the repository (note the *--recursive* option in git clone)
#### Compilation
Clone the repository
```
git clone --recursive https://github.com/PROBIC/msweep-assembly.git
git clone https://github.com/PROBIC/mGEMS.git
```
enter the directory and run
```
> mkdir build
> cd build
> cmake ..
> make
mkdir build
cd build
cmake ..
make
```
This will compile the read_alignment, assign_reads, build_sample, and telescope executables in the build/bin/ directory.
This will compile the mGEMS executable in the build/bin/ directory.

# Usage
## Indexing
## Usage
### mGEMS
The mGEMS executable provides three commands: mGEMS, mGEMS bin, and
mGEMS extract. The first command (mGEMS) is shorthand for running both
mGEMS bin and mGEMS extract, which bin the reads in the input
pseudoalignment (mGEMS bin) and extract the binned reads from the
original mixed samples (mGEMS extract).

### (Pseudo)tutorial — Full pipeline with Themisto and mSWEEP
Build a [Themisto](https://github.com/jnalanko/themisto) index to
align against.
```
Expand All @@ -44,67 +51,64 @@ build_index --k 31 --input-file example.fasta --auto-colors --index-dir themisto

Align paired-end reads 'reads_1.fastq.gz' and 'reads_2.fastq.gz' with Themisto
```
pseudoalign --index-dir themisto_index --query-file reads_1.fastq.gz --outfile pseudoalignments_1.txt --rc --temp-dir themisto_index/tmp --n-threads 16 --mem-megas 8192
pseudoalign --index-dir themisto_index --query-file reads_2.fastq.gz --outfile pseudoalignments_2.txt --rc --temp-dir themisto_index/tmp --n-threads 16 --mem-megas 8192
pseudoalign --index-dir themisto_index --query-file reads_1.fastq.gz --outfile pseudoalignments_1.aln --rc --temp-dir themisto_index/tmp --n-threads 16 --mem-megas 8192
pseudoalign --index-dir themisto_index --query-file reads_2.fastq.gz --outfile pseudoalignments_2.aln --rc --temp-dir themisto_index/tmp --n-threads 16 --mem-megas 8192
```

Convert the pseudoalignment to
[kallisto](https://github.com/pachterlab/kallisto) format using
[telescope](https://github.com/tmaklin/telescope) (supplied with the msweep-assembly installation).
Estimate the relative abundances with mSWEEP (reference_grouping.txt
should contain the groups the sequences in 'example.fasta' are
assigned to. See the [mSWEEP](https://github.com/probic/msweep-assembly) usage instructions for details).
```
mkdir outfolder
ntargets=$(sort themisto_index/coloring-names.txt | uniq | wc -l)
telescope --n-refs $ntargets -r pseudoalignments_1.txt,pseudoalignments_2.txt -o outfolder --mode intersection
mSWEEP --themisto-1 pseudoalignments_1.aln --themisto-2 pseudoalignments_2.aln -o mSWEEP -i reference_grouping.txt --write-probs
```

Create a fake kallisto-style run_info.json file using the
Themisto_run_info.sh script in the root directory of this project
Bin the reads and write all bins to the 'mGEMS-out' folder
```
Themisto_run_info.sh $(wc -l < pseudoalignments_1.txt) $ntargets > outfolder/run_info.json
mkdir mGEMS-out
mGEMS -r reads_1.fastq.gz,reads_2.fastq.gz --themisto-alns pseudoalignments_1.txt,pseudoalignments_2.txt -o mGEMS-out --probs mSWEEP_probs.csv -a mSWEEP_abundances.txt --index themisto_index
```
This will write the binned paired-end reads for *all groups* in the
mSWEEP_abundances.txt file in the mGEMS-out folder (compressed with
zlib).

Determine read assignments to equivalence classes from the kallisto
format files
### Advanced use
... or bin and write only the reads that are assigned to "group-3" or
"group-4" by adding the '--groups group-3,group-4' flag
```
read_alignment -e outfolder/pseudoalignments.ec -s outfolder/read-to-ref.txt -o outfolder --write-ecs --themisto --n-refs $ntargets --gzip-output
mGEMS --groups group-3,group-4 -r reads_1.fastq.gz,reads_2.fastq.gz --themisto-alns pseudoalignments_1.txt,pseudoalignments_2.txt -o mGEMS-out --probs mSWEEP_probs.csv -a mSWEEP_abundances.txt --index themisto_index
```

Estimate the relative abundances with mSWEEP (reference_grouping.txt
should contain the groups the sequences in 'example.fasta' are
assigned to. See the [mSWEEP](https://github.com/probic/msweep-assembly) usage instructions for details).
Alternatively, find and write only the read bins for "group-3" and
"group-4", skipping extracting the reads
```
mSWEEP -f outfolder -i reference_grouping.txt -o msweep-out --write-probs --gzip-probs
mGEMS bin --groups group-3,group-4 --themisto-alns pseudoalignments_1.txt,pseudoalignments_2.txt -o mGEMS-out --probs mSWEEP_probs.csv -a mSWEEP_abundances.txt --index themisto_index
```

(Optional) Extract the names of the 3 most abundant reference
groups.
... and extract the reads when feeling like it
```
grep -v "^[#]" msweep-out_abundances.txt | sort -rgk2 | cut -f1 | head -n3 > most_abundant_groups.txt
mGEMS extract --bins mGEMS-out/group-3.bin,mGEMS-out/group-4.bin -r
reads_1.fastq.gz,reads_2.fastq.gz -o mGEMS-out
```
If you use a more refined method or know which reference groups (as
specified in the reference_grouping.txt file) you want to assemble,
put their names in a .txt file where each line corresponds to a
cluster name instead. It is also possible to supply the names in
tab-separated format on one or more lines.

Assign reads to the 3 most abundant reference groups based on the estimated probabilities
### Accepted input flags
mGEMS accepts the following input flags
```
assign_reads -f outfolder/ec_to_read.csv.gz -p msweep-out_probs.csv.gz -a msweep-out_abundances.txt -o outfolder/ --groups most_abundant_groups.txt --gzip-output
-r Comma-separated list of input read(s).
--themisto-alns Comma-separated list of pseudoalignment file(s)
for the reads from themisto.
-o Output directory (must exist before running!).
--probs Comma-separated Posterior probability matrix (output from mSWEEP with
the --write-probs flag).
-a Relative abundance estimates from mSWEEP (tab-separated, 1st
column has the group names and 2nd column the estimates).
--index Themisto pseudoalignment index directory.
--groups (Optional) which groups to extract from the input reads.
--compress (Optional) Toggle compressing the output files
(default: compress)
```

Construct the binned samples from the original files

```
while read -r sample; do
build_sample -a outfolder/$sample""_reads.txt.gz -o outfolder/$sample -1 reads_1.fastq.gz -2 reads_2.fastq.gz --gzip-output
done < most_abundant_groups.txt
```
This will create the <group name>_1.fastq.gz and <group
name>_2.fastq.gz files in the outfolder, which you can assemble with
your assembler of choice.

# License
## License
The source code from this project is subject to the terms of the MIT
license. A copy of the MIT license is supplied with the project, or
can be obtained at https://opensource.org/licenses/MIT.
9 changes: 0 additions & 9 deletions Themisto_run_info.sh

This file was deleted.

3 changes: 1 addition & 2 deletions CMakeLists-bxzstr.txt.in → config/CMakeLists-bxzstr.txt.in
Original file line number Diff line number Diff line change
Expand Up @@ -7,8 +7,7 @@ ExternalProject_Add(bxzstr-download
GIT_REPOSITORY https://github.com/tmaklin/bxzstr.git
GIT_TAG master
SOURCE_DIR "${CMAKE_CURRENT_BINARY_DIR}/external/bxzstr"
BUILD_IN_SOURCE 1
CONFIGURE_COMMAND cmake -DLIBLZMA_FOUND=1 -DBZIP2_FOUND=0
BUILD_IN_SOURCE 0
BUILD_COMMAND ""
INSTALL_COMMAND ""
TEST_COMMAND ""
Expand Down
File renamed without changes.
Original file line number Diff line number Diff line change
Expand Up @@ -5,7 +5,7 @@ include(ExternalProject)

ExternalProject_Add(telescope-download
GIT_REPOSITORY https://github.com/tmaklin/telescope.git
GIT_TAG dev
GIT_TAG v0.2.0
SOURCE_DIR "${CMAKE_CURRENT_BINARY_DIR}/external/telescope"
BUILD_IN_SOURCE 0
BUILD_COMMAND ""
Expand Down
1 change: 0 additions & 1 deletion external/bxzstr
Submodule bxzstr deleted from 4e05f3
1 change: 0 additions & 1 deletion external/cxxargs
Submodule cxxargs deleted from ef6a4f
1 change: 0 additions & 1 deletion external/telescope
Submodule telescope deleted from 5a710a
4 changes: 2 additions & 2 deletions include/assign_reads.h → include/bin_reads.h
Original file line number Diff line number Diff line change
Expand Up @@ -12,8 +12,8 @@
namespace mGEMS {
uint32_t ReadAbundances(std::istream &stream, std::vector<long double> *abundances, std::vector<std::string> *groups);
void ConstructThresholds(const uint32_t num_ecs, const long double theta_frac, const std::vector<long double> &abundances, std::vector<long double> *thresholds);
void AssignProbs(const std::vector<long double> &thresholds, std::istream &probs_file, std::vector<std::vector<bool>> *assignments);
void BinReads(const std::vector<std::vector<bool>> &assignments, const std::vector<bool> &groups_to_assign, const std::vector<std::vector<uint32_t>> &aligned_reads, std::vector<std::vector<uint32_t>> *assigned_reads);
std::vector<bool> AssignProbs(const std::vector<long double> &thresholds, std::istream &probs_file, std::vector<std::string> *target_groups, std::vector<std::vector<bool>> *assignments, const std::vector<std::vector<uint32_t>> &assigned_reads, std::vector<std::vector<uint32_t>> *bins);
void BinReads(const std::vector<std::vector<bool>> &assignments, const std::vector<bool> &groups_to_assign, const std::vector<std::vector<uint32_t>> &aligned_reads, std::vector<std::vector<uint32_t>> *assigned_reads);
void WriteBin(const std::vector<uint32_t> &binned_reads, std::ostream &of);
}

Expand Down
2 changes: 1 addition & 1 deletion include/extract_bin.h
Original file line number Diff line number Diff line change
Expand Up @@ -6,7 +6,7 @@
#include "file.hpp"

namespace mGEMS {
void ExtractBin(const std::vector<uint32_t> &bin_assignments, std::ostream* out_strand_1, std::ostream* out_strand_2, std::istream* in_strand_1, std::istream* in_strand_2);
void ExtractBin(const std::vector<uint32_t> &bin_assignments, std::vector<File::In> &in_strands, std::vector<File::Out> *out_strands);
std::vector<uint32_t> ReadBin(std::istream &stream);
}

Expand Down
Loading

0 comments on commit 69c7895

Please sign in to comment.