iqbal-lab-org · mbhall88 · Nov 2, 2020 · Oct 21, 2020 · Oct 22, 2020 · Oct 22, 2020
diff --git a/.travis.yml b/.travis.yml
@@ -2,35 +2,48 @@ language: cpp
 compiler: gcc
 os: linux
 dist: bionic
+env:
+  - OMP_NUM_THREADS=4  # OpenMP thread count for the build; see https://docs.travis-ci.com/user/languages/cpp/#openmp-projects
+
+git:
+  depth: 3  # shallow clone; see https://docs.travis-ci.com/user/customizing-the-build/#git-clone-depth
+
+cache:
+  directories:
+    - ${TRAVIS_BUILD_DIR}/deps/boost-1.62.0  # same path as BOOST_DIR in ci/install.sh (DEPS_DIR/boost-BOOST_VERSION)
+
+branches:
+  only: # build all branches https://docs.travis-ci.com/user/customizing-the-build/#safelisting-or-blocklisting-branches
+    - gh-pages
+    - /.*/  # regular expression that matches every branch name
 
 include:
   - os: linux
     addons:
       apt:
         packages:
           - clang-format-8
+          - zlib1g-dev
 
 jobs:
+  fast_finish: true
   include:
-    - stage: "Lint"                # naming the Tests stage
-      name: "Format"            # names the first Tests stage job
+    - stage: "Format"
       script:
         - find src/ include/ test/ -type f \( -iname \*.h -o -iname \*.cpp \) | xargs -I _ clang-format -style=file -output-replacements-xml _ | grep -c "<replacement " >/dev/null
         - if [ $? -ne 1 ]; then echo "Not all source and header files are formatted with clang-format"; exit 1; fi
-    - stage: "Build"
-      script: echo "This is where we build Pandora"
-    - stage: "Test"
-      name: "Unit tests"
-      script: echo "This is where we run the unit tests"
-    - stage: "Deploy"
-      name: "DockerHub"
-      script: echo "This is where we deploy a container to Docker Hub"
+    - stage: "Build and Test"
+      env:
+        - BOOST_VERSION="1.62.0"  # must agree with the cached deps/boost-1.62.0 directory above
+        - BOOST_LIBS="system,filesystem,iostreams,log,thread,date_time"  # handed to bootstrap.sh --with-libraries in ci/install.sh
+        - BUILD_TYPE="Debug"  # becomes -DCMAKE_BUILD_TYPE in ci/script.sh
+        - DEPS_DIR="${TRAVIS_BUILD_DIR}/deps"  # parent directory of the Boost tree used by ci/install.sh
+      install: bash ci/install.sh  # downloads Boost if needed, then bootstraps and installs it
+      script: bash ci/script.sh  # cmake + make, then ./pandora --help and ctest
 
 stages:
-  - "Lint"
-  - "Build"
-  - "Test"
-  - "Deploy"
+  - "Format"
+  - "Build and Test"
 
 notifications:
   email: false

diff --git a/CMakeLists.txt b/CMakeLists.txt
@@ -46,13 +46,16 @@ set(EXTERNAL_LIBS
         gatbcore
         hdf5
         hdf5_tools)
+link_directories(${gatb_binary_dir}/lib)  # NOTE(review): presumably where the gatbcore/hdf5 libraries named in EXTERNAL_LIBS are built - confirm
 
 include(${PROJECT_SOURCE_DIR}/ext/gtest.cmake)
 
 #include directories as SYSTEM includes, thus warnings will be ignored for these
 include_directories(SYSTEM
         ${CMAKE_BINARY_DIR}/include
         ${PROJECT_SOURCE_DIR}/cgranges/cpp
+        ${gatb_source_dir}/gatb-core/src  # NOTE(review): presumably the GATB-core headers - confirm
+        ${gatb_binary_dir}/include  # NOTE(review): presumably headers produced by the GATB build - confirm
         )
 
 #normal includes: warnings will be reported for these

diff --git a/README.md b/README.md
@@ -1,86 +1,148 @@
-[![Build Status](https://travis-ci.org/rmcolq/pandora.svg?branch=master)](https://travis-ci.org/rmcolq/pandora) master
+| Branch             | Status                                                                           |
+|:-------------------|:---------------------------------------------------------------------------------|
+| [`master`][master] | ![Travis (.com) branch](https://img.shields.io/travis/com/rmcolq/pandora/master) |
+| [`dev`][dev]       | ![Travis (.com) branch](https://img.shields.io/travis/com/rmcolq/pandora/dev)    |
 
-[![Build Status](https://travis-ci.com/rmcolq/pandora.svg?token=mxzxNwUzHrkcpsL2i7zU&branch=dev)](https://travis-ci.com/rmcolq/pandora) dev
+[master]: https://github.com/rmcolq/pandora/tree/master
+[dev]: https://github.com/rmcolq/pandora/tree/dev
 
-![Docker Cloud Build Status](https://img.shields.io/docker/cloud/build/rmcolq/pandora)
 
 # Pandora
 
-## Contents
-* [Introduction](#introduction)
-* [Quick Start](#quick-start)
-* [Installation](#installation)
-* [Usage](#usage)
+[TOC]: #
+
+# Table of Contents
+- [Introduction](#introduction)
+- [Quick Start](#quick-start)
+- [Installation](#installation)
+  - [Containers](#containers)
+  - [Installation from source](#installation-from-source)
+- [Usage](#usage)
+  - [Population Reference Graphs](#population-reference-graphs)
+  - [Build index](#build-index)
+  - [Map reads to index](#map-reads-to-index)
+  - [Compare reads from several samples](#compare-reads-from-several-samples)
+  - [Discover novel variants](#discover-novel-variants)
+
 
 ## Introduction
-Pandora is a tool for bacterial genome analysis using a pangenome reference graph (PanRG). It allows gene presence/absence detection and genotyping of SNPs, indels and longer variants in one or a number of samples. Pandora works with Illumina or Nanopore data. Core ideas behind the method are:
- - new genomes look like recombinants (plus mutations) of things seen before
- - we should be analysing nucleotide-level variation everywhere, not just in core genes
- - arbitrary single reference genomes are unnatural and limit comparisons of diverse sets of genomes
 
-The pangenome reference graph (PanRG) is a collection of 'floating' local graphs, each representing some orthologous region of interest (e.g. genes, mobile elements or intergenic regions). See https://github.com/rmcolq/make_prg for a pipeline which can construct these PRGs from a set of aligned sequence files.
+Pandora is a tool for bacterial genome analysis using a pangenome
+reference graph (PanRG). It allows gene presence/absence detection and
+genotyping of SNPs, indels and longer variants in one or a number of
+samples. Pandora works with Illumina or Nanopore data. Core ideas behind
+the method are:
+- new genomes look like recombinants (plus mutations) of things seen
+  before
+- we should be analysing nucleotide-level variation everywhere, not just
+  in core genes
+- arbitrary single reference genomes are unnatural and limit comparisons
+  of diverse sets of genomes
+
+The pangenome reference graph (PanRG) is a collection of 'floating'
+local graphs, each representing some orthologous region of interest
+(e.g. genes, mobile elements or intergenic regions). See
+https://github.com/rmcolq/make_prg for a pipeline which can construct
+these PRGs from a set of aligned sequence files.
 
 Pandora can do the following for a single sample (read dataset):
-- Output inferred mosaic of reference sequences for loci (eg genes) from the PanRG which are present
-- Output a VCF showing the variation found within these loci, with respect to any reference path in the PRG.
+- Output inferred mosaic of reference sequences for loci (e.g. genes) from
+  the PanRG which are present
+- Output a VCF showing the variation found within these loci, with
+  respect to any reference path in the PRG.
 
 Soon, in a galaxy not so far away, it will allow:
 - discovery of new variation not in the PRG
 
 For a collection of samples, it can:
-- Output a matrix showing inferred copy-number of each locus in each sample genome
-- Output a multisample pangenome VCF showing how including genotype calls for each sample in each of the loci
-- Output one VCF per orthologous-chunk, showing how samples which contained this chunk differed in their gene sequence. Variation is shown with respect to the most informative recombinant path in the PRG.
+- Output a matrix showing inferred copy-number of each locus in each
+  sample genome
+- Output a multisample pangenome VCF including genotype calls for each
+  sample in each of the loci
+- Output one VCF per orthologous-chunk, showing how samples which
+  contained this chunk differed in their gene sequence. Variation is
+  shown with respect to the most informative recombinant path in the
+  PRG.
 
 Warning - this code is still in development.
 
 ## Quick Start
+
 Index PanRG file:
+
 ```
 pandora index -t 8 <panrg.fa>
 ```
-Compare first 30X of each Illumina sample to get pangenome matrix and VCF
+
+Compare first 30X of each Illumina sample to get pangenome matrix and
+VCF
+
 ```
 pandora compare --genotype --illumina --max-covg 30 <panrg.fa> <read_index.tab>
 ```
-Map Nanopore reads from a single sample to get approximate sequence for genes present
+
+Map Nanopore reads from a single sample to get approximate sequence for
+genes present
+
 ```
 pandora map <panrg.fa> <reads.fq>
 ```
 
 ## Installation
 
 ### Containers
-We highly recommend that you download a containerized image of Pandora. Pandora is hosted on Dockerhub and images can be downloaded with the command:
+
+![Docker Cloud Build Status](https://img.shields.io/docker/cloud/build/rmcolq/pandora)
+
+We highly recommend that you download a containerized image of Pandora.
+Pandora is hosted on Dockerhub and images can be downloaded with the
+command:
+
 ```
 docker pull rmcolq/pandora:latest
 ```
+
 Alternatively, using singularity:
+
 ```
 singularity pull docker://rmcolq/pandora:latest
 ```
+
 NB For consistency, we no longer maintain images on singularity hub.
 
 ### Installation from source
-This is not recommended because the required zlib and boost system installs do not always play nicely.
-If you want to take the risk:
+
+This is not recommended because the required zlib and boost system
+installs do not always play nicely. If you want to take the risk:
 - Requires a Unix or Mac OS.
-- Requires a system install of `zlib`. If this is not already installed, [this](https://geeksww.com/tutorials/libraries/zlib/installation/installing_zlib_on_ubuntu_linux.php) tutorial is helpful or try the following.
+- Requires a system install of `zlib`. If this is not already installed,
+  [this](https://geeksww.com/tutorials/libraries/zlib/installation/installing_zlib_on_ubuntu_linux.php)
+  tutorial is helpful or try the following.
+
 ```
 wget http://www.zlib.net/zlib-1.2.11.tar.gz -O - | tar xzf -
 cd zlib-1.2.11
 ./configure [--prefix=/prefix/path]
 make
 make install
 ```
-- Requires a system installation of `boost` containing the `system`, `filesystem`, `log` (which also depends on `thread` and `date_time`) and `iostreams` libraries. If not already installed use the following or look at [this](https://www.boost.org/doc/libs/1_62_0/more/getting_started/unix-variants.html) guide.
+
+- Requires a system installation of `boost` containing the `system`,
+  `filesystem`, `log` (which also depends on `thread` and `date_time`)
+  and `iostreams` libraries. If not already installed use the following
+  or look at
+  [this](https://www.boost.org/doc/libs/1_62_0/more/getting_started/unix-variants.html)
+  guide.
+
 ```
 wget https://sourceforge.net/projects/boost/files/boost/1.62.0/boost_1_62_0.tar.gz -O - | tar xzf -
 cd boost_1_62_0
 ./bootstrap.sh [--prefix=/prefix/path] --with-libraries=system,filesystem,iostreams,log,thread,date_time
 ./b2 install
 ```
+
 - Download and install `pandora` as follows:
+
 ```
 git clone --single-branch https://github.com/rmcolq/pandora.git --recursive
 cd pandora
@@ -113,11 +175,20 @@ Subcommands:
 ```
 
 ### Population Reference Graphs
-Pandora assumes you have already constructed a fasta-like file of graphs, one entry for each gene/ genome region of interest.
-If you haven't, you will need a multiple sequence alignment for each graph. Precompiled collections of MSA representing othologous gene clusters for a number of species can be downloaded from [here](http://pangenome.de/) and converted to graphs using the pipeline from [here](https://github.com/rmcolq/make_prg).
+
+Pandora assumes you have already constructed a fasta-like file of
+graphs, one entry for each gene/genome region of interest. If you
+haven't, you will need a multiple sequence alignment for each graph.
+Precompiled collections of MSA representing orthologous gene clusters for
+a number of species can be downloaded from [here](http://pangenome.de/)
+and converted to graphs using the pipeline from
+[here](https://github.com/rmcolq/make_prg).
 
 ### Build index
-Takes a fasta-like file of PanRG sequences and constructs an index, and a directory of gfa files to be used by `pandora map` or `pandora compare`. These are output in the same directory as the PanRG file.
+
+Takes a fasta-like file of PanRG sequences and constructs an index, and
+a directory of gfa files to be used by `pandora map` or `pandora
+compare`. These are output in the same directory as the PanRG file.
 
 ```
 $ pandora index --help
@@ -136,10 +207,15 @@ Options:
   -v                          Verbosity of logging. Repeat for increased verbosity
 ```
 
-The index stores (w,k)-minimizers for each PanRG path found. These parameters can be specified, but default to w=14, k=15.
+The index stores (w,k)-minimizers for each PanRG path found. These
+parameters can be specified, but default to w=14, k=15.
 
 ### Map reads to index
-This takes a fasta/q of Nanopore or Illumina reads and compares to the index. It infers which of the PanRG genes/elements is present, and for those that are present it outputs the inferred sequence and a genotyped VCF.
+
+This takes a fasta/q of Nanopore or Illumina reads and compares to the
+index. It infers which of the PanRG genes/elements is present, and for
+those that are present it outputs the inferred sequence and a genotyped
+VCF.
 
 ```
 $ pandora map --help
@@ -199,7 +275,12 @@ Genotyping:
 ```
 
 ### Compare reads from several samples
-This takes Nanopore or Illumina read fasta/q for a number of samples, mapping each to the index. It infers which of the PanRG genes/elements is present in each sample, and outputs a presence/absence pangenome matrix, the inferred sequences for each sample and a genotyped multisample pangenome VCF.
+
+This takes Nanopore or Illumina read fasta/q for a number of samples,
+mapping each to the index. It infers which of the PanRG genes/elements
+is present in each sample, and outputs a presence/absence pangenome
+matrix, the inferred sequences for each sample and a genotyped
+multisample pangenome VCF.
 
 ```
 $ pandora compare --help
@@ -256,26 +337,27 @@ Genotyping:
 
 ### Discover novel variants
 
-This will look for regions in the pangraph where the reads do not map and attempt to locally assemble these regions to find novel variants.
+This will look for regions in the pangraph where the reads do not map
+and attempt to locally assemble these regions to find novel variants.
 
 ```
 $ pandora discover --help
 Quasi-map reads to an indexed PRG, infer the sequence of present loci in the sample and discover novel variants.
-Usage: ./pandora discover [OPTIONS] <TARGET> <QUERY>
+Usage: pandora discover [OPTIONS] <TARGET> <QUERY>
 
 Positionals:
   <TARGET> FILE [required]    An indexed PRG file (in fasta format)
   <QUERY> FILE [required]     Fast{a,q} file containing reads to quasi-map
 
 Options:
   -h,--help                   Print this help message and exit
-  --discover-k INT            K-mer size to use when discovering novel variants [default: 11]
+  --discover-k INT:[0-32)     K-mer size to use when discovering novel variants [default: 11]
   --max-ins INT               Max. insertion size for novel variants. Warning: setting too long may impair performance [default: 15]
   --covg-threshold INT        Positions with coverage less than this will be tagged for variant discovery [default: 3]
   -l INT                      Min. length of consecutive positions below coverage threshold to trigger variant discovery [default: 1]
   -L INT                      Max. length of consecutive positions below coverage threshold to trigger variant discovery [default: 50]
   -P,--pad INT                Padding either side of candidate variant intervals [default: 22]
-  -d,--merge INT              Merge candidate variant intervals within distance [default: 22]
+  -d,--merge INT              Merge candidate variant intervals within distance [default: 15]
   --min-dbg-dp INT            Minimum node/kmer depth in the de Bruijn graph used for discovering variants [default: 2]
   -v                          Verbosity of logging. Repeat for increased verbosity
 
@@ -303,8 +385,10 @@ Preset:
 
 Filtering:
   --clean                     Add a step to clean and detangle the pangraph
+  --clean-dbg                 Clean the local assembly de Bruijn graph
   --max-covg INT              Maximum coverage of reads to accept [default: 600]
 
 Consensus/Variant Calling:
   --kmer-avg INT              Maximum number of kmers to average over when selecting the maximum likelihood path [default: 100]
-```
+```
+
diff --git a/ci/install.sh b/ci/install.sh
@@ -0,0 +1,21 @@
+#!/bin/bash
+# Travis "install" phase: fetch (unless cached), bootstrap and install Boost.
+# Inputs (set in .travis.yml): BOOST_VERSION, BOOST_LIBS, DEPS_DIR.
+set -evu
+set -o pipefail  # a failed download must fail the whole wget | tar pipeline
+
+BOOST_URL="https://sourceforge.net/projects/boost/files/boost/${BOOST_VERSION}/boost_${BOOST_VERSION//\./_}.tar.gz"
+BOOST_DIR="${DEPS_DIR}/boost-${BOOST_VERSION}"
+BOOST_ROOT="/usr"
+export BOOST_DIR
+export BOOST_ROOT
+
+# BOOST_DIR is cached between builds, so the sources are only downloaded when
+# the tree is missing or incomplete (bootstrap.sh serves as the marker file).
+if [[ ! -f "${BOOST_DIR}/bootstrap.sh" ]]; then
+  mkdir -p "${BOOST_DIR}"
+  { wget --quiet -O - "${BOOST_URL}" | tar --strip-components=1 -xz -C "${BOOST_DIR}"; } || exit 1
+fi
+
+cd "$BOOST_DIR" || exit 1
+{ sudo ./bootstrap.sh --with-libraries="$BOOST_LIBS" --prefix="$BOOST_ROOT" && sudo ./b2 install; } || exit 1
diff --git a/ci/script.sh b/ci/script.sh
@@ -0,0 +1,17 @@
+#!/bin/bash
+# Travis "script" phase: configure, build and test pandora.
+# Input (set in .travis.yml): BUILD_TYPE.
+set -evu
+
+# No trailing space here: the variable is passed to cmake as one quoted
+# argument, so a stray space would become part of the CMAKE_BUILD_TYPE value.
+export CMAKE_OPTIONS="-DCMAKE_BUILD_TYPE=$BUILD_TYPE"
+
+echo "$CMAKE_OPTIONS"
+mkdir -p build
+cd build || exit 1
+{ cmake "$CMAKE_OPTIONS" .. && make -j4; } || exit 1
+# put the build directory (where the pandora binary was just built) on PATH
+export PATH="${PWD}:${PATH}"
+./pandora --help
+ctest -V || exit 1