Skip to content

Commit

Permalink
Merge pull request #74 from broadinstitute/dpark-dev
Browse files Browse the repository at this point in the history
change subsampled reads from fastq to bam, add installation instructions to RTD
  • Loading branch information
dpark01 committed Jan 18, 2015
2 parents a594a66 + 634a9c7 commit e1902ce
Show file tree
Hide file tree
Showing 16 changed files with 190 additions and 62 deletions.
11 changes: 5 additions & 6 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -5,11 +5,10 @@
viral-ngs
=========

A set of scripts and tools for the analysis of viral NGS data. More
details to come.
A set of scripts and tools for the analysis of viral NGS data.


Usage
=========

Detailed command line help can be found at http://viral-ngs.readthedocs.org/
More detailed documentation can be found at http://viral-ngs.readthedocs.org/
This includes installation instructions,
usage instructions for the command line tools,
and usage of the pipeline infrastructure.
63 changes: 45 additions & 18 deletions assembly.py
Original file line number Diff line number Diff line change
Expand Up @@ -17,26 +17,34 @@
log = logging.getLogger(__name__)


def bam_to_trim_rmdup_subsamp_fastqs(inBam, clipDb, n_reads=100000):
def trim_rmdup_subsamp_reads(inBam, clipDb, outBam, n_reads=100000, outFastqs=None):
''' Take reads through Trimmomatic, Prinseq, and subsampling.
'''

# BAM -> fastq
infq = list(map(util.file.mkstempfname, ['.in.1.fastq', '.in.2.fastq']))
tools.picard.SamToFastqTool().execute(inBam, infq[0], infq[1])

# Trimmomatic
trimfq = list(map(util.file.mkstempfname, ['.trim.1.fastq', '.trim.2.fastq']))
taxon_filter.trimmomatic(infq[0], infq[1], trimfq[0], trimfq[1], clipDb)
os.unlink(infq[0])
os.unlink(infq[1])

# Prinseq
rmdupfq = list(map(util.file.mkstempfname, ['.rmdup.1.fastq', '.rmdup.2.fastq']))
read_utils.rmdup_prinseq_fastq(trimfq[0], rmdupfq[0])
read_utils.rmdup_prinseq_fastq(trimfq[1], rmdupfq[1])
os.unlink(trimfq[0])
os.unlink(trimfq[1])


# Purge unmated
purgefq = list(map(util.file.mkstempfname, ['.fix.1.fastq', '.fix.2.fastq']))
read_utils.purge_unmated(rmdupfq[0], rmdupfq[1], purgefq[0], purgefq[1])
os.unlink(rmdupfq[0])
os.unlink(rmdupfq[1])

# Subsample
subsampfq = list(map(util.file.mkstempfname, ['.subsamp.1.fastq', '.subsamp.2.fastq']))
cmd = [os.path.join(util.file.get_scripts_path(), 'subsampler.py'),
'-n', str(n_reads),
Expand All @@ -47,21 +55,40 @@ def bam_to_trim_rmdup_subsamp_fastqs(inBam, clipDb, n_reads=100000):
subprocess.check_call(cmd)
os.unlink(purgefq[0])
os.unlink(purgefq[1])
return subsampfq

# Fastq -> BAM
tmp_bam = util.file.mkstempfname('.subsamp.bam')
tmp_header = util.file.mkstempfname('.header.sam')
tools.picard.FastqToSamTool().execute(
subsampfq[0], subsampfq[1], 'Dummy', tmp_bam)
tools.samtools.SamtoolsTool().dumpHeader(inBam, tmp_header)
tools.samtools.SamtoolsTool().reheader(tmp_bam, tmp_header, outBam)
os.unlink(tmp_bam)
os.unlink(tmp_header)

# Save fastqs if requested
if outFastqs:
shutil.copyfile(subsampfq[0], outFastqs[0])
shutil.copyfile(subsampfq[1], outFastqs[1])
os.unlink(subsampfq[0])
os.unlink(subsampfq[1])


def assemble_trinity(inBam, outFasta, clipDb, n_reads=100000, outFq=None):
def assemble_trinity(inBam, outFasta, clipDb, n_reads=100000, outReads=None):
''' This step runs the Trinity assembler.
First trim reads with trimmomatic, rmdup with prinseq,
and random subsample to no more than 100k reads.
'''
subsampfq = bam_to_trim_rmdup_subsamp_fastqs(inBam, clipDb, n_reads)
if outReads:
subsamp_bam = outReads
else:
subsamp_bam = util.file.mkstempfname('.subsamp.bam')
subsampfq = list(map(util.file.mkstempfname, ['.subsamp.1.fastq', '.subsamp.2.fastq']))
trim_rmdup_subsamp_reads(inBam, clipDb, subsamp_bam,
n_reads=n_reads, outFastqs=subsampfq)
tools.trinity.TrinityTool().execute(subsampfq[0], subsampfq[1], outFasta)
if outFq:
if len(outFq)!=2:
raise ValueError("outFq must have exactly two values")
shutil.copyfile(subsampfq[0], outFq[0])
shutil.copyfile(subsampfq[1], outFq[1])
if not outReads:
os.unlink(subsamp_bam)
os.unlink(subsampfq[0])
os.unlink(subsampfq[1])

Expand All @@ -74,8 +101,8 @@ def parser_assemble_trinity(parser=argparse.ArgumentParser()):
help='Output assembly.')
parser.add_argument('--n_reads', default=100000, type=int,
help='Subsample reads to no more than this many pairs. (default %(default)s)')
parser.add_argument('--outFq', default=None, nargs=2,
help='Save the trimmomatic/prinseq/subsamp reads to a pair of output fastq files')
parser.add_argument('--outReads', default=None,
help='Save the trimmomatic/prinseq/subsamp reads to a BAM file')
util.cmd.common_args(parser, (('loglevel',None), ('version',None), ('tmpDir',None)))
util.cmd.attach_main(parser, assemble_trinity, split_args=True)
return parser
Expand Down Expand Up @@ -103,13 +130,12 @@ def order_and_orient(inFasta, inReference, outFasta, inReads=None):
'-musclepath', musclepath,
'-samtoolspath', tools.samtools.SamtoolsTool().install_and_get_path()]
if inReads:
if len(inReads)!=2:
raise ValueError("inReads must have exactly two values")
infq = list(map(util.file.mkstempfname, ['.in.1.fastq', '.in.2.fastq']))
tools.picard.SamToFastqTool().execute(inReads, infq[0], infq[1])
mosaik = tools.mosaik.MosaikTool()
mosaikDir = os.path.dirname(mosaik.install_and_get_path())
cmd = cmd + [
'-readfq', inReads[0], '-readfq2', inReads[1],
'-mosaikpath', mosaikDir,
'-readfq', infq[0], '-readfq2', infq[1],
'-mosaikpath', os.path.dirname(mosaik.install_and_get_path()),
'-mosaiknetworkpath', mosaik.get_networkFile(),
]
subprocess.check_call(cmd)
Expand All @@ -131,7 +157,8 @@ def parser_order_and_orient(parser=argparse.ArgumentParser()):
help='Reference genome, FASTA format.')
parser.add_argument('outFasta',
help='Output assembly, FASTA format.')
parser.add_argument('--inReads', default=None, nargs=2, help='Input reads in FASTQ format.')
parser.add_argument('--inReads', default=None,
help='Input reads in BAM format.')
util.cmd.common_args(parser, (('loglevel',None), ('version',None), ('tmpDir',None)))
util.cmd.attach_main(parser, order_and_orient, split_args=True)
return parser
Expand Down
4 changes: 2 additions & 2 deletions docs/assembly.rst
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
Assembly commands
=====================================
assembly.py - *de novo* assembly
================================

.. argparse::
:module: assembly
Expand Down
2 changes: 1 addition & 1 deletion docs/broad_utils.rst
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
Processing Broad Institute Sequencing outputs
broad_utils.py - for data generated at the Broad Institute
=====================================

.. argparse::
Expand Down
14 changes: 14 additions & 0 deletions docs/cmdline.rst
Original file line number Diff line number Diff line change
@@ -0,0 +1,14 @@
Command line tools
==================

.. toctree::

taxon_filter
assembly
interhost
intrahost
read_utils
reports
broad_utils


2 changes: 1 addition & 1 deletion docs/conf.py
Original file line number Diff line number Diff line change
Expand Up @@ -254,7 +254,7 @@
# dir menu entry, description, category)
texinfo_documents = [
('index', 'viral-ngs', 'viral-ngs Documentation',
'Broad Institute Viral Genomics', 'viral-ngs', 'One line description of project.',
'Broad Institute Viral Genomics', 'viral-ngs', 'Viral genomics analysis pipelines',
'Miscellaneous'),
]

Expand Down
22 changes: 10 additions & 12 deletions docs/index.rst
Original file line number Diff line number Diff line change
Expand Up @@ -3,19 +3,17 @@
You can adapt this file completely to your liking, but it should at least
contain the root `toctree` directive.
Welcome to viral-ngs's documentation!
=====================================
viral-ngs: genomic analysis pipelines for viral sequencing
==========================================================

Contents:

Contents
--------

.. toctree::
:maxdepth: 2

taxon_filter
assembly
interhost
intrahost
read_utils
reports
broad_utils

:numbered:

install
cmdline
pipeuse
72 changes: 72 additions & 0 deletions docs/install.rst
Original file line number Diff line number Diff line change
@@ -0,0 +1,72 @@
Installation
============


System dependencies
-------------------

This is known to install cleanly on most modern Linux systems with Python,
Java, and some basic development libraries. On Ubuntu 14.04 LTS, the
following APT packages should be installed on top of the vanilla setup::

python3 python3-pip python3-nose
python-software-properties
zlib zlib1g zlib1g-dev
libblas3gf libblas-dev liblapack3gf liblapack-dev
libatlas-dev libatlas3-base libatlas3gf-base libatlas-base-dev
gfortran
oracle-java8-installer
libncurses5-dev

The Fortran libraries (including blas and atlas) are required to install
numpy via pip from source. numpy is not actually required if you have
Python 3.4, if you want to avoid this system dependency.

**Java >= 1.7** is required by GATK and Picard.


Python dependencies
-------------------

The **command line tools** require Python >= 2.7 or >= 3.4. Required packages
(like pysam and Biopython) are listed in requirements.txt and can be
installed the usual pip way::

pip install -r requirements.txt

Additionally, in order to use the **pipeline infrastructure**, Python 3.4
is required (Python 2 is not supported) and you must install snakemake
as well::

pip install snakemake==3.2 yappi=0.94

You should either sudo pip install or use a virtualenv (recommended).


Tool dependencies
-----------------

A lot of effort has gone into writing auto download/compile wrappers for
most of the bioinformatic tools we rely on here. They will auto-download
and install the first time they are needed by any command. If you want
to pre-install all of the external tools, simply type this::

python -m unittest test.test_tools.TestToolsInstallation -v

However, there are two tools in particular that cannot be auto-installed
due to licensing restrictions. You will need to download and install
these tools on your own (paying for it if your use case requires it) and
set environment variables pointing to their installed location.

* GATK - http://www.broadinstitute.org/gatk/
* Novoalign - http://www.novocraft.com/products/novoalign/

The environment variables you will need to set are GATK_PATH and
NOVOALIGN_PATH. These should be set to the full directory path
that contains these tools (the jar file for GATK and the executable
binaries for Novoalign).

Alternatively, if you are using the Snakemake pipelines, you can create
a dictionary called "env_vars" in the config.json file for Snakemake,
and the pipelines will automatically set all environment variables prior
to running any scripts.
2 changes: 1 addition & 1 deletion docs/interhost.rst
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
Interhost genetic variation
interhost.py - species and population-level genetic variation
=====================================

.. argparse::
Expand Down
4 changes: 2 additions & 2 deletions docs/intrahost.rst
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
Intrahost genetic variation (iSNVs)
=====================================
intrahost.py - within-host genetic variation (iSNVs)
====================================================

.. argparse::
:module: intrahost
Expand Down
29 changes: 29 additions & 0 deletions docs/pipeuse.rst
Original file line number Diff line number Diff line change
@@ -0,0 +1,29 @@
Using the Snakemake pipelines
=============================

Much more documentation to come...

This utilizes Snakemake, which is documented at
https://bitbucket.org/johanneskoester/snakemake/wiki/Home

Note that Python 3.4 is required to use these tools with Snakemake.


Setting up an analysis directory
--------------------------------


Configuring for your compute platform
-------------------------------------



Assembly of pre-filtered reads
------------------------------

Taxonomic filtration of raw reads
---------------------------------

Starting from Illumina BCL directories
--------------------------------------

2 changes: 1 addition & 1 deletion docs/read_utils.rst
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
Read utilities
read_utils.py - utilities that manipulate bam and fastq files
=====================================

.. argparse::
Expand Down
2 changes: 1 addition & 1 deletion docs/reports.rst
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
Reports
reports.py - produce various metrics and reports
=====================================

.. argparse::
Expand Down
2 changes: 1 addition & 1 deletion docs/taxon_filter.rst
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
Taxonomic filtration tools
taxon_filter.py - tools for taxonomic removal or filtration of reads
=====================================

.. argparse::
Expand Down
11 changes: 1 addition & 10 deletions pipes/ref_assisted/Snakefile
Original file line number Diff line number Diff line change
Expand Up @@ -12,16 +12,7 @@
Then type "ref_assisted" and wait a few hours for aligned BAMs, VCFs, and
FASTAs.
This is designed for use on a single linux computer (e.g. Ubuntu 14.04 LTS) with:
apt-get install:
python3 python3-pip python-software-properties
zlib zlib1g zlib1g-dev
libblas3gf libblas-dev liblapack3gf liblapack-dev
libatlas-dev libatlas3-base libatlas3gf-base libatlas-base-dev
gfortran git oracle-java8-installer
libncurses5-dev python3-nose
pip3 install -r requirements.txt
pip3 install snakemake==3.2 yappi==0.94
This is designed for use on a single linux computer.
"""

__author__ = 'Daniel Park <[email protected]>'
Expand Down
Loading

0 comments on commit e1902ce

Please sign in to comment.