Merge pull request #62 from broadinstitute/dp-nigeria
Nigeria preparations: add reference-guided assembly pipelines for short-term use in Spring 2015 at run.edu.ng and ucad.sn.
dpark01 committed Jan 11, 2015
2 parents 3cbc0a2 + 49c580d commit 4a0d7d2
Showing 8 changed files with 221 additions and 46 deletions.
41 changes: 28 additions & 13 deletions assembly.py
@@ -96,7 +96,7 @@ def align_and_orient_vfat(inFasta, inReference, outFasta, minLength, minUnambig,

def refine_assembly(inFasta, inBam, outFasta,
        outVcf=None, outBam=None, novo_params='', min_coverage=2,
-        JVMmemory=None):
+        contig_names=[], keep_all_reads=False, JVMmemory=None):
    ''' This is a refinement step where we take a crude assembly, align
        all reads back to it, and modify the assembly to the majority
        allele at each position based on read pileups.
@@ -122,11 +122,15 @@ def refine_assembly(inFasta, inBam, outFasta,

    # Novoalign reads to self
    novoBam = util.file.mkstempfname('.novoalign.bam')
+    min_qual = 0 if keep_all_reads else 1
    novoalign.execute(inBam, inFasta, novoBam,
-        options=novo_params.split(), min_qual=1, JVMmemory=JVMmemory)
+        options=novo_params.split(), min_qual=min_qual, JVMmemory=JVMmemory)
    rmdupBam = util.file.mkstempfname('.rmdup.bam')
+    opts = ['CREATE_INDEX=true']
+    if not keep_all_reads:
+        opts.append('REMOVE_DUPLICATES=true')
    picard_mkdup.execute([novoBam], rmdupBam,
-        picardOptions=['REMOVE_DUPLICATES=true', 'CREATE_INDEX=true'], JVMmemory=JVMmemory)
+        picardOptions=opts, JVMmemory=JVMmemory)
    os.unlink(novoBam)
    realignBam = util.file.mkstempfname('.realign.bam')
    gatk.local_realign(rmdupBam, deambigFasta, realignBam, JVMmemory=JVMmemory)
@@ -139,9 +143,12 @@ def refine_assembly(inFasta, inBam, outFasta,
    gatk.ug(realignBam, deambigFasta, tmpVcf, JVMmemory=JVMmemory)
    os.unlink(realignBam)
    os.unlink(deambigFasta)
+    name_opts = []
+    if contig_names:
+        name_opts = ['--name'] + contig_names
    main_vcf_to_fasta(parser_vcf_to_fasta().parse_args([
        tmpVcf, outFasta, '--trim_ends', '--min_coverage', str(min_coverage),
-        ]))
+        ] + name_opts))
    if outVcf:
        shutil.copyfile(tmpVcf, outVcf)
        if outVcf.endswith('.gz'):
@@ -168,12 +175,18 @@ def parser_refine_assembly():
    parser.add_argument('--outVcf',
        default=None,
        help='GATK genotype calls for genome in inFasta coordinate space.')
-    parser.add_argument('--novo_params',
-        default='-r Random -l 40 -g 40 -x 20 -t 100',
-        help='Alignment parameters for Novoalign.')
    parser.add_argument('--min_coverage',
        default=3, type=int,
        help='Minimum read coverage required to call a position unambiguous.')
+    parser.add_argument('--novo_params',
+        default='-r Random -l 40 -g 40 -x 20 -t 100',
+        help='Alignment parameters for Novoalign.')
+    parser.add_argument("--chr_names", dest="chr_names", nargs="*",
+        help="Rename all output chromosomes (default: retain original chromosome names)",
+        default=[])
+    parser.add_argument("--keep_all_reads",
+        help="""Retain all reads in BAM file? Default is to remove unaligned and duplicate reads.""",
+        default=False, action="store_true", dest="keep_all_reads")
    parser.add_argument('--JVMmemory',
        default=tools.gatk.GATKTool.jvmMemDefault,
        help='JVM virtual memory size (default: %(default)s)')
@@ -182,7 +195,8 @@ def parser_refine_assembly():

def main_refine_assembly(args):
    refine_assembly(args.inFasta, args.inBam, args.outFasta,
        args.outVcf, args.outBam, args.novo_params, args.min_coverage,
-        JVMmemory=args.JVMmemory)
+        contig_names=args.chr_names,
+        keep_all_reads=args.keep_all_reads, JVMmemory=args.JVMmemory)
    return 0
__commands__.append(('refine_assembly', main_refine_assembly, parser_refine_assembly))

@@ -575,9 +589,9 @@ def parser_vcf_to_fasta():
        total read count. This filter will not apply to any sites unless both DP values
        are reported. [default: %(default)s]""",
        default=0.0)
-    parser.add_argument("--name", dest="name",
-        help="output sequence name (default: reference name in VCF file)",
-        default=None)
+    parser.add_argument("--name", dest="name", nargs="*",
+        help="output sequence names (default: reference names in VCF file)",
+        default=[])
    util.cmd.common_args(parser, (('loglevel',None), ('version',None)))
    return parser

def main_vcf_to_fasta(args):
@@ -588,13 +602,15 @@ def main_vcf_to_fasta(args):
        chrlens = dict(vcf.chrlens())
        samples = vcf.samples()
    with open(args.outFasta, 'wt') as outf:
+        chr_idx = 0
        for header, seq in vcf_to_seqs(util.file.read_tabfile(args.inVcf),
            chrlens, samples, min_dp=args.min_dp, major_cutoff=args.major_cutoff,
            min_dp_ratio=args.min_dp_ratio):
            if args.trim_ends:
                seq = seq.strip('Nn')
-            if args.name!=None:
-                header = args.name
+            if args.name:
+                header = args.name[chr_idx % len(args.name)]
            for line in util.file.fastaMaker([(header, seq)]):
                outf.write(line)
+            chr_idx += 1  # advance so each chromosome takes the next supplied name

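The heart of this change is the refinement step described in refine_assembly's docstring: align all reads back to a draft assembly, then take the majority allele at each position from the read pileups. The pipeline delegates the real work to Novoalign, Picard, and GATK, but the core idea is small enough to sketch in plain Python. The function and pileup data below are invented for illustration only; positions covered by fewer than min_coverage reads become N, mirroring the --min_coverage behavior:

    from collections import Counter

    def majority_allele_consensus(pileups, min_coverage=3):
        # pileups: one list per reference position, holding the base
        # observed in each read covering that position.
        consensus = []
        for bases in pileups:
            if len(bases) < min_coverage:
                consensus.append('N')  # too little evidence: leave ambiguous
            else:
                base, _count = Counter(bases).most_common(1)[0]
                consensus.append(base)  # majority allele wins
        return ''.join(consensus)

    print(majority_allele_consensus([
        ['A', 'A', 'A', 'C'],  # A wins 3:1
        ['G', 'G', 'G'],       # unanimous
        ['T', 'C'],            # below min_coverage -> N
        ['C', 'C', 'T', 'C'],  # C wins 3:1
    ]))                        # prints 'AGNC'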
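Taken together, the new flags compose like this hypothetical invocation (filenames invented; the parse_args pattern is the same one refine_assembly itself uses internally to drive vcf_to_fasta, so this assumes assembly.py's namespace):

    main_refine_assembly(parser_refine_assembly().parse_args([
        'ref_genome.fasta', 'sample.raw.bam', 'sample.fasta',
        '--outBam', 'sample.realigned.bam',  # realigned reads
        '--outVcf', 'sample.vcf.gz',         # GATK genotype calls
        '--min_coverage', '3',
        '--chr_names', 'sample1',            # rename the output chromosome
        '--keep_all_reads',                  # min_qual=0, keep duplicates
    ]))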
15 changes: 8 additions & 7 deletions pipes/Snakefile
@@ -9,18 +9,19 @@

__author__ = 'Daniel Park <[email protected]>'

+import os.path

configfile: "config.json"
+pipesDir = os.path.join(os.path.expanduser(config['binDir']), 'pipes', 'rules')

-include: config["binDir"]+"/pipes/rules/common.rules"
-
+include: os.path.join(pipesDir, 'common.rules')
set_env_vars()

-include: config["binDir"]+"/pipes/rules/demux.rules"
-include: config["binDir"]+"/pipes/rules/hs_deplete.rules"
-include: config["binDir"]+"/pipes/rules/assembly.rules"
-include: config["binDir"]+"/pipes/rules/interhost.rules"
-include: config["binDir"]+"/pipes/rules/reports.rules"
+include: os.path.join(pipesDir, 'demux.rules')
+include: os.path.join(pipesDir, 'hs_deplete.rules')
+include: os.path.join(pipesDir, 'assembly.rules')
+include: os.path.join(pipesDir, 'interhost.rules')
+include: os.path.join(pipesDir, 'reports.rules')

rule all:
    input:
2 changes: 2 additions & 0 deletions pipes/config.json
@@ -4,6 +4,8 @@
    "samples_assembly": "samples-assembly.txt",
    "samples_per_run": "samples-runs.txt",

+    "seq_center": "BI",
+
    "bmTaggerDbDir": "/idi/sabeti-scratch/kandersen/references/bmtagger",
    "bmTaggerDbs_remove": [
        "hg19",
128 changes: 128 additions & 0 deletions pipes/ref_assisted/Snakefile
@@ -0,0 +1,128 @@
"""
This performs reference-assisted assembly of viral genomes, starting
from fastqs produced directly from MiSeq machines.

Make copies of this Snakefile and config.json to your analysis directory and
customize as needed.

This operates on the SampleSheet.csv file that is used as input to the MiSeq
machine and all of the fastq files that are emitted by that machine. Put
the SampleSheet.csv in the project directory and put the fastq files in a
subdirectory called "data". Copy the config.json to the project directory.
Then type "ref_assisted" and wait a few hours for aligned BAMs, VCFs, and
FASTAs.

This is designed for use on a single linux computer (e.g. Ubuntu 14.04 LTS) with:
    apt-get install:
        python3 python3-pip python-software-properties
        zlib zlib1g zlib1g-dev
        libblas3gf libblas-dev liblapack3gf liblapack-dev
        libatlas-dev libatlas3-base libatlas3gf-base libatlas-base-dev
        gfortran git oracle-java8-installer
        libncurses5-dev python3-nose
    pip3 install -r requirements.txt
    pip3 install snakemake==3.2
"""

__author__ = 'Daniel Park <[email protected]>'

import os, os.path, time, hashlib, base64

configfile: "config.json"

for k,v in config.get('env_vars', {}).items():
    os.environ[k] = v

def get_sample_list(fname):
    with open(fname, 'rt') as inf:
        header = None
        for line in inf:
            if line.startswith('Sample_ID'):
                header = line.strip().split(',')
            elif header and line:
                yield line.strip().split(',')[0]

def get_sample_info(sample, fname):
    with open(fname, 'rt') as inf:
        header = None
        n = 0
        for line in inf:
            if line.startswith('Sample_ID'):
                header = line.strip().split(',')
            elif header and line:
                n += 1
                out = dict(zip(header, line.strip().split(',')))
                out['n'] = n
                if out['Sample_ID']==sample:
                    return out

def get_file_date(fname):
    return time.strftime("%Y-%m-%d", time.localtime(os.path.getmtime(fname)))

def make_run_hash(fname, length=6):
    if 'flowcell' in config:
        return config['flowcell']
    with open(fname, 'rt') as inf:
        csv = ''.join(inf.readlines())
    hash_obj = hashlib.sha1(csv.encode('utf-8'))
    b32_str = base64.b32encode(bytes(hash_obj.digest())).decode('utf-8')
    return b32_str[:length]

##############################################

rule all:
    input:
        expand("{dataDir}/{sample}.fasta",
            dataDir=config["dataDir"],
            sample=get_sample_list(config["samples"]))

##############################################

def get_fastq_filenames(wildcards):
    info = get_sample_info(wildcards.sample, config["samples"])
    suffix = config.get('fastqs_gzipped', False) and ".gz" or ""
    return [
        os.path.join(wildcards.dir,
            '{sample}_S{idx}_L001_R{dir}_001.fastq{suffix}'.format(
                sample=wildcards.sample, idx=info['n'], dir=direction,
                suffix=suffix))
        for direction in ('1','2')]

rule bams_from_fastq:
    input:  get_fastq_filenames
    output: '{dir}/{sample}.raw.bam'
    resources: mem=4
    params: logid="{sample}",
            center=config["seq_center"]
    run:
        run_date = get_file_date(input[0])
        info = get_sample_info(wildcards.sample, config["samples"])
        hash = make_run_hash(config["samples"])
        shell("{config[binDir]}/read_utils.py fastq_to_bam {input} {output} " \
            + "--sampleName {wildcards.sample} --picardOptions " \
            + "SEQUENCING_CENTER={params.center} " \
            + "RUN_DATE={run_date} " \
            + "PLATFORM_UNIT={hash}.1.{info[index]}-{info[index2]} " \
            + "READ_GROUP_NAME={hash} " \
            + "PLATFORM=illumina SORT_ORDER=queryname")

##############################################

rule ref_guided_consensus:
    input:  '{dir}/{sample}.raw.bam'
    output: '{dir}/{sample}.realigned.bam',
            '{dir}/{sample}.vcf.gz',
            '{dir}/{sample}.fasta'
    resources: mem=4
    params: LSF='-W 4:00',
            logid="{sample}",
            refGenome=os.path.expanduser(config["ref_genome"]),
            novoalign_options=config["refine_options"]["novoalign"],
            min_coverage=config["refine_options"]["min_coverage"]
    shell:  "{config[binDir]}/assembly.py refine_assembly " \
            + "{params.refGenome} {input} {output[2]} " \
            + "--outBam {output[0]} " \
            + "--outVcf {output[1]} " \
            + "--keep_all_reads " \
            + "--chr_names {wildcards.sample} " \
            + "--min_coverage {params.min_coverage} " \
            + "--novo_params '{params.novoalign_options}' "
20 changes: 20 additions & 0 deletions pipes/ref_assisted/config.json
@@ -0,0 +1,20 @@
{
    "samples": "SampleSheet.csv",

    "refine_options": {
        "novoalign": "-r Random -l 30 -g 40 -x 20 -t 502",
        "min_coverage": "3"
    },

    "seq_center": "ACEGID_RUN",
    "fastqs_gzipped": false,
    "n_cores": 4,
    "max_ram": 16,
    "ref_genome": "~/resources/ref_genome/genome.fasta",

    "dataDir": "data",
    "logDir": "log",
    "binDir": "~/viral-ngs",
    "venvDir": "~/venv",
    "project": "viral_ngs"
}
12 changes: 12 additions & 0 deletions pipes/ref_assisted/ref_assisted
@@ -0,0 +1,12 @@
#!/bin/bash

# load config dirs from config.json
BIN_DIR=`python -c 'import json,os.path;f=open("config.json");print(os.path.expanduser(json.load(f)["binDir"]));f.close()'`
N_CORES=`python -c 'import json;f=open("config.json");print(json.load(f)["n_cores"]);f.close()'`
MAX_RAM=`python -c 'import json;f=open("config.json");print(json.load(f)["max_ram"]);f.close()'`

# execute snakemake on this machine with specified resources
snakemake --timestamp --rerun-incomplete --keep-going \
    --jobs $N_CORES --resources mem=$MAX_RAM \
    -s $BIN_DIR/pipes/ref_assisted/Snakefile \
    "$@"
15 changes: 14 additions & 1 deletion pipes/rules/demux.rules
@@ -118,14 +118,15 @@ rule illumina_basecalls:
    resources: mem=64
    params: LSF='-q flower',
            logid="{flowcell}.{lane}",
+            center=config['seq_center'],
            outdir=config['tmpDir']+'/'+config['subdirs']['demux']+'/bams_per_lane/{flowcell}.{lane}'
    run:
        shutil.rmtree(params.outdir, ignore_errors=True)
        makedirs(params.outdir)
        lane = get_one_lane_from_run(wildcards.flowcell, wildcards.lane, config['seqruns_demux'])
        dir = lane['bustard_dir']
        run_date = lane.get('seq_run_date')
-        shell("{config[binDir]}/broad_utils.py illumina_basecalls {dir} {input[1]} {wildcards.flowcell} {wildcards.lane} {input[0]} --include_non_pf_reads=false --run_start_date={run_date}")
+        shell("{config[binDir]}/broad_utils.py illumina_basecalls {dir} {input[1]} {wildcards.flowcell} {wildcards.lane} {input[0]} --include_non_pf_reads=false --run_start_date={run_date} --sequencing_center={params.center}")

def demux_move_bams_inputs(wildcards):
    lane = get_one_lane_from_run(wildcards.flowcell, wildcards.lane, config.get('seqruns_demux',''))
@@ -141,3 +142,15 @@ rule move_bams_demux:
        makedirs(os.path.join(config['dataDir'], config['subdirs']['source']))
        shutil.move(input[0], output[0])

+rule bams_from_fastq:
+    input:  os.path.join(config['dataDir'],config['subdirs']['source'],'{sample}_R1_{idx}.fastq'),
+            os.path.join(config['dataDir'],config['subdirs']['source'],'{sample}_R2_{idx}.fastq')
+    output: os.path.join(config['dataDir'],config['subdirs']['source'],'{sample}_{idx}.bam')
+    params: logid="{sample}_{idx}",
+            center=config["seq_center"]
+    run:
+        makedirs(os.path.join(config['dataDir'], config['subdirs']['source']))
+        shell("{config[binDir]}/read_utils.py fastq_to_bam {input} {output} --sampleName {wildcards.sample} --picardOptions PLATFORM=illumina SEQUENCING_CENTER={params.center} LIBRARY_NAME={wildcards.sample}_{wildcards.idx} SORT_ORDER=queryname")
+
+ruleorder: move_bams_demux > bams_from_fastq

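For reference, the new bams_from_fastq rule pairs reads by sample and index: a target such as EBOV-01_001.bam (a hypothetical sample) pulls in the matching R1/R2 fastqs. A rough sketch of that wildcard matching in plain Python (Snakemake's own resolver is more general):

    import re

    target = 'EBOV-01_001.bam'
    m = re.fullmatch(r'(?P<sample>.+)_(?P<idx>[^_]+)\.bam', target)
    inputs = ['{}_R{}_{}.fastq'.format(m.group('sample'), r, m.group('idx'))
              for r in (1, 2)]
    print(inputs)  # ['EBOV-01_R1_001.fastq', 'EBOV-01_R2_001.fastq']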
34 changes: 9 additions & 25 deletions pipes/rules/interhost.rules
@@ -27,38 +27,22 @@ rule all_ref_guided:
            sample=read_samples_file(config["samples_per_run"])),
        config["reportsDir"]+'/summary.coverage_ref.txt.gz'

-rule map_reads_to_ref:
+rule ref_guided_consensus:
    input:  config["dataDir"]+'/'+config["subdirs"]["depletion"]+'/{sample}.raw.bam'
-    output: config["dataDir"]+'/'+config["subdirs"]["align_ref"]+'/{sample}.bam',
-            config["dataDir"]+'/'+config["subdirs"]["align_ref"]+'/{sample}.mapped.bam',
-            config["dataDir"]+'/'+config["subdirs"]["align_ref"]+'/{sample}.rmdup.bam',
-            config["dataDir"]+'/'+config["subdirs"]["align_ref"]+'/{sample}.realigned.bam'
-    resources: mem=8
+    output: config["dataDir"]+'/'+config["subdirs"]["align_ref"]+'/{sample}.realigned.bam',
+            config["dataDir"]+'/'+config["subdirs"]["align_ref"]+'/{sample}.vcf.gz',
+            config["dataDir"]+'/'+config["subdirs"]["align_ref"]+'/{sample}.fasta'
+    resources: mem=4
    params: LSF='-W 4:00',
            logid="{sample}",
            refGenome=config["ref_genome"],
-            novoalign_options="-r Random -l 30 -g 40 -x 20 -t 502"
+            novoalign_options="-r Random -l 30 -g 40 -x 20 -t 502",
+            min_coverage="3"
    run:
        makedirs(expand("{dir}/{subdir}",
-            dir=[config["dataDir"], config["tmpDir"]],
+            dir=[config["dataDir"]],
            subdir=[config["subdirs"]["align_ref"], config["subdirs"]["assembly"]]))
-        shell("{config[binDir]}/read_utils.py novoalign {input[0]} {params.refGenome} {output[0]} --options '{params.novoalign_options}'")
-        shell("{config[binDir]}/read_utils.py filter_bam_mapped_only {output[0]} {output[1]}")
-        shell("{config[binDir]}/read_utils.py mkdup_picard {output[1]} {output[2]} --remove --picardOptions CREATE_INDEX=true")
-        shell("{config[binDir]}/read_utils.py gatk_realign {output[2]} {params.refGenome} {output[3]}")
-
-rule ref_guided_consensus:
-    input:  config["dataDir"]+'/'+config["subdirs"]["align_ref"]+'/{sample}.realigned.bam'
-    output: config["dataDir"]+'/'+config["subdirs"]["align_ref"]+'/{sample}.vcf.gz',
-            config["dataDir"]+'/'+config["subdirs"]["align_ref"]+'/{sample}.fasta'
-    resources: mem=8
-    params: LSF='-W 4:00',
-            logid="{sample}",
-            refGenome=config["ref_genome"]
-    run:
-        update_timestamps(input)
-        gatk_ug(input[0], params.refGenome, output[0])
-        shell("{config[binDir]}/assembly.py vcf_to_fasta {output[0]} {output[1]} --min_coverage 2 --name {wildcards.sample}")
+        shell("{config[binDir]}/assembly.py refine_assembly {params.refGenome} {input} {output[2]} --outBam {output[0]} --outVcf {output[1]} --min_coverage {params.min_coverage} --novo_params '{params.novoalign_options}' --keep_all_reads --chr_names {wildcards.sample}")

rule ref_guided_diversity:
    input:
