From bff059953bd4173477b9d60407582973505fdc7a Mon Sep 17 00:00:00 2001
From: Danny Park <dpark@broadinstitute.org>
Date: Sat, 10 Jan 2015 15:49:25 -0500
Subject: [PATCH 1/6] start some work on ref_assisted assembly

---
 assembly.py                     | 21 ++++++++++-----
 pipes/Snakefile                 | 15 ++++++-----
 pipes/config.json               |  2 ++
 pipes/ref_assisted/Snakefile    | 46 +++++++++++++++++++++++++++++++++
 pipes/ref_assisted/config.json  |  9 +++++++
 pipes/ref_assisted/ref_assisted |  5 ++++
 pipes/rules/demux.rules         | 15 ++++++++++-
 pipes/rules/interhost.rules     | 36 +++++++-------------------
 8 files changed, 108 insertions(+), 41 deletions(-)
 create mode 100644 pipes/ref_assisted/Snakefile
 create mode 100644 pipes/ref_assisted/config.json
 create mode 100755 pipes/ref_assisted/ref_assisted

diff --git a/assembly.py b/assembly.py
index c7ac5c418..2c0762149 100755
--- a/assembly.py
+++ b/assembly.py
@@ -96,7 +96,7 @@ def align_and_orient_vfat(inFasta, inReference, outFasta, minLength, minUnambig,
 
 def refine_assembly(inFasta, inBam, outFasta,
         outVcf=None, outBam=None, novo_params='', min_coverage=2,
-        JVMmemory=None):
+        keep_all_reads=False, JVMmemory=None):
     ''' This a refinement step where we take a crude assembly, align
         all reads back to it, and modify the assembly to the majority
         allele at each position based on read pileups.
@@ -122,11 +122,15 @@ def refine_assembly(inFasta, inBam, outFasta,
     
     # Novoalign reads to self
     novoBam = util.file.mkstempfname('.novoalign.bam')
+    min_qual = 0 if keep_all_reads else 1
     novoalign.execute(inBam, inFasta, novoBam,
-        options=novo_params.split(), min_qual=1, JVMmemory=JVMmemory)
+        options=novo_params.split(), min_qual=min_qual, JVMmemory=JVMmemory)
     rmdupBam = util.file.mkstempfname('.rmdup.bam')
+    opts = ['CREATE_INDEX=true']
+    if not keep_all_reads:
+        opts.append('REMOVE_DUPLICATES=true')
     picard_mkdup.execute([novoBam], rmdupBam,
-        picardOptions=['REMOVE_DUPLICATES=true', 'CREATE_INDEX=true'], JVMmemory=JVMmemory)
+        picardOptions=opts, JVMmemory=JVMmemory)
     os.unlink(novoBam)
     realignBam = util.file.mkstempfname('.realign.bam')
     gatk.local_realign(rmdupBam, deambigFasta, realignBam, JVMmemory=JVMmemory)
@@ -168,12 +172,15 @@ def parser_refine_assembly():
     parser.add_argument('--outVcf',
         default=None,
         help='GATK genotype calls for genome in inFasta coordinate space.')
-    parser.add_argument('--novo_params',
-        default='-r Random -l 40 -g 40 -x 20 -t 100',
-        help='Alignment parameters for Novoalign.')
     parser.add_argument('--min_coverage',
         default=3, type=int,
         help='Minimum read coverage required to call a position unambiguous.')
+    parser.add_argument('--novo_params',
+        default='-r Random -l 40 -g 40 -x 20 -t 100',
+        help='Alignment parameters for Novoalign.')
+    parser.add_argument("--keep_all_reads",
+        help="""Retain all reads in BAM file? Default is to remove unaligned and duplicate reads.""",
+        default=False, action="store_true", dest="keep_all_reads")
     parser.add_argument('--JVMmemory',
         default=tools.gatk.GATKTool.jvmMemDefault,
         help='JVM virtual memory size (default: %(default)s)')
@@ -182,7 +189,7 @@ def parser_refine_assembly():
 def main_refine_assembly(args):
     refine_assembly(args.inFasta, args.inBam, args.outFasta,
         args.outVcf, args.outBam, args.novo_params, args.min_coverage,
-        JVMmemory=args.JVMmemory)
+        keep_all_reads=args.keep_all_reads, JVMmemory=args.JVMmemory)
     return 0
 __commands__.append(('refine_assembly', main_refine_assembly, parser_refine_assembly))
 
diff --git a/pipes/Snakefile b/pipes/Snakefile
index d756fab7b..6c198aab5 100644
--- a/pipes/Snakefile
+++ b/pipes/Snakefile
@@ -9,18 +9,19 @@
 
 __author__ = 'Daniel Park <dpark@broadinstitute.org>'
 
+import os.path
 
 configfile: "config.json"
+pipesDir = os.path.join(os.path.expanduser(config['binDir']), 'pipes', 'rules')
 
-include: config["binDir"]+"/pipes/rules/common.rules"
-
+include: os.path.join(pipesDir, 'common.rules')
 set_env_vars()
 
-include: config["binDir"]+"/pipes/rules/demux.rules"
-include: config["binDir"]+"/pipes/rules/hs_deplete.rules"
-include: config["binDir"]+"/pipes/rules/assembly.rules"
-include: config["binDir"]+"/pipes/rules/interhost.rules"
-include: config["binDir"]+"/pipes/rules/reports.rules"
+include: os.path.join(pipesDir, 'demux.rules')
+include: os.path.join(pipesDir, 'hs_deplete.rules')
+include: os.path.join(pipesDir, 'assembly.rules')
+include: os.path.join(pipesDir, 'interhost.rules')
+include: os.path.join(pipesDir, 'reports.rules')
 
 rule all:
     input:
diff --git a/pipes/config.json b/pipes/config.json
index e6d1037a7..9d3d0e637 100644
--- a/pipes/config.json
+++ b/pipes/config.json
@@ -4,6 +4,8 @@
   "samples_assembly":  "samples-assembly.txt",
   "samples_per_run":   "samples-runs.txt",
 
+  "seq_center": "BI",
+
   "bmTaggerDbDir":  "/idi/sabeti-scratch/kandersen/references/bmtagger",
   "bmTaggerDbs_remove": [
     "hg19",
diff --git a/pipes/ref_assisted/Snakefile b/pipes/ref_assisted/Snakefile
new file mode 100644
index 000000000..efb77eb84
--- /dev/null
+++ b/pipes/ref_assisted/Snakefile
@@ -0,0 +1,46 @@
+"""
+    This performs reference-assisted assembly of viral genomes, starting
+    from fastqs produced directly from MiSeq machines.
+    
+    Make copies of this Snakefile and config.json to your analysis directory and
+    customize as needed.
+"""
+
+__author__ = 'Daniel Park <dpark@broadinstitute.org>'
+
+import os.path
+
+configfile: "config.json"
+
+def read_file(fname):
+    with open(fname, 'rt') as inf:
+        for line in inf:
+            yield line.strip()
+
+rule all:
+    input:
+        expand("{dataDir}/{sample}.fasta",
+            dataDir=config["dataDir"],
+            sample=read_file(config["samples"]))
+
+rule bams_from_fastq:
+    input:  os.path.join(config['dataDir'],'{sample}_R1_{idx}.fastq'),
+            os.path.join(config['dataDir'],'{sample}_R2_{idx}.fastq')
+    output: os.path.join(config['dataDir'],'{sample}_{idx}.bam')
+    resources: mem=4
+    params: logid="{sample}_{idx}",
+            center=config["seq_center"]
+    shell:  "read_utils.py fastq_to_bam {input} {output} --sampleName {wildcards.sample} --picardOptions PLATFORM=illumina SEQUENCING_CENTER={params.center} LIBRARY_NAME={wildcards.sample}_{wildcards.idx} SORT_ORDER=queryname"
+
+rule ref_guided_consensus:
+    input:  os.path.join(config['dataDir'],'{sample}.bam')
+    output: os.path.join(config['dataDir'],'{sample}.realigned.bam'),
+            os.path.join(config['dataDir'],'{sample}.vcf.gz'),
+            os.path.join(config['dataDir'],'{sample}.fasta')
+    resources: mem=4
+    params: LSF='-W 4:00',
+            logid="{sample}",
+            refGenome=config["ref_genome"],
+            novoalign_options = "-r Random -l 30 -g 40 -x 20 -t 502",
+            min_coverage = "2"
+    shell:  "assembly.py refine_assembly {params.refGenome} {input} {output[2]} --outBam {output[0]} --outVcf {output[1]} --min_coverage {params.min_coverage} --novo_params '{params.novoalign_options}' --keep_all_reads"
diff --git a/pipes/ref_assisted/config.json b/pipes/ref_assisted/config.json
new file mode 100644
index 000000000..97ef57cbb
--- /dev/null
+++ b/pipes/ref_assisted/config.json
@@ -0,0 +1,9 @@
+{
+  "samples":    "samples.txt",
+  "ref_genome": "~/resources/ref_genome/genome.fasta",
+  "seq_center": "ACEGID_RUN",
+  
+  "dataDir":    "data",
+  "logDir":     "log",
+  "project":    "viral_ngs"
+}
diff --git a/pipes/ref_assisted/ref_assisted b/pipes/ref_assisted/ref_assisted
new file mode 100755
index 000000000..f6791a7f9
--- /dev/null
+++ b/pipes/ref_assisted/ref_assisted
@@ -0,0 +1,5 @@
+#!/bin/bash
+snakemake --rerun-incomplete --keep-going \
+	--jobs 4 --resources mem=16 \
+	-s "$HOME/viral-ngs/pipes/ref_assisted/Snakefile" \
+	"$@"
diff --git a/pipes/rules/demux.rules b/pipes/rules/demux.rules
index cccfe43f1..6d54da11e 100644
--- a/pipes/rules/demux.rules
+++ b/pipes/rules/demux.rules
@@ -118,6 +118,7 @@ rule illumina_basecalls:
     resources: mem=64
     params: LSF='-q flower',
             logid="{flowcell}.{lane}",
+            center=config['seq_center'],
             outdir=config['tmpDir']+'/'+config['subdirs']['demux']+'/bams_per_lane/{flowcell}.{lane}'
     run:
             shutil.rmtree(params.outdir, ignore_errors=True)
@@ -125,7 +126,7 @@ rule illumina_basecalls:
             lane = get_one_lane_from_run(wildcards.flowcell, wildcards.lane, config['seqruns_demux'])
             dir = lane['bustard_dir']
             run_date = lane.get('seq_run_date')
-            shell("{config[binDir]}/broad_utils.py illumina_basecalls {dir} {input[1]} {wildcards.flowcell} {wildcards.lane} {input[0]} --include_non_pf_reads=false --run_start_date={run_date}")
+            shell("{config[binDir]}/broad_utils.py illumina_basecalls {dir} {input[1]} {wildcards.flowcell} {wildcards.lane} {input[0]} --include_non_pf_reads=false --run_start_date={run_date} --sequencing_center={params.center}")
 
 def demux_move_bams_inputs(wildcards):
     lane = get_one_lane_from_run(wildcards.flowcell, wildcards.lane, config.get('seqruns_demux',''))
@@ -141,3 +142,15 @@ rule move_bams_demux:
             makedirs(os.path.join(config['dataDir'], config['subdirs']['source']))
             shutil.move(input[0], output[0])
 
+rule bams_from_fastq:
+    input:  os.path.join(config['dataDir'],config['subdirs']['source'],'{sample}_R1_{idx}.fastq'),
+            os.path.join(config['dataDir'],config['subdirs']['source'],'{sample}_R2_{idx}.fastq')
+    output: os.path.join(config['dataDir'],config['subdirs']['source'],'{sample}_{idx}.bam')
+    params: logid="{sample}_{idx}",
+            center=config["seq_center"]
+    run:
+            makedirs(os.path.join(config['dataDir'], config['subdirs']['source']))
+            shell("{config[binDir]}/read_utils.py fastq_to_bam {input} {output} --sampleName {wildcards.sample} --picardOptions PLATFORM=illumina SEQUENCING_CENTER={params.center} LIBRARY_NAME={wildcards.sample}_{wildcards.idx} SORT_ORDER=queryname")
+
+ruleorder: move_bams_demux > bams_from_fastq
+
diff --git a/pipes/rules/interhost.rules b/pipes/rules/interhost.rules
index 0f7ce11fa..3a336aa80 100644
--- a/pipes/rules/interhost.rules
+++ b/pipes/rules/interhost.rules
@@ -27,38 +27,22 @@ rule all_ref_guided:
                 sample=read_samples_file(config["samples_per_run"])),
             config["reportsDir"]+'/summary.coverage_ref.txt.gz'
 
-rule map_reads_to_ref:
-    input:  config["dataDir"]+'/'+config["subdirs"]["depletion"]+'/{sample}.raw.bam'
-    output: config["dataDir"]+'/'+config["subdirs"]["align_ref"]+'/{sample}.bam',
-            config["dataDir"]+'/'+config["subdirs"]["align_ref"]+'/{sample}.mapped.bam',
-            config["dataDir"]+'/'+config["subdirs"]["align_ref"]+'/{sample}.rmdup.bam',
-            config["dataDir"]+'/'+config["subdirs"]["align_ref"]+'/{sample}.realigned.bam'
-    resources: mem=8
+rule ref_guided_consensus:
+    input:  config["dataDir"]+'/'+config["subdirs"]["source"]+'/{sample}.bam'
+    output: config["dataDir"]+'/'+config["subdirs"]["align_ref"]+'/{sample}.realigned.bam',
+            config["dataDir"]+'/'+config["subdirs"]["align_ref"]+'/{sample}.vcf.gz',
+            config["dataDir"]+'/'+config["subdirs"]["align_ref"]+'/{sample}.fasta'
+    resources: mem=4
     params: LSF='-W 4:00',
             logid="{sample}",
             refGenome=config["ref_genome"],
-            novoalign_options="-r Random -l 30 -g 40 -x 20 -t 502"
+            novoalign_options="-r Random -l 30 -g 40 -x 20 -t 502",
+            min_coverage="2"
     run:
             makedirs(expand("{dir}/{subdir}",
-                dir=[config["dataDir"], config["tmpDir"]],
+                dir=[config["dataDir"]],
                 subdir=[config["subdirs"]["align_ref"], config["subdirs"]["assembly"]]))
-            shell("{config[binDir]}/read_utils.py novoalign {input[0]} {params.refGenome} {output[0]} --options '{params.novoalign_options}'")
-            shell("{config[binDir]}/read_utils.py filter_bam_mapped_only {output[0]} {output[1]}")
-            shell("{config[binDir]}/read_utils.py mkdup_picard {output[1]} {output[2]} --remove --picardOptions CREATE_INDEX=true")
-            shell("{config[binDir]}/read_utils.py gatk_realign {output[2]} {params.refGenome} {output[3]}")
-
-rule ref_guided_consensus:
-    input:  config["dataDir"]+'/'+config["subdirs"]["align_ref"]+'/{sample}.realigned.bam'
-    output: config["dataDir"]+'/'+config["subdirs"]["align_ref"]+'/{sample}.vcf.gz',
-            config["dataDir"]+'/'+config["subdirs"]["align_ref"]+'/{sample}.fasta'
-    resources: mem=8
-    params: LSF='-W 4:00',
-            logid="{sample}",
-            refGenome=config["ref_genome"]
-    run:
-            update_timestamps(input)
-            gatk_ug(input[0], params.refGenome, output[0])
-            shell("{config[binDir]}/assembly.py vcf_to_fasta {output[0]} {output[1]} --min_coverage 2 --name {wildcards.sample}")
+            shell("{config[binDir]}/assembly.py refine_assembly {params.refGenome} {input} {output[2]} --outBam {output[0]} --outVcf {output[1]} --min_coverage {params.min_coverage} --novo_params '{params.novoalign_options}' --keep_all_reads")
 
 rule ref_guided_diversity:
     input:  

From 1865b2acff9bd3ef12a9d641936ae368087527e0 Mon Sep 17 00:00:00 2001
From: Danny Park <dpark@broadinstitute.org>
Date: Sat, 10 Jan 2015 19:38:31 -0500
Subject: [PATCH 2/6] update nigeria run miseq analysis pipeline

---
 pipes/ref_assisted/Snakefile    | 102 +++++++++++++++++++++++++++-----
 pipes/ref_assisted/config.json  |  20 ++++---
 pipes/ref_assisted/ref_assisted |  11 +++-
 3 files changed, 109 insertions(+), 24 deletions(-)

diff --git a/pipes/ref_assisted/Snakefile b/pipes/ref_assisted/Snakefile
index efb77eb84..03fd92cb2 100644
--- a/pipes/ref_assisted/Snakefile
+++ b/pipes/ref_assisted/Snakefile
@@ -4,43 +4,115 @@
     
     Make copies of this Snakefile and config.json to your analysis directory and
     customize as needed.
+    
+    This operates on the SampleSheet.csv file that is used as input to the MiSeq
+    machine and all of the fastq files that are emitted by that machine. Put
+    the SampleSheet.csv in the project directory and put the fastq files in a
+    subdirectory called "data". Copy the config.json to the project directory.
+    Then type "ref_assisted" and wait a few hours for aligned BAMs, VCFs, and
+    FASTAs.
+    
+    This is designed for use on a single linux computer (e.g. Ubuntu 14.04 LTS) with:
+        apt-get install:
+         python3 python3-pip python-software-properties
+         zlib zlib1g zlib1g-dev
+         libblas3gf libblas-dev liblapack3gf liblapack-dev
+         libatlas-dev libatlas3-base libatlas3gf-base libatlas-base-dev
+         gfortran git oracle-java8-installer
+         libncurses5-dev python3-nose
+        pip3 install -r requirements.txt
+        pip3 install snakemake==3.2
 """
 
 __author__ = 'Daniel Park <dpark@broadinstitute.org>'
 
-import os.path
+import os.path, time, hashlib, base64
 
 configfile: "config.json"
 
-def read_file(fname):
+def get_sample_list(fname):
+    with open(fname, 'rt') as inf:
+        header = None
+        for line in inf:
+            if line.startswith('Sample_ID'):
+                header = line.strip().split(',')
+            elif header and line:
+                yield line.strip().split(',')[0]
+
+def get_sample_info(sample, fname):
     with open(fname, 'rt') as inf:
+        header = None
+        n = 0
         for line in inf:
-            yield line.strip()
+            if line.startswith('Sample_ID'):
+                header = line.strip().split(',')
+            elif header and line:
+                n += 1
+                out = dict(zip(header, line.strip().split(',')))
+                out['n'] = n
+                if out['Sample_ID']==sample:
+                    return out
+
+def get_file_date(fname):
+    return time.strftime("%Y-%m-%d", time.localtime(os.path.getmtime(fname)))
+
+def make_run_hash(fname, length=6):
+    if 'flowcell' in config:
+        return config['flowcell']
+    with open(fname, 'rt') as inf:
+        csv = inf.readlines()
+    hash_obj = hashlib.sha1(csv.encode('utf-8'))
+    b32_str = base64.b32encode(bytes(hash_obj.digest())).decode('utf-8')
+    return b32_str[:length]
+
+##############################################
 
 rule all:
     input:
         expand("{dataDir}/{sample}.fasta",
             dataDir=config["dataDir"],
-            sample=read_file(config["samples"]))
+            sample=get_sample_list(config["samples"]))
 
+##############################################
+
+def get_fastq_filenames(wildcards):
+    info = get_sample_info(wildcards.sample, config["samples"])
+    suffix = config.get('fastqs_gzipped', False) and ".gz" or ""
+    return [
+        os.path.join(wildcards.dir,
+            '{sample}_S{idx}_L001_R{dir}_001.fastq{suffix}'.format(
+            sample=wildcards.sample, idx=info['n'], dir=direction,
+            suffix=suffix))
+        for direction in ('1','2')]
 rule bams_from_fastq:
-    input:  os.path.join(config['dataDir'],'{sample}_R1_{idx}.fastq'),
-            os.path.join(config['dataDir'],'{sample}_R2_{idx}.fastq')
-    output: os.path.join(config['dataDir'],'{sample}_{idx}.bam')
+    input:  get_fastq_filenames
+    output: '{dir}/{sample}.raw.bam'
     resources: mem=4
-    params: logid="{sample}_{idx}",
+    params: logid="{sample}",
             center=config["seq_center"]
-    shell:  "read_utils.py fastq_to_bam {input} {output} --sampleName {wildcards.sample} --picardOptions PLATFORM=illumina SEQUENCING_CENTER={params.center} LIBRARY_NAME={wildcards.sample}_{wildcards.idx} SORT_ORDER=queryname"
+    run:
+            run_date = get_file_date(input[0])
+            info = get_sample_info(wildcards.sample, config["samples"])
+            hash = make_run_hash(config["samples"])
+            shell("{config[binDir]}/read_utils.py fastq_to_bam {input} {output} " \
+                + "--sampleName {wildcards.sample} --picardOptions " \
+                + "SEQUENCING_CENTER={params.center} " \
+                + "RUN_DATE={run_date} " \
+                + "PLATFORM_UNIT={hash}.1.{info[index]}-{info[index2]} " \
+                + "READ_GROUP_NAME={hash} " \
+                + "PLATFORM=illumina SORT_ORDER=queryname")
+
+##############################################
 
 rule ref_guided_consensus:
-    input:  os.path.join(config['dataDir'],'{sample}.bam')
-    output: os.path.join(config['dataDir'],'{sample}.realigned.bam'),
-            os.path.join(config['dataDir'],'{sample}.vcf.gz'),
-            os.path.join(config['dataDir'],'{sample}.fasta')
+    input:  '{dir}/{sample}.raw.bam'
+    output: '{dir}/{sample}.realigned.bam',
+            '{dir}/{sample}.vcf.gz',
+            '{dir}/{sample}.fasta'
     resources: mem=4
     params: LSF='-W 4:00',
             logid="{sample}",
-            refGenome=config["ref_genome"],
+            refGenome=os.path.expanduser(config["ref_genome"]),
             novoalign_options = "-r Random -l 30 -g 40 -x 20 -t 502",
             min_coverage = "2"
-    shell:  "assembly.py refine_assembly {params.refGenome} {input} {output[2]} --outBam {output[0]} --outVcf {output[1]} --min_coverage {params.min_coverage} --novo_params '{params.novoalign_options}' --keep_all_reads"
+    shell:  "{config[binDir]}/assembly.py refine_assembly {params.refGenome} {input} {output[2]} --outBam {output[0]} --outVcf {output[1]} --min_coverage {params.min_coverage} --novo_params '{params.novoalign_options}' --keep_all_reads"
diff --git a/pipes/ref_assisted/config.json b/pipes/ref_assisted/config.json
index 97ef57cbb..f3662628c 100644
--- a/pipes/ref_assisted/config.json
+++ b/pipes/ref_assisted/config.json
@@ -1,9 +1,15 @@
 {
-  "samples":    "samples.txt",
-  "ref_genome": "~/resources/ref_genome/genome.fasta",
-  "seq_center": "ACEGID_RUN",
-  
-  "dataDir":    "data",
-  "logDir":     "log",
-  "project":    "viral_ngs"
+  "samples":         "SampleSheet.csv",
+
+  "seq_center":      "ACEGID_RUN",
+  "fastqs_gzipped":  false,
+  "n_cores":         4,
+  "max_ram":         16,
+  "ref_genome":      "~/resources/ref_genome/genome.fasta",
+
+  "dataDir":         "data",
+  "logDir":          "log",
+  "binDir":          "~/viral-ngs",
+  "venvDir":         "~/venv",
+  "project":         "viral_ngs"
 }
diff --git a/pipes/ref_assisted/ref_assisted b/pipes/ref_assisted/ref_assisted
index f6791a7f9..8bbc37518 100755
--- a/pipes/ref_assisted/ref_assisted
+++ b/pipes/ref_assisted/ref_assisted
@@ -1,5 +1,12 @@
 #!/bin/bash
+
+# load config dirs from config.json
+BIN_DIR=`python -c 'import json,os.path;f=open("config.json");print(os.path.expanduser(json.load(f)["binDir"]));f.close()'`
+N_CORES=`python -c 'import json;f=open("config.json");print(json.load(f)["n_cores"]);f.close()'`
+MAX_RAM=`python -c 'import json;f=open("config.json");print(json.load(f)["max_ram"]);f.close()'`
+
+# execute snakemake on this machine with specified resources
 snakemake --rerun-incomplete --keep-going \
-	--jobs 4 --resources mem=16 \
-	-s "$HOME/viral-ngs/pipes/ref_assisted/Snakefile" \
+	--jobs $N_CORES --resources mem=$MAX_RAM \
+	-s $BIN_DIR/pipes/ref_assisted/Snakefile \
 	"$@"

From 78c925d1d4c2c9efe4ebabb5dd7f0b6e566463d1 Mon Sep 17 00:00:00 2001
From: Danny Park <dpark@broadinstitute.org>
Date: Sat, 10 Jan 2015 19:50:12 -0500
Subject: [PATCH 3/6] fix hashing

---
 pipes/ref_assisted/Snakefile | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/pipes/ref_assisted/Snakefile b/pipes/ref_assisted/Snakefile
index 03fd92cb2..d423c8724 100644
--- a/pipes/ref_assisted/Snakefile
+++ b/pipes/ref_assisted/Snakefile
@@ -60,7 +60,7 @@ def make_run_hash(fname, length=6):
     if 'flowcell' in config:
         return config['flowcell']
     with open(fname, 'rt') as inf:
-        csv = inf.readlines()
+        csv = ''.join(inf.readlines())
     hash_obj = hashlib.sha1(csv.encode('utf-8'))
     b32_str = base64.b32encode(bytes(hash_obj.digest())).decode('utf-8')
     return b32_str[:length]

From 5cd42e4b4fecb68edb8debe5842780e783e12f1f Mon Sep 17 00:00:00 2001
From: Danny Park <dpark@broadinstitute.org>
Date: Sat, 10 Jan 2015 20:15:45 -0500
Subject: [PATCH 4/6] add env vars reading fro config json

---
 pipes/ref_assisted/Snakefile    | 5 ++++-
 pipes/ref_assisted/ref_assisted | 2 +-
 2 files changed, 5 insertions(+), 2 deletions(-)

diff --git a/pipes/ref_assisted/Snakefile b/pipes/ref_assisted/Snakefile
index d423c8724..3616f459f 100644
--- a/pipes/ref_assisted/Snakefile
+++ b/pipes/ref_assisted/Snakefile
@@ -26,10 +26,13 @@
 
 __author__ = 'Daniel Park <dpark@broadinstitute.org>'
 
-import os.path, time, hashlib, base64
+import os, os.path, time, hashlib, base64
 
 configfile: "config.json"
 
+for k,v in config.get('env_vars', {}).items():
+    os.environ[k] = v
+
 def get_sample_list(fname):
     with open(fname, 'rt') as inf:
         header = None
diff --git a/pipes/ref_assisted/ref_assisted b/pipes/ref_assisted/ref_assisted
index 8bbc37518..0fa8ae72f 100755
--- a/pipes/ref_assisted/ref_assisted
+++ b/pipes/ref_assisted/ref_assisted
@@ -6,7 +6,7 @@ N_CORES=`python -c 'import json;f=open("config.json");print(json.load(f)["n_core
 MAX_RAM=`python -c 'import json;f=open("config.json");print(json.load(f)["max_ram"]);f.close()'`
 
 # execute snakemake on this machine with specified resources
-snakemake --rerun-incomplete --keep-going \
+snakemake --timestamp --rerun-incomplete --keep-going \
 	--jobs $N_CORES --resources mem=$MAX_RAM \
 	-s $BIN_DIR/pipes/ref_assisted/Snakefile \
 	"$@"

From f9e51ae98ec745f2a75c80d11ca2e1910db99d3b Mon Sep 17 00:00:00 2001
From: Danny Park <dpark@broadinstitute.org>
Date: Sat, 10 Jan 2015 21:26:46 -0500
Subject: [PATCH 5/6] move some options into config.json, increase min_coverage
 to 3, rename output fasta sequence to sample name

---
 assembly.py                    | 22 +++++++++++++++-------
 pipes/ref_assisted/Snakefile   | 13 ++++++++++---
 pipes/ref_assisted/config.json |  5 +++++
 3 files changed, 30 insertions(+), 10 deletions(-)

diff --git a/assembly.py b/assembly.py
index 2c0762149..f5e65d26e 100755
--- a/assembly.py
+++ b/assembly.py
@@ -96,7 +96,7 @@ def align_and_orient_vfat(inFasta, inReference, outFasta, minLength, minUnambig,
 
 def refine_assembly(inFasta, inBam, outFasta,
         outVcf=None, outBam=None, novo_params='', min_coverage=2,
-        keep_all_reads=False, JVMmemory=None):
+        contig_names=[], keep_all_reads=False, JVMmemory=None):
     ''' This a refinement step where we take a crude assembly, align
         all reads back to it, and modify the assembly to the majority
         allele at each position based on read pileups.
@@ -143,9 +143,12 @@ def refine_assembly(inFasta, inBam, outFasta,
     gatk.ug(realignBam, deambigFasta, tmpVcf, JVMmemory=JVMmemory)
     os.unlink(realignBam)
     os.unlink(deambigFasta)
+    name_opts = []
+    if contig_names:
+        name_opts = ['--name'] + contig_names
     main_vcf_to_fasta(parser_vcf_to_fasta().parse_args([
         tmpVcf, outFasta, '--trim_ends', '--min_coverage', str(min_coverage),
-        ]))
+        ] + name_opts))
     if outVcf:
         shutil.copyfile(tmpVcf, outVcf)
         if outVcf.endswith('.gz'):
@@ -178,6 +181,9 @@ def parser_refine_assembly():
     parser.add_argument('--novo_params',
         default='-r Random -l 40 -g 40 -x 20 -t 100',
         help='Alignment parameters for Novoalign.')
+    parser.add_argument("--chr_names", dest="chr_names", nargs="*",
+        help="Rename all output chromosomes (default: retain original chromosome names)",
+        default=[])
     parser.add_argument("--keep_all_reads",
         help="""Retain all reads in BAM file? Default is to remove unaligned and duplicate reads.""",
         default=False, action="store_true", dest="keep_all_reads")
@@ -189,6 +195,7 @@ def parser_refine_assembly():
 def main_refine_assembly(args):
     refine_assembly(args.inFasta, args.inBam, args.outFasta,
         args.outVcf, args.outBam, args.novo_params, args.min_coverage,
+        contig_names=args.chr_names,
         keep_all_reads=args.keep_all_reads, JVMmemory=args.JVMmemory)
     return 0
 __commands__.append(('refine_assembly', main_refine_assembly, parser_refine_assembly))
@@ -582,9 +589,9 @@ def parser_vcf_to_fasta():
         total read count.  This filter will not apply to any sites unless both DP values
         are reported.  [default: %(default)s]""",
         default=0.0)
-    parser.add_argument("--name", dest="name",
-        help="output sequence name (default: reference name in VCF file)",
-        default=None)
+    parser.add_argument("--name", dest="name", nargs="*",
+        help="output sequence names (default: reference names in VCF file)",
+        default=[])
     util.cmd.common_args(parser, (('loglevel',None), ('version',None)))
     return parser
 def main_vcf_to_fasta(args):
@@ -595,13 +602,14 @@ def main_vcf_to_fasta(args):
         chrlens = dict(vcf.chrlens())
         samples = vcf.samples()
     with open(args.outFasta, 'wt') as outf:
+        chr_idx = 0
         for header, seq in vcf_to_seqs(util.file.read_tabfile(args.inVcf),
             chrlens, samples, min_dp=args.min_dp, major_cutoff=args.major_cutoff,
             min_dp_ratio=args.min_dp_ratio):
             if args.trim_ends:
                 seq = seq.strip('Nn')
-            if args.name!=None:
-                header = args.name
+            if args.name:
+                header = args.name[chr_idx % len(args.name)]
             for line in util.file.fastaMaker([(header, seq)]):
                 outf.write(line)
 
diff --git a/pipes/ref_assisted/Snakefile b/pipes/ref_assisted/Snakefile
index 3616f459f..46744c60e 100644
--- a/pipes/ref_assisted/Snakefile
+++ b/pipes/ref_assisted/Snakefile
@@ -116,6 +116,13 @@ rule ref_guided_consensus:
     params: LSF='-W 4:00',
             logid="{sample}",
             refGenome=os.path.expanduser(config["ref_genome"]),
-            novoalign_options = "-r Random -l 30 -g 40 -x 20 -t 502",
-            min_coverage = "2"
-    shell:  "{config[binDir]}/assembly.py refine_assembly {params.refGenome} {input} {output[2]} --outBam {output[0]} --outVcf {output[1]} --min_coverage {params.min_coverage} --novo_params '{params.novoalign_options}' --keep_all_reads"
+            novoalign_options=config["refine_options"]["novoalign"],
+            min_coverage=config["refine_options"]["min_coverage"]
+    shell:  "{config[binDir]}/assembly.py refine_assembly " \
+                + "{params.refGenome} {input} {output[2]} " \
+                + "--outBam {output[0]} " \
+                + "--outVcf {output[1]} " \
+                + "--keep_all_reads " \
+                + "--chr_names {wildcards.sample} " \
+                + "--min_coverage {params.min_coverage} " \
+                + "--novo_params '{params.novoalign_options}' "
diff --git a/pipes/ref_assisted/config.json b/pipes/ref_assisted/config.json
index f3662628c..7254ccdc9 100644
--- a/pipes/ref_assisted/config.json
+++ b/pipes/ref_assisted/config.json
@@ -1,5 +1,10 @@
 {
   "samples":         "SampleSheet.csv",
+  
+  "refine_options": {
+    "novoalign":    "-r Random -l 30 -g 40 -x 20 -t 502",
+    "min_coverage": "3"
+  },
 
   "seq_center":      "ACEGID_RUN",
   "fastqs_gzipped":  false,

From 49c580d57679f5b34a441b1fdf1aca3305b3ec60 Mon Sep 17 00:00:00 2001
From: Danny Park <dpark@broadinstitute.org>
Date: Sat, 10 Jan 2015 22:15:35 -0500
Subject: [PATCH 6/6] update ref_guided_consensus

---
 pipes/rules/interhost.rules | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/pipes/rules/interhost.rules b/pipes/rules/interhost.rules
index 3a336aa80..b6f02037f 100644
--- a/pipes/rules/interhost.rules
+++ b/pipes/rules/interhost.rules
@@ -28,7 +28,7 @@ rule all_ref_guided:
             config["reportsDir"]+'/summary.coverage_ref.txt.gz'
 
 rule ref_guided_consensus:
-    input:  config["dataDir"]+'/'+config["subdirs"]["source"]+'/{sample}.bam'
+    input:  config["dataDir"]+'/'+config["subdirs"]["depletion"]+'/{sample}.raw.bam'
     output: config["dataDir"]+'/'+config["subdirs"]["align_ref"]+'/{sample}.realigned.bam',
             config["dataDir"]+'/'+config["subdirs"]["align_ref"]+'/{sample}.vcf.gz',
             config["dataDir"]+'/'+config["subdirs"]["align_ref"]+'/{sample}.fasta'
@@ -37,12 +37,12 @@ rule ref_guided_consensus:
             logid="{sample}",
             refGenome=config["ref_genome"],
             novoalign_options="-r Random -l 30 -g 40 -x 20 -t 502",
-            min_coverage="2"
+            min_coverage="3"
     run:
             makedirs(expand("{dir}/{subdir}",
                 dir=[config["dataDir"]],
                 subdir=[config["subdirs"]["align_ref"], config["subdirs"]["assembly"]]))
-            shell("{config[binDir]}/assembly.py refine_assembly {params.refGenome} {input} {output[2]} --outBam {output[0]} --outVcf {output[1]} --min_coverage {params.min_coverage} --novo_params '{params.novoalign_options}' --keep_all_reads")
+            shell("{config[binDir]}/assembly.py refine_assembly {params.refGenome} {input} {output[2]} --outBam {output[0]} --outVcf {output[1]} --min_coverage {params.min_coverage} --novo_params '{params.novoalign_options}' --keep_all_reads --chr_names {wildcards.sample}")
 
 rule ref_guided_diversity:
     input: