From bff059953bd4173477b9d60407582973505fdc7a Mon Sep 17 00:00:00 2001 From: Danny Park Date: Sat, 10 Jan 2015 15:49:25 -0500 Subject: [PATCH 1/6] start some work on ref_assisted assembly --- assembly.py | 21 ++++++++++----- pipes/Snakefile | 15 ++++++----- pipes/config.json | 2 ++ pipes/ref_assisted/Snakefile | 46 +++++++++++++++++++++++++++++++++ pipes/ref_assisted/config.json | 9 +++++++ pipes/ref_assisted/ref_assisted | 5 ++++ pipes/rules/demux.rules | 15 ++++++++++- pipes/rules/interhost.rules | 36 +++++++------------------- 8 files changed, 108 insertions(+), 41 deletions(-) create mode 100644 pipes/ref_assisted/Snakefile create mode 100644 pipes/ref_assisted/config.json create mode 100755 pipes/ref_assisted/ref_assisted diff --git a/assembly.py b/assembly.py index c7ac5c418..2c0762149 100755 --- a/assembly.py +++ b/assembly.py @@ -96,7 +96,7 @@ def align_and_orient_vfat(inFasta, inReference, outFasta, minLength, minUnambig, def refine_assembly(inFasta, inBam, outFasta, outVcf=None, outBam=None, novo_params='', min_coverage=2, - JVMmemory=None): + keep_all_reads=False, JVMmemory=None): ''' This a refinement step where we take a crude assembly, align all reads back to it, and modify the assembly to the majority allele at each position based on read pileups. @@ -122,11 +122,15 @@ def refine_assembly(inFasta, inBam, outFasta, # Novoalign reads to self novoBam = util.file.mkstempfname('.novoalign.bam') + min_qual = 0 if keep_all_reads else 1 novoalign.execute(inBam, inFasta, novoBam, - options=novo_params.split(), min_qual=1, JVMmemory=JVMmemory) + options=novo_params.split(), min_qual=min_qual, JVMmemory=JVMmemory) rmdupBam = util.file.mkstempfname('.rmdup.bam') + opts = ['CREATE_INDEX=true'] + if not keep_all_reads: + opts.append('REMOVE_DUPLICATES=true') picard_mkdup.execute([novoBam], rmdupBam, - picardOptions=['REMOVE_DUPLICATES=true', 'CREATE_INDEX=true'], JVMmemory=JVMmemory) + picardOptions=opts, JVMmemory=JVMmemory) os.unlink(novoBam) realignBam = util.file.mkstempfname('.realign.bam') gatk.local_realign(rmdupBam, deambigFasta, realignBam, JVMmemory=JVMmemory) @@ -168,12 +172,15 @@ def parser_refine_assembly(): parser.add_argument('--outVcf', default=None, help='GATK genotype calls for genome in inFasta coordinate space.') - parser.add_argument('--novo_params', - default='-r Random -l 40 -g 40 -x 20 -t 100', - help='Alignment parameters for Novoalign.') parser.add_argument('--min_coverage', default=3, type=int, help='Minimum read coverage required to call a position unambiguous.') + parser.add_argument('--novo_params', + default='-r Random -l 40 -g 40 -x 20 -t 100', + help='Alignment parameters for Novoalign.') + parser.add_argument("--keep_all_reads", + help="""Retain all reads in BAM file? Default is to remove unaligned and duplicate reads.""", + default=False, action="store_true", dest="keep_all_reads") parser.add_argument('--JVMmemory', default=tools.gatk.GATKTool.jvmMemDefault, help='JVM virtual memory size (default: %(default)s)') @@ -182,7 +189,7 @@ def parser_refine_assembly(): def main_refine_assembly(args): refine_assembly(args.inFasta, args.inBam, args.outFasta, args.outVcf, args.outBam, args.novo_params, args.min_coverage, - JVMmemory=args.JVMmemory) + keep_all_reads=args.keep_all_reads, JVMmemory=args.JVMmemory) return 0 __commands__.append(('refine_assembly', main_refine_assembly, parser_refine_assembly)) diff --git a/pipes/Snakefile b/pipes/Snakefile index d756fab7b..6c198aab5 100644 --- a/pipes/Snakefile +++ b/pipes/Snakefile @@ -9,18 +9,19 @@ __author__ = 'Daniel Park ' +import os.path configfile: "config.json" +pipesDir = os.path.join(os.path.expanduser(config['binDir']), 'pipes', 'rules') -include: config["binDir"]+"/pipes/rules/common.rules" - +include: os.path.join(pipesDir, 'common.rules') set_env_vars() -include: config["binDir"]+"/pipes/rules/demux.rules" -include: config["binDir"]+"/pipes/rules/hs_deplete.rules" -include: config["binDir"]+"/pipes/rules/assembly.rules" -include: config["binDir"]+"/pipes/rules/interhost.rules" -include: config["binDir"]+"/pipes/rules/reports.rules" +include: os.path.join(pipesDir, 'demux.rules') +include: os.path.join(pipesDir, 'hs_deplete.rules') +include: os.path.join(pipesDir, 'assembly.rules') +include: os.path.join(pipesDir, 'interhost.rules') +include: os.path.join(pipesDir, 'reports.rules') rule all: input: diff --git a/pipes/config.json b/pipes/config.json index e6d1037a7..9d3d0e637 100644 --- a/pipes/config.json +++ b/pipes/config.json @@ -4,6 +4,8 @@ "samples_assembly": "samples-assembly.txt", "samples_per_run": "samples-runs.txt", + "seq_center": "BI", + "bmTaggerDbDir": "/idi/sabeti-scratch/kandersen/references/bmtagger", "bmTaggerDbs_remove": [ "hg19", diff --git a/pipes/ref_assisted/Snakefile b/pipes/ref_assisted/Snakefile new file mode 100644 index 000000000..efb77eb84 --- /dev/null +++ b/pipes/ref_assisted/Snakefile @@ -0,0 +1,46 @@ +""" + This performs reference-assisted assembly of viral genomes, starting + from fastqs produced directly from MiSeq machines. + + Make copies of this Snakefile and config.json to your analysis directory and + customize as needed. +""" + +__author__ = 'Daniel Park ' + +import os.path + +configfile: "config.json" + +def read_file(fname): + with open(fname, 'rt') as inf: + for line in inf: + yield line.strip() + +rule all: + input: + expand("{dataDir}/{sample}.fasta", + dataDir=config["dataDir"], + sample=read_file(config["samples"])) + +rule bams_from_fastq: + input: os.path.join(config['dataDir'],'{sample}_R1_{idx}.fastq'), + os.path.join(config['dataDir'],'{sample}_R2_{idx}.fastq') + output: os.path.join(config['dataDir'],'{sample}_{idx}.bam') + resources: mem=4 + params: logid="{sample}_{idx}", + center=config["seq_center"] + shell: "read_utils.py fastq_to_bam {input} {output} --sampleName {wildcards.sample} --picardOptions PLATFORM=illumina SEQUENCING_CENTER={params.center} LIBRARY_NAME={wildcards.sample}_{wildcards.idx} SORT_ORDER=queryname" + +rule ref_guided_consensus: + input: os.path.join(config['dataDir'],'{sample}.bam') + output: os.path.join(config['dataDir'],'{sample}.realigned.bam'), + os.path.join(config['dataDir'],'{sample}.vcf.gz'), + os.path.join(config['dataDir'],'{sample}.fasta') + resources: mem=4 + params: LSF='-W 4:00', + logid="{sample}", + refGenome=config["ref_genome"], + novoalign_options = "-r Random -l 30 -g 40 -x 20 -t 502", + min_coverage = "2" + shell: "assembly.py refine_assembly {params.refGenome} {input} {output[2]} --outBam {output[0]} --outVcf {output[1]} --min_coverage {params.min_coverage} --novo_params '{params.novoalign_options}' --keep_all_reads" diff --git a/pipes/ref_assisted/config.json b/pipes/ref_assisted/config.json new file mode 100644 index 000000000..97ef57cbb --- /dev/null +++ b/pipes/ref_assisted/config.json @@ -0,0 +1,9 @@ +{ + "samples": "samples.txt", + "ref_genome": "~/resources/ref_genome/genome.fasta", + "seq_center": "ACEGID_RUN", + + "dataDir": "data", + "logDir": "log", + "project": "viral_ngs" +} diff --git a/pipes/ref_assisted/ref_assisted b/pipes/ref_assisted/ref_assisted new file mode 100755 index 000000000..f6791a7f9 --- /dev/null +++ b/pipes/ref_assisted/ref_assisted @@ -0,0 +1,5 @@ +#!/bin/bash +snakemake --rerun-incomplete --keep-going \ + --jobs 4 --resources mem=16 \ + -s "$HOME/viral-ngs/pipes/ref_assisted/Snakefile" \ + "$@" diff --git a/pipes/rules/demux.rules b/pipes/rules/demux.rules index cccfe43f1..6d54da11e 100644 --- a/pipes/rules/demux.rules +++ b/pipes/rules/demux.rules @@ -118,6 +118,7 @@ rule illumina_basecalls: resources: mem=64 params: LSF='-q flower', logid="{flowcell}.{lane}", + center=config['seq_center'], outdir=config['tmpDir']+'/'+config['subdirs']['demux']+'/bams_per_lane/{flowcell}.{lane}' run: shutil.rmtree(params.outdir, ignore_errors=True) @@ -125,7 +126,7 @@ rule illumina_basecalls: lane = get_one_lane_from_run(wildcards.flowcell, wildcards.lane, config['seqruns_demux']) dir = lane['bustard_dir'] run_date = lane.get('seq_run_date') - shell("{config[binDir]}/broad_utils.py illumina_basecalls {dir} {input[1]} {wildcards.flowcell} {wildcards.lane} {input[0]} --include_non_pf_reads=false --run_start_date={run_date}") + shell("{config[binDir]}/broad_utils.py illumina_basecalls {dir} {input[1]} {wildcards.flowcell} {wildcards.lane} {input[0]} --include_non_pf_reads=false --run_start_date={run_date} --sequencing_center={params.center}") def demux_move_bams_inputs(wildcards): lane = get_one_lane_from_run(wildcards.flowcell, wildcards.lane, config.get('seqruns_demux','')) @@ -141,3 +142,15 @@ rule move_bams_demux: makedirs(os.path.join(config['dataDir'], config['subdirs']['source'])) shutil.move(input[0], output[0]) +rule bams_from_fastq: + input: os.path.join(config['dataDir'],config['subdirs']['source'],'{sample}_R1_{idx}.fastq'), + os.path.join(config['dataDir'],config['subdirs']['source'],'{sample}_R2_{idx}.fastq') + output: os.path.join(config['dataDir'],config['subdirs']['source'],'{sample}_{idx}.bam') + params: logid="{sample}_{idx}", + center=config["seq_center"] + run: + makedirs(os.path.join(config['dataDir'], config['subdirs']['source'])) + shell("{config[binDir]}/read_utils.py fastq_to_bam {input} {output} --sampleName {wildcards.sample} --picardOptions PLATFORM=illumina SEQUENCING_CENTER={params.center} LIBRARY_NAME={wildcards.sample}_{wildcards.idx} SORT_ORDER=queryname") + +ruleorder: move_bams_demux > bams_from_fastq + diff --git a/pipes/rules/interhost.rules b/pipes/rules/interhost.rules index 0f7ce11fa..3a336aa80 100644 --- a/pipes/rules/interhost.rules +++ b/pipes/rules/interhost.rules @@ -27,38 +27,22 @@ rule all_ref_guided: sample=read_samples_file(config["samples_per_run"])), config["reportsDir"]+'/summary.coverage_ref.txt.gz' -rule map_reads_to_ref: - input: config["dataDir"]+'/'+config["subdirs"]["depletion"]+'/{sample}.raw.bam' - output: config["dataDir"]+'/'+config["subdirs"]["align_ref"]+'/{sample}.bam', - config["dataDir"]+'/'+config["subdirs"]["align_ref"]+'/{sample}.mapped.bam', - config["dataDir"]+'/'+config["subdirs"]["align_ref"]+'/{sample}.rmdup.bam', - config["dataDir"]+'/'+config["subdirs"]["align_ref"]+'/{sample}.realigned.bam' - resources: mem=8 +rule ref_guided_consensus: + input: config["dataDir"]+'/'+config["subdirs"]["source"]+'/{sample}.bam' + output: config["dataDir"]+'/'+config["subdirs"]["align_ref"]+'/{sample}.realigned.bam', + config["dataDir"]+'/'+config["subdirs"]["align_ref"]+'/{sample}.vcf.gz', + config["dataDir"]+'/'+config["subdirs"]["align_ref"]+'/{sample}.fasta' + resources: mem=4 params: LSF='-W 4:00', logid="{sample}", refGenome=config["ref_genome"], - novoalign_options="-r Random -l 30 -g 40 -x 20 -t 502" + novoalign_options="-r Random -l 30 -g 40 -x 20 -t 502", + min_coverage="2" run: makedirs(expand("{dir}/{subdir}", - dir=[config["dataDir"], config["tmpDir"]], + dir=[config["dataDir"]], subdir=[config["subdirs"]["align_ref"], config["subdirs"]["assembly"]])) - shell("{config[binDir]}/read_utils.py novoalign {input[0]} {params.refGenome} {output[0]} --options '{params.novoalign_options}'") - shell("{config[binDir]}/read_utils.py filter_bam_mapped_only {output[0]} {output[1]}") - shell("{config[binDir]}/read_utils.py mkdup_picard {output[1]} {output[2]} --remove --picardOptions CREATE_INDEX=true") - shell("{config[binDir]}/read_utils.py gatk_realign {output[2]} {params.refGenome} {output[3]}") - -rule ref_guided_consensus: - input: config["dataDir"]+'/'+config["subdirs"]["align_ref"]+'/{sample}.realigned.bam' - output: config["dataDir"]+'/'+config["subdirs"]["align_ref"]+'/{sample}.vcf.gz', - config["dataDir"]+'/'+config["subdirs"]["align_ref"]+'/{sample}.fasta' - resources: mem=8 - params: LSF='-W 4:00', - logid="{sample}", - refGenome=config["ref_genome"] - run: - update_timestamps(input) - gatk_ug(input[0], params.refGenome, output[0]) - shell("{config[binDir]}/assembly.py vcf_to_fasta {output[0]} {output[1]} --min_coverage 2 --name {wildcards.sample}") + shell("{config[binDir]}/assembly.py refine_assembly {params.refGenome} {input} {output[2]} --outBam {output[0]} --outVcf {output[1]} --min_coverage {params.min_coverage} --novo_params '{params.novoalign_options}' --keep_all_reads") rule ref_guided_diversity: input: From 1865b2acff9bd3ef12a9d641936ae368087527e0 Mon Sep 17 00:00:00 2001 From: Danny Park Date: Sat, 10 Jan 2015 19:38:31 -0500 Subject: [PATCH 2/6] update nigeria run miseq analysis pipeline --- pipes/ref_assisted/Snakefile | 102 +++++++++++++++++++++++++++----- pipes/ref_assisted/config.json | 20 ++++--- pipes/ref_assisted/ref_assisted | 11 +++- 3 files changed, 109 insertions(+), 24 deletions(-) diff --git a/pipes/ref_assisted/Snakefile b/pipes/ref_assisted/Snakefile index efb77eb84..03fd92cb2 100644 --- a/pipes/ref_assisted/Snakefile +++ b/pipes/ref_assisted/Snakefile @@ -4,43 +4,115 @@ Make copies of this Snakefile and config.json to your analysis directory and customize as needed. + + This operates on the SampleSheet.csv file that is used as input to the MiSeq + machine and all of the fastq files that are emitted by that machine. Put + the SampleSheet.csv in the project directory and put the fastq files in a + subdirectory called "data". Copy the config.json to the project directory. + Then type "ref_assisted" and wait a few hours for aligned BAMs, VCFs, and + FASTAs. + + This is designed for use on a single linux computer (e.g. Ubuntu 14.04 LTS) with: + apt-get install: + python3 python3-pip python-software-properties + zlib zlib1g zlib1g-dev + libblas3gf libblas-dev liblapack3gf liblapack-dev + libatlas-dev libatlas3-base libatlas3gf-base libatlas-base-dev + gfortran git oracle-java8-installer + libncurses5-dev python3-nose + pip3 install -r requirements.txt + pip3 install snakemake==3.2 """ __author__ = 'Daniel Park ' -import os.path +import os.path, time, hashlib, base64 configfile: "config.json" -def read_file(fname): +def get_sample_list(fname): + with open(fname, 'rt') as inf: + header = None + for line in inf: + if line.startswith('Sample_ID'): + header = line.strip().split(',') + elif header and line: + yield line.strip().split(',')[0] + +def get_sample_info(sample, fname): with open(fname, 'rt') as inf: + header = None + n = 0 for line in inf: - yield line.strip() + if line.startswith('Sample_ID'): + header = line.strip().split(',') + elif header and line: + n += 1 + out = dict(zip(header, line.strip().split(','))) + out['n'] = n + if out['Sample_ID']==sample: + return out + +def get_file_date(fname): + return time.strftime("%Y-%m-%d", time.localtime(os.path.getmtime(fname))) + +def make_run_hash(fname, length=6): + if 'flowcell' in config: + return config['flowcell'] + with open(fname, 'rt') as inf: + csv = inf.readlines() + hash_obj = hashlib.sha1(csv.encode('utf-8')) + b32_str = base64.b32encode(bytes(hash_obj.digest())).decode('utf-8') + return b32_str[:length] + +############################################## rule all: input: expand("{dataDir}/{sample}.fasta", dataDir=config["dataDir"], - sample=read_file(config["samples"])) + sample=get_sample_list(config["samples"])) +############################################## + +def get_fastq_filenames(wildcards): + info = get_sample_info(wildcards.sample, config["samples"]) + suffix = config.get('fastqs_gzipped', False) and ".gz" or "" + return [ + os.path.join(wildcards.dir, + '{sample}_S{idx}_L001_R{dir}_001.fastq{suffix}'.format( + sample=wildcards.sample, idx=info['n'], dir=direction, + suffix=suffix)) + for direction in ('1','2')] rule bams_from_fastq: - input: os.path.join(config['dataDir'],'{sample}_R1_{idx}.fastq'), - os.path.join(config['dataDir'],'{sample}_R2_{idx}.fastq') - output: os.path.join(config['dataDir'],'{sample}_{idx}.bam') + input: get_fastq_filenames + output: '{dir}/{sample}.raw.bam' resources: mem=4 - params: logid="{sample}_{idx}", + params: logid="{sample}", center=config["seq_center"] - shell: "read_utils.py fastq_to_bam {input} {output} --sampleName {wildcards.sample} --picardOptions PLATFORM=illumina SEQUENCING_CENTER={params.center} LIBRARY_NAME={wildcards.sample}_{wildcards.idx} SORT_ORDER=queryname" + run: + run_date = get_file_date(input[0]) + info = get_sample_info(wildcards.sample, config["samples"]) + hash = make_run_hash(config["samples"]) + shell("{config[binDir]}/read_utils.py fastq_to_bam {input} {output} " \ + + "--sampleName {wildcards.sample} --picardOptions " \ + + "SEQUENCING_CENTER={params.center} " \ + + "RUN_DATE={run_date} " \ + + "PLATFORM_UNIT={hash}.1.{info[index]}-{info[index2]} " \ + + "READ_GROUP_NAME={hash} " \ + + "PLATFORM=illumina SORT_ORDER=queryname") + +############################################## rule ref_guided_consensus: - input: os.path.join(config['dataDir'],'{sample}.bam') - output: os.path.join(config['dataDir'],'{sample}.realigned.bam'), - os.path.join(config['dataDir'],'{sample}.vcf.gz'), - os.path.join(config['dataDir'],'{sample}.fasta') + input: '{dir}/{sample}.raw.bam' + output: '{dir}/{sample}.realigned.bam', + '{dir}/{sample}.vcf.gz', + '{dir}/{sample}.fasta' resources: mem=4 params: LSF='-W 4:00', logid="{sample}", - refGenome=config["ref_genome"], + refGenome=os.path.expanduser(config["ref_genome"]), novoalign_options = "-r Random -l 30 -g 40 -x 20 -t 502", min_coverage = "2" - shell: "assembly.py refine_assembly {params.refGenome} {input} {output[2]} --outBam {output[0]} --outVcf {output[1]} --min_coverage {params.min_coverage} --novo_params '{params.novoalign_options}' --keep_all_reads" + shell: "{config[binDir]}/assembly.py refine_assembly {params.refGenome} {input} {output[2]} --outBam {output[0]} --outVcf {output[1]} --min_coverage {params.min_coverage} --novo_params '{params.novoalign_options}' --keep_all_reads" diff --git a/pipes/ref_assisted/config.json b/pipes/ref_assisted/config.json index 97ef57cbb..f3662628c 100644 --- a/pipes/ref_assisted/config.json +++ b/pipes/ref_assisted/config.json @@ -1,9 +1,15 @@ { - "samples": "samples.txt", - "ref_genome": "~/resources/ref_genome/genome.fasta", - "seq_center": "ACEGID_RUN", - - "dataDir": "data", - "logDir": "log", - "project": "viral_ngs" + "samples": "SampleSheet.csv", + + "seq_center": "ACEGID_RUN", + "fastqs_gzipped": false, + "n_cores": 4, + "max_ram": 16, + "ref_genome": "~/resources/ref_genome/genome.fasta", + + "dataDir": "data", + "logDir": "log", + "binDir": "~/viral-ngs", + "venvDir": "~/venv", + "project": "viral_ngs" } diff --git a/pipes/ref_assisted/ref_assisted b/pipes/ref_assisted/ref_assisted index f6791a7f9..8bbc37518 100755 --- a/pipes/ref_assisted/ref_assisted +++ b/pipes/ref_assisted/ref_assisted @@ -1,5 +1,12 @@ #!/bin/bash + +# load config dirs from config.json +BIN_DIR=`python -c 'import json,os.path;f=open("config.json");print(os.path.expanduser(json.load(f)["binDir"]));f.close()'` +N_CORES=`python -c 'import json;f=open("config.json");print(json.load(f)["n_cores"]);f.close()'` +MAX_RAM=`python -c 'import json;f=open("config.json");print(json.load(f)["max_ram"]);f.close()'` + +# execute snakemake on this machine with specified resources snakemake --rerun-incomplete --keep-going \ - --jobs 4 --resources mem=16 \ - -s "$HOME/viral-ngs/pipes/ref_assisted/Snakefile" \ + --jobs $N_CORES --resources mem=$MAX_RAM \ + -s $BIN_DIR/pipes/ref_assisted/Snakefile \ "$@" From 78c925d1d4c2c9efe4ebabb5dd7f0b6e566463d1 Mon Sep 17 00:00:00 2001 From: Danny Park Date: Sat, 10 Jan 2015 19:50:12 -0500 Subject: [PATCH 3/6] fix hashing --- pipes/ref_assisted/Snakefile | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pipes/ref_assisted/Snakefile b/pipes/ref_assisted/Snakefile index 03fd92cb2..d423c8724 100644 --- a/pipes/ref_assisted/Snakefile +++ b/pipes/ref_assisted/Snakefile @@ -60,7 +60,7 @@ def make_run_hash(fname, length=6): if 'flowcell' in config: return config['flowcell'] with open(fname, 'rt') as inf: - csv = inf.readlines() + csv = ''.join(inf.readlines()) hash_obj = hashlib.sha1(csv.encode('utf-8')) b32_str = base64.b32encode(bytes(hash_obj.digest())).decode('utf-8') return b32_str[:length] From 5cd42e4b4fecb68edb8debe5842780e783e12f1f Mon Sep 17 00:00:00 2001 From: Danny Park Date: Sat, 10 Jan 2015 20:15:45 -0500 Subject: [PATCH 4/6] add env vars reading fro config json --- pipes/ref_assisted/Snakefile | 5 ++++- pipes/ref_assisted/ref_assisted | 2 +- 2 files changed, 5 insertions(+), 2 deletions(-) diff --git a/pipes/ref_assisted/Snakefile b/pipes/ref_assisted/Snakefile index d423c8724..3616f459f 100644 --- a/pipes/ref_assisted/Snakefile +++ b/pipes/ref_assisted/Snakefile @@ -26,10 +26,13 @@ __author__ = 'Daniel Park ' -import os.path, time, hashlib, base64 +import os, os.path, time, hashlib, base64 configfile: "config.json" +for k,v in config.get('env_vars', {}).items(): + os.environ[k] = v + def get_sample_list(fname): with open(fname, 'rt') as inf: header = None diff --git a/pipes/ref_assisted/ref_assisted b/pipes/ref_assisted/ref_assisted index 8bbc37518..0fa8ae72f 100755 --- a/pipes/ref_assisted/ref_assisted +++ b/pipes/ref_assisted/ref_assisted @@ -6,7 +6,7 @@ N_CORES=`python -c 'import json;f=open("config.json");print(json.load(f)["n_core MAX_RAM=`python -c 'import json;f=open("config.json");print(json.load(f)["max_ram"]);f.close()'` # execute snakemake on this machine with specified resources -snakemake --rerun-incomplete --keep-going \ +snakemake --timestamp --rerun-incomplete --keep-going \ --jobs $N_CORES --resources mem=$MAX_RAM \ -s $BIN_DIR/pipes/ref_assisted/Snakefile \ "$@" From f9e51ae98ec745f2a75c80d11ca2e1910db99d3b Mon Sep 17 00:00:00 2001 From: Danny Park Date: Sat, 10 Jan 2015 21:26:46 -0500 Subject: [PATCH 5/6] move some options into config.json, increase min_coverage to 3, rename output fasta sequence to sample name --- assembly.py | 22 +++++++++++++++------- pipes/ref_assisted/Snakefile | 13 ++++++++++--- pipes/ref_assisted/config.json | 5 +++++ 3 files changed, 30 insertions(+), 10 deletions(-) diff --git a/assembly.py b/assembly.py index 2c0762149..f5e65d26e 100755 --- a/assembly.py +++ b/assembly.py @@ -96,7 +96,7 @@ def align_and_orient_vfat(inFasta, inReference, outFasta, minLength, minUnambig, def refine_assembly(inFasta, inBam, outFasta, outVcf=None, outBam=None, novo_params='', min_coverage=2, - keep_all_reads=False, JVMmemory=None): + contig_names=[], keep_all_reads=False, JVMmemory=None): ''' This a refinement step where we take a crude assembly, align all reads back to it, and modify the assembly to the majority allele at each position based on read pileups. @@ -143,9 +143,12 @@ def refine_assembly(inFasta, inBam, outFasta, gatk.ug(realignBam, deambigFasta, tmpVcf, JVMmemory=JVMmemory) os.unlink(realignBam) os.unlink(deambigFasta) + name_opts = [] + if contig_names: + name_opts = ['--name'] + contig_names main_vcf_to_fasta(parser_vcf_to_fasta().parse_args([ tmpVcf, outFasta, '--trim_ends', '--min_coverage', str(min_coverage), - ])) + ] + name_opts)) if outVcf: shutil.copyfile(tmpVcf, outVcf) if outVcf.endswith('.gz'): @@ -178,6 +181,9 @@ def parser_refine_assembly(): parser.add_argument('--novo_params', default='-r Random -l 40 -g 40 -x 20 -t 100', help='Alignment parameters for Novoalign.') + parser.add_argument("--chr_names", dest="chr_names", nargs="*", + help="Rename all output chromosomes (default: retain original chromosome names)", + default=[]) parser.add_argument("--keep_all_reads", help="""Retain all reads in BAM file? Default is to remove unaligned and duplicate reads.""", default=False, action="store_true", dest="keep_all_reads") @@ -189,6 +195,7 @@ def parser_refine_assembly(): def main_refine_assembly(args): refine_assembly(args.inFasta, args.inBam, args.outFasta, args.outVcf, args.outBam, args.novo_params, args.min_coverage, + contig_names=args.chr_names, keep_all_reads=args.keep_all_reads, JVMmemory=args.JVMmemory) return 0 __commands__.append(('refine_assembly', main_refine_assembly, parser_refine_assembly)) @@ -582,9 +589,9 @@ def parser_vcf_to_fasta(): total read count. This filter will not apply to any sites unless both DP values are reported. [default: %(default)s]""", default=0.0) - parser.add_argument("--name", dest="name", - help="output sequence name (default: reference name in VCF file)", - default=None) + parser.add_argument("--name", dest="name", nargs="*", + help="output sequence names (default: reference names in VCF file)", + default=[]) util.cmd.common_args(parser, (('loglevel',None), ('version',None))) return parser def main_vcf_to_fasta(args): @@ -595,13 +602,14 @@ def main_vcf_to_fasta(args): chrlens = dict(vcf.chrlens()) samples = vcf.samples() with open(args.outFasta, 'wt') as outf: + chr_idx = 0 for header, seq in vcf_to_seqs(util.file.read_tabfile(args.inVcf), chrlens, samples, min_dp=args.min_dp, major_cutoff=args.major_cutoff, min_dp_ratio=args.min_dp_ratio): if args.trim_ends: seq = seq.strip('Nn') - if args.name!=None: - header = args.name + if args.name: + header = args.name[chr_idx % len(args.name)] for line in util.file.fastaMaker([(header, seq)]): outf.write(line) diff --git a/pipes/ref_assisted/Snakefile b/pipes/ref_assisted/Snakefile index 3616f459f..46744c60e 100644 --- a/pipes/ref_assisted/Snakefile +++ b/pipes/ref_assisted/Snakefile @@ -116,6 +116,13 @@ rule ref_guided_consensus: params: LSF='-W 4:00', logid="{sample}", refGenome=os.path.expanduser(config["ref_genome"]), - novoalign_options = "-r Random -l 30 -g 40 -x 20 -t 502", - min_coverage = "2" - shell: "{config[binDir]}/assembly.py refine_assembly {params.refGenome} {input} {output[2]} --outBam {output[0]} --outVcf {output[1]} --min_coverage {params.min_coverage} --novo_params '{params.novoalign_options}' --keep_all_reads" + novoalign_options=config["refine_options"]["novoalign"], + min_coverage=config["refine_options"]["min_coverage"] + shell: "{config[binDir]}/assembly.py refine_assembly " \ + + "{params.refGenome} {input} {output[2]} " \ + + "--outBam {output[0]} " \ + + "--outVcf {output[1]} " \ + + "--keep_all_reads " \ + + "--chr_names {wildcards.sample} " \ + + "--min_coverage {params.min_coverage} " \ + + "--novo_params '{params.novoalign_options}' " diff --git a/pipes/ref_assisted/config.json b/pipes/ref_assisted/config.json index f3662628c..7254ccdc9 100644 --- a/pipes/ref_assisted/config.json +++ b/pipes/ref_assisted/config.json @@ -1,5 +1,10 @@ { "samples": "SampleSheet.csv", + + "refine_options": { + "novoalign": "-r Random -l 30 -g 40 -x 20 -t 502", + "min_coverage": "3" + }, "seq_center": "ACEGID_RUN", "fastqs_gzipped": false, From 49c580d57679f5b34a441b1fdf1aca3305b3ec60 Mon Sep 17 00:00:00 2001 From: Danny Park Date: Sat, 10 Jan 2015 22:15:35 -0500 Subject: [PATCH 6/6] update ref_guided_consensus --- pipes/rules/interhost.rules | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/pipes/rules/interhost.rules b/pipes/rules/interhost.rules index 3a336aa80..b6f02037f 100644 --- a/pipes/rules/interhost.rules +++ b/pipes/rules/interhost.rules @@ -28,7 +28,7 @@ rule all_ref_guided: config["reportsDir"]+'/summary.coverage_ref.txt.gz' rule ref_guided_consensus: - input: config["dataDir"]+'/'+config["subdirs"]["source"]+'/{sample}.bam' + input: config["dataDir"]+'/'+config["subdirs"]["depletion"]+'/{sample}.raw.bam' output: config["dataDir"]+'/'+config["subdirs"]["align_ref"]+'/{sample}.realigned.bam', config["dataDir"]+'/'+config["subdirs"]["align_ref"]+'/{sample}.vcf.gz', config["dataDir"]+'/'+config["subdirs"]["align_ref"]+'/{sample}.fasta' @@ -37,12 +37,12 @@ rule ref_guided_consensus: logid="{sample}", refGenome=config["ref_genome"], novoalign_options="-r Random -l 30 -g 40 -x 20 -t 502", - min_coverage="2" + min_coverage="3" run: makedirs(expand("{dir}/{subdir}", dir=[config["dataDir"]], subdir=[config["subdirs"]["align_ref"], config["subdirs"]["assembly"]])) - shell("{config[binDir]}/assembly.py refine_assembly {params.refGenome} {input} {output[2]} --outBam {output[0]} --outVcf {output[1]} --min_coverage {params.min_coverage} --novo_params '{params.novoalign_options}' --keep_all_reads") + shell("{config[binDir]}/assembly.py refine_assembly {params.refGenome} {input} {output[2]} --outBam {output[0]} --outVcf {output[1]} --min_coverage {params.min_coverage} --novo_params '{params.novoalign_options}' --keep_all_reads --chr_names {wildcards.sample}") rule ref_guided_diversity: input: