Skip to content

Commit

Permalink
Merge pull request #114 from broadinstitute/dp-libstats
Browse files Browse the repository at this point in the history
incorporate library stats to isnv pipeline
  • Loading branch information
dpark01 committed Mar 18, 2015
2 parents adf364e + c934722 commit 96321bd
Show file tree
Hide file tree
Showing 13 changed files with 222 additions and 142 deletions.
174 changes: 142 additions & 32 deletions intrahost.py

Large diffs are not rendered by default.

70 changes: 0 additions & 70 deletions old-scripts/kga/trees.sh

This file was deleted.

39 changes: 36 additions & 3 deletions pipes/rules/intrahost.rules
Original file line number Diff line number Diff line change
Expand Up @@ -9,7 +9,9 @@ import os, os.path, time


rule all_intrahost:
input: config["dataDir"]+'/'+config["subdirs"]["intrahost"] +'/isnvs.vcf.gz'
input:
config["dataDir"]+'/'+config["subdirs"]["intrahost"] +'/isnvs.vcf.gz',
config["dataDir"]+'/'+config["subdirs"]["intrahost"] +'/isnvs.filtered.vcf.gz'

rule isnvs_per_sample:
input:
Expand Down Expand Up @@ -46,9 +48,40 @@ rule isnvs_vcf:
run:
shell("{config[binDir]}/intrahost.py merge_to_vcf {params.refGenome} {output[0]}"
+ " --samples {params.samples}"
+ " --isnvs " + " ".join("{config[dataDir]}/{config[subdirs][intrahost]}/vphaser2."+s+".txt" for s in params.samples)
+ " --isnvs " + " ".join("{config[dataDir]}/{config[subdirs][intrahost]}/vphaser2."+s+".txt.gz" for s in params.samples)
+ " --assemblies " + " ".join("{config[dataDir]}/{config[subdirs][assembly]}/"+s+".fasta" for s in params.samples)
+ " --strip_chr_version"
)
shell("{config[binDir]}/interhost.py snpEff {output[0]} {params.snpEff_ref} {output[1]}")
shell("{config[binDir]}/intrahost.py iSNV_table {output[1]} {output[2]}")

rule isnvs_vcf_filtered:
input:
expand("{dataDir}/{subdir}/vphaser2.{sample}.txt.gz",
dataDir=config["dataDir"],
subdir=config["subdirs"]["intrahost"],
sample=read_samples_file(config["samples_assembly"])),
expand("{dataDir}/{subdir}/{sample}.fasta",
dataDir=config["dataDir"],
subdir=config["subdirs"]["assembly"],
sample=read_samples_file(config["samples_assembly"]))
output:
config["dataDir"]+'/'+config["subdirs"]["intrahost"] +'/isnvs.filtered.vcf.gz',
config["dataDir"]+'/'+config["subdirs"]["intrahost"] +'/isnvs.filtered.annot.vcf.gz',
config["dataDir"]+'/'+config["subdirs"]["intrahost"] +'/isnvs.filtered.annot.txt.gz'
resources: mem=4
params: LSF=config.get('LSF_queues', {}).get('short', '-W 4:00'),
logid="all",
refGenome=config["ref_genome"],
snpEff_ref=config["snpEff_genome"],
samples=read_samples_file(config["samples_assembly"])
run:
shell("{config[binDir]}/intrahost.py merge_to_vcf {params.refGenome} {output[0]}"
+ " --samples {params.samples}"
+ " --isnvs " + " ".join("{config[dataDir]}/{config[subdirs][intrahost]}/vphaser2."+s+".txt.gz" for s in params.samples)
+ " --assemblies " + " ".join("{config[dataDir]}/{config[subdirs][assembly]}/"+s+".fasta" for s in params.samples)
+ " --strip_chr_version --naive_filter"
)
shell("{config[binDir]}/interhost.py snpEff {output[0]} {params.snpEff_ref} {output[1]} --strip_chr_version")
shell("{config[binDir]}/interhost.py snpEff {output[0]} {params.snpEff_ref} {output[1]}")
shell("{config[binDir]}/intrahost.py iSNV_table {output[1]} {output[2]}")

Binary file modified test/input/TestPerSample/in.2libs.bam
Binary file not shown.
Binary file removed test/input/TestPerSample/in.2libs.bam.bai
Binary file not shown.
Binary file added test/input/TestPerSample/in.2libs3rgs.bam
Binary file not shown.
Binary file modified test/input/TestPerSample/in.3libs.bam
Binary file not shown.
Binary file removed test/input/TestPerSample/in.3libs.bam.bai
Binary file not shown.
Binary file modified test/input/TestPerSample/in.bam
Binary file not shown.
Binary file modified test/input/TestPerSample/in.indels.bam
Binary file not shown.
Binary file removed test/input/TestPerSample/in.indels.bam.bai
Binary file not shown.
74 changes: 37 additions & 37 deletions test/unit/test_intrahost.py
Original file line number Diff line number Diff line change
Expand Up @@ -306,16 +306,16 @@ def test_simple_snps(self):
self.assertEqual(rows[0].pos+1, 3)
self.assertEqual(rows[0].ref, 'C')
self.assertEqual(rows[0].alt, 'A')
self.assertEqual(rows[0][0], '0:0.2')
self.assertEqual(rows[0][1], '0:0.1')
self.assertEqual(rows[0][2], '0:0.0')
self.assertEqual(':'.join(rows[0][0].split(':')[:2]), '0:0.2')
self.assertEqual(':'.join(rows[0][1].split(':')[:2]), '0:0.1')
self.assertEqual(':'.join(rows[0][2].split(':')[:2]), '0:0.0')
self.assertEqual(rows[1].contig, 'ref1')
self.assertEqual(rows[1].pos+1, 6)
self.assertEqual(rows[1].ref, 'A')
self.assertEqual(rows[1].alt, 'T')
self.assertEqual(rows[1][0], '0:0.0')
self.assertEqual(rows[1][1], '0:0.0')
self.assertEqual(rows[1][2], '0:0.3')
self.assertEqual(':'.join(rows[1][0].split(':')[:2]), '0:0.0')
self.assertEqual(':'.join(rows[1][1].split(':')[:2]), '0:0.0')
self.assertEqual(':'.join(rows[1][2].split(':')[:2]), '0:0.3')

def test_snps_downstream_of_indels(self):
merger = VcfMergeRunner([('ref1', 'ATCGGACT')])
Expand All @@ -331,9 +331,9 @@ def test_snps_downstream_of_indels(self):
self.assertEqual(rows[0].pos+1, 6)
self.assertEqual(rows[0].ref, 'A')
self.assertEqual(rows[0].alt, 'C')
self.assertEqual(rows[0][0], '0:0.2')
self.assertEqual(rows[0][1], '0:0.1')
self.assertEqual(rows[0][2], '1:0.7')
self.assertEqual(':'.join(rows[0][0].split(':')[:2]), '0:0.2')
self.assertEqual(':'.join(rows[0][1].split(':')[:2]), '0:0.1')
self.assertEqual(':'.join(rows[0][2].split(':')[:2]), '1:0.7')

def test_sample_major_allele_not_ref_allele(self):
# make sure we can invert the allele frequency of the isnv
Expand All @@ -347,7 +347,7 @@ def test_sample_major_allele_not_ref_allele(self):
self.assertEqual(rows[0].pos+1, 3)
self.assertEqual(rows[0].ref, 'C')
self.assertEqual(rows[0].alt, 'A')
self.assertEqual(rows[0][0], '1:0.9')
self.assertEqual(':'.join(rows[0][0].split(':')[:2]), '1:0.9')

def test_backfill_sample_from_assembly(self):
# one sample has no isnv, but another does, so we fill it in
Expand All @@ -367,9 +367,9 @@ def test_backfill_sample_from_assembly(self):
self.assertEqual(rows[0].pos+1, 3)
self.assertEqual(rows[0].ref, 'C')
self.assertEqual(rows[0].alt, 'A')
self.assertEqual(rows[0][0], '0:0.1')
self.assertEqual(rows[0][1], '1:1.0')
self.assertEqual(rows[0][2], '.:.')
self.assertEqual(':'.join(rows[0][0].split(':')[:2]), '0:0.1')
self.assertEqual(':'.join(rows[0][1].split(':')[:2]), '1:1.0')
self.assertEqual(rows[0][2], '.:.:.:.')

def test_simple_insertions(self):
# IA, ITCG, etc
Expand All @@ -394,9 +394,9 @@ def test_simple_insertions(self):
self.assertEqual(rows[0].pos+1, 2)
self.assertEqual(rows[0].ref, 'T')
self.assertEqual(rows[0].alt, 'TAA,TGCC,TAT')
self.assertEqual(rows[0][0], '0:0.2,0.0,0.0')
self.assertEqual(rows[0][1], '0:0.0,0.0,0.1')
self.assertEqual(rows[0][2], '0:0.05,0.15,0.0')
self.assertEqual(':'.join(rows[0][0].split(':')[:2]), '0:0.2,0.0,0.0')
self.assertEqual(':'.join(rows[0][1].split(':')[:2]), '0:0.0,0.0,0.1')
self.assertEqual(':'.join(rows[0][2].split(':')[:2]), '0:0.05,0.15,0.0')

def test_simple_deletions(self):
# D1, D2, etc...
Expand All @@ -420,9 +420,9 @@ def test_simple_deletions(self):
self.assertEqual(rows[0].pos+1, 2)
self.assertEqual(rows[0].ref, 'TCG')
self.assertEqual(rows[0].alt, 'TG,T')
self.assertEqual(rows[0][0], '0:0.0,0.2')
self.assertEqual(rows[0][1], '0:0.1,0.0')
self.assertEqual(rows[0][2], '0:0.15,0.05')
self.assertEqual(':'.join(rows[0][0].split(':')[:2]), '0:0.0,0.2')
self.assertEqual(':'.join(rows[0][1].split(':')[:2]), '0:0.1,0.0')
self.assertEqual(':'.join(rows[0][2].split(':')[:2]), '0:0.15,0.05')

def test_deletion_spans_deletion(self):
# sample assembly has deletion against reference and isnv deletes even more
Expand All @@ -439,7 +439,7 @@ def test_deletion_spans_deletion(self):
self.assertEqual(rows[0].pos+1, 4)
self.assertEqual(rows[0].ref, 'GTTC')
self.assertEqual(rows[0].alt, 'GC,G')
self.assertEqual(rows[0][0], '1:0.9,0.1')
self.assertEqual(rows[0][0], '1:0.9,0.1:0,1,1:.,1.0,1.0')

def test_insertion_spans_deletion(self):
# sample assembly has deletion against reference and isnv inserts back into it
Expand All @@ -462,9 +462,9 @@ def test_insertion_spans_deletion(self):
self.assertEqual(rows[0].pos+1, 4)
self.assertEqual(rows[0].ref, 'GTT')
self.assertEqual(rows[0].alt, 'G,GT,GTTC')
self.assertEqual(rows[0][0], '1:0.7,0.3,0.0')
self.assertEqual(rows[0][1], '1:0.8,0.0,0.0')
self.assertEqual(rows[0][2], '1:0.9,0.0,0.1')
self.assertEqual(':'.join(rows[0][0].split(':')[:2]), '1:0.7,0.3,0.0')
self.assertEqual(':'.join(rows[0][1].split(':')[:2]), '1:0.8,0.0,0.0')
self.assertEqual(':'.join(rows[0][2].split(':')[:2]), '1:0.9,0.0,0.1')

def test_snp_within_insertion(self):
# sample assembly has insertion against reference and isnp modifies it
Expand All @@ -486,9 +486,9 @@ def test_snp_within_insertion(self):
self.assertEqual(rows[0].pos+1, 4)
self.assertEqual(rows[0].ref, 'G')
self.assertEqual(rows[0].alt, 'GTT,CTT,GCT,GTC')
self.assertEqual(rows[0][0], '1:0.7,0.3,0.0,0.0')
self.assertEqual(rows[0][1], '1:0.8,0.0,0.2,0.0')
self.assertEqual(rows[0][2], '1:0.9,0.0,0.0,0.1')
self.assertEqual(':'.join(rows[0][0].split(':')[:2]), '1:0.7,0.3,0.0,0.0')
self.assertEqual(':'.join(rows[0][1].split(':')[:2]), '1:0.8,0.0,0.2,0.0')
self.assertEqual(':'.join(rows[0][2].split(':')[:2]), '1:0.9,0.0,0.0,0.1')

def test_2snps_within_insertion_same_sample(self):
# Sample assembly has insertion against reference containing two SNPs
Expand Down Expand Up @@ -525,8 +525,8 @@ def test_deletion_past_end_of_some_consensus(self):
self.assertEqual(rows[0].pos+1, 4)
self.assertEqual(rows[0].ref, 'GAA')
self.assertEqual(rows[0].alt, 'G,A')
self.assertEqual(rows[0][0], '1:0.7,0.3')
self.assertEqual(rows[0][1], '.:.')
self.assertEqual(rows[0][0], '1:0.7,0.3:0,1,1:.,1.0,1.0')
self.assertEqual(rows[0][1], '.:.:.:.')

def test_snp_past_end_of_some_consensus(self):
# Some sample contains SNP beyond the end of the consensus
Expand All @@ -546,8 +546,8 @@ def test_snp_past_end_of_some_consensus(self):
self.assertEqual(rows[0].pos+1, 2)
self.assertEqual(rows[0].ref, 'T')
self.assertEqual(rows[0].alt, 'G')
self.assertEqual(rows[0][0], '0:0.3')
self.assertEqual(rows[0][1], '.:.')
self.assertEqual(rows[0][0], '0:0.3:1,1:1.0,1.0')
self.assertEqual(rows[0][1], '.:.:.:.')

def test_deletion_within_insertion(self):
# sample assembly has insertion against reference and isnv deletes from it
Expand All @@ -574,10 +574,10 @@ def test_deletion_within_insertion(self):
self.assertEqual(rows[0].pos+1, 4)
self.assertEqual(rows[0].ref, 'GG')
self.assertEqual(rows[0].alt, 'GTTG,GTG,GTT,G,GT')
self.assertEqual(rows[0][0], '1:0.4,0.3,0.0,0.1,0.0')
self.assertEqual(rows[0][1], '1:0.8,0.2,0.0,0.0,0.0')
self.assertEqual(rows[0][2], '1:0.85,0.1,0.0,0.0,0.05')
self.assertEqual(rows[0][3], '1:0.9,0.0,0.1,0.0,0.0')
self.assertEqual(':'.join(rows[0][0].split(':')[:2]), '1:0.4,0.3,0.0,0.1,0.0')
self.assertEqual(':'.join(rows[0][1].split(':')[:2]), '1:0.8,0.2,0.0,0.0,0.0')
self.assertEqual(':'.join(rows[0][2].split(':')[:2]), '1:0.85,0.1,0.0,0.0,0.05')
self.assertEqual(':'.join(rows[0][3].split(':')[:2]), '1:0.9,0.0,0.1,0.0,0.0')

def test_insertion_within_insertion(self):
# sample assembly has insertion against reference and isnv puts even more in
Expand All @@ -599,9 +599,9 @@ def test_insertion_within_insertion(self):
self.assertEqual(rows[0].pos+1, 4)
self.assertEqual(rows[0].ref, 'G')
self.assertEqual(rows[0].alt, 'GTT,GATT,GTAT,GTTA')
self.assertEqual(rows[0][0], '1:0.7,0.3,0.0,0.0')
self.assertEqual(rows[0][1], '1:0.8,0.0,0.2,0.0')
self.assertEqual(rows[0][2], '1:0.9,0.0,0.0,0.1')
self.assertEqual(':'.join(rows[0][0].split(':')[:2]), '1:0.7,0.3,0.0,0.0')
self.assertEqual(':'.join(rows[0][1].split(':')[:2]), '1:0.8,0.0,0.2,0.0')
self.assertEqual(':'.join(rows[0][2].split(':')[:2]), '1:0.9,0.0,0.0,0.1')

def test_indel_collapse(self):
# vphaser describes insertions and deletions separately
Expand Down
7 changes: 7 additions & 0 deletions tools/samtools.py
Original file line number Diff line number Diff line change
Expand Up @@ -54,6 +54,13 @@ def execute(self, command, args, stdin=None, stdout=None, stderr=None):
def view(self, args, inFile, outFile, regions=[]):
self.execute('view', args + ['-o', outFile, inFile] + regions)

def merge(self, inFiles, outFile, options=['-f']):
"Merge a list of inFiles to create outFile."
# We are using -f for now because mkstempfname actually makes an empty
# file, and merge fails with that as output target without the -f.
# When mkstempfname is fixed, we should remove the -f.
self.execute('merge', options + [outFile] + inFiles)

def index(self, inBam):
self.execute('index', [inBam])

Expand Down

0 comments on commit 96321bd

Please sign in to comment.