Skip to content

Commit

Permalink
Merge pull request #356 from broadinstitute/hm-add-read-count-to-vcf
Browse files Browse the repository at this point in the history
Add read depth to merged intrahost variation output
  • Loading branch information
dpark01 authored Jun 20, 2016
2 parents 2da1faf + 8016f2e commit e45c1ba
Show file tree
Hide file tree
Showing 2 changed files with 43 additions and 10 deletions.
9 changes: 7 additions & 2 deletions intrahost.py
Original file line number Diff line number Diff line change
Expand Up @@ -474,6 +474,7 @@ def merge_to_vcf(
outf.write('##fileformat=VCFv4.1\n')
outf.write('##FORMAT=<ID=GT,Number=1,Type=String,Description="Genotype">\n')
outf.write('##FORMAT=<ID=AF,Number=A,Type=Float,Description="Allele Frequency">\n')
outf.write('##FORMAT=<ID=DP,Number=1,Type=Integer,Description="Read Depth">\n')
outf.write('##FORMAT=<ID=NL,Number=R,Type=Integer,Description="Number of libraries observed per allele">\n')
outf.write(
'##FORMAT=<ID=LB,Number=R,Type=Float,Description="Library bias observed per allele (Fishers Exact P-value)">\n')
Expand Down Expand Up @@ -686,6 +687,7 @@ def merge_to_vcf(

# define genotypes and fractions
iSNVs = {} # {sample : {allele : fraction, ...}, ...}
iSNVs_read_depth = {} # {sample: read depth}
iSNVs_n_libs = {} # {sample : {allele : n libraries > 0, ...}, ...}
iSNVs_lib_bias = {} # {sample : {allele : pval, ...}, ...}
for s in samplesToUse:
Expand All @@ -711,6 +713,7 @@ def merge_to_vcf(
consAllele = consAlleles[s]
tot_n = sum(acounts.values())
iSNVs[s] = {} # {allele : fraction, ...}
iSNVs_read_depth[s] = tot_n
iSNVs_n_libs[s] = {}
iSNVs_lib_bias[s] = {}
for orig_a, n in acounts.items():
Expand Down Expand Up @@ -794,6 +797,8 @@ def merge_to_vcf(
# AF col emitted below, everything excluding the ref allele (one float per non-ref allele)
freqs = [(s in iSNVs) and ','.join(map(str, [iSNVs[s].get(a, 0.0) for a in alleles[1:]])) or '.'
for s in samplesToUse]
# DP col emitted below
depths = [str(iSNVs_read_depth.get(s, '.')) for s in samplesToUse]
# NL col, everything including the ref allele (one int per allele)
nlibs = [(s in iSNVs_n_libs) and ','.join([str(iSNVs_n_libs[s].get(a, 0)) for a in alleles]) or '.'
for s in samplesToUse]
Expand All @@ -807,8 +812,8 @@ def merge_to_vcf(
c = parse_accession_str(c)
if strip_chr_version:
c = strip_accession_version(c)
out = [c, pos, '.', alleles[0], ','.join(alleles[1:]), '.', '.', '.', 'GT:AF:NL:LB']
out = out + list(map(':'.join, zip(genos, freqs, nlibs, pvals)))
out = [c, pos, '.', alleles[0], ','.join(alleles[1:]), '.', '.', '.', 'GT:AF:DP:NL:LB']
out = out + list(map(':'.join, zip(genos, freqs, depths, nlibs, pvals)))
outf.write('\t'.join(map(str, out)) + '\n')

# compress output if requested
Expand Down
44 changes: 36 additions & 8 deletions test/unit/test_intrahost.py
Original file line number Diff line number Diff line change
Expand Up @@ -356,6 +356,34 @@ def test_simple_snps(self):
self.assertEqual(':'.join(rows[1][1].split(':')[:2]), '0:0.0')
self.assertEqual(':'.join(rows[1][2].split(':')[:2]), '0:0.3')

def test_snps_with_varying_read_depth(self):
merger = VcfMergeRunner([('ref1', 'ATCGGACT')])
merger.add_genome('s1', [('s1_1', 'ATCGGAC')])
# ATCGGAC-
merger.add_genome('s2', [('s2_1', 'TCGGACT')])
# -TCGGACT
merger.add_genome('s3', [('s3_1', 'TCGGACT')])
# -TCGGACT
merger.add_snp('s1', 's1_1', 3, [('C', 60, 100), ('A', 25, 15)])
merger.add_snp('s2', 's2_1', 2, [('C', 12, 6), ('A', 2, 0)])
merger.add_snp('s3', 's3_1', 5, [('A', 7, 4), ('T', 2, 3)])
rows = merger.run_and_get_vcf_rows()
self.assertEqual(len(rows), 2)
self.assertEqual(rows[0].contig, 'ref1')
self.assertEqual(rows[0].pos + 1, 3)
self.assertEqual(rows[0].ref, 'C')
self.assertEqual(rows[0].alt, 'A')
self.assertEqual(':'.join(rows[0][0].split(':')[:3]), '0:0.2:200')
self.assertEqual(':'.join(rows[0][1].split(':')[:3]), '0:0.1:20')
self.assertEqual(':'.join(rows[0][2].split(':')[:3]), '0:0.0:.')
self.assertEqual(rows[1].contig, 'ref1')
self.assertEqual(rows[1].pos + 1, 6)
self.assertEqual(rows[1].ref, 'A')
self.assertEqual(rows[1].alt, 'T')
self.assertEqual(':'.join(rows[1][0].split(':')[:3]), '0:0.0:.')
self.assertEqual(':'.join(rows[1][1].split(':')[:3]), '0:0.0:.')
self.assertEqual(':'.join(rows[1][2].split(':')[:3]), '0:0.3125:16')

def test_snps_downstream_of_indels(self):
merger = VcfMergeRunner([('ref1', 'ATCGGACT')])
merger.add_genome('s1', [('s1_1', 'ATCGTTGACT')])
Expand Down Expand Up @@ -408,7 +436,7 @@ def test_backfill_sample_from_assembly(self):
self.assertEqual(rows[0].alt, 'A')
self.assertEqual(':'.join(rows[0][0].split(':')[:2]), '0:0.1')
self.assertEqual(':'.join(rows[0][1].split(':')[:2]), '1:1.0')
self.assertEqual(rows[0][2], '.:.:.:.')
self.assertEqual(rows[0][2], '.:.:.:.:.')

def test_simple_insertions(self):
# IA, ITCG, etc
Expand Down Expand Up @@ -478,7 +506,7 @@ def test_deletion_spans_deletion(self):
self.assertEqual(rows[0].pos + 1, 4)
self.assertEqual(rows[0].ref, 'GTTC')
self.assertEqual(rows[0].alt, 'GC,G')
self.assertEqual(rows[0][0], '1:0.9,0.1:0,1,1:.,1.0,1.0')
self.assertEqual(rows[0][0], '1:0.9,0.1:200:0,1,1:.,1.0,1.0')

def test_insertion_spans_deletion(self):
# sample assembly has deletion against reference and isnv inserts back into it
Expand Down Expand Up @@ -565,14 +593,14 @@ def test_deletion_past_end_of_some_consensus(self):
self.assertEqual(rows[0].ref, 'GAA')
# multiple options because the allele frequencies can be the same
self.assertIn(rows[0].alt, ['C,G,A', 'G,C,A'])
self.assertIn(rows[0][0], ['2:0.0,0.7,0.3:0,0,1,1:.,.,1.0,1.0', '1:0.7,0.0,0.3:0,1,0,1:.,1.0,.,1.0'])
self.assertIn(rows[0][1], ['1:1.0,0.0,0.0:.:.', '2:0.0,1.0,0.0:.:.'])
self.assertIn(rows[0][0], ['2:0.0,0.7,0.3:200:0,0,1,1:.,.,1.0,1.0', '1:0.7,0.0,0.3:200:0,1,0,1:.,1.0,.,1.0'])
self.assertIn(rows[0][1], ['1:1.0,0.0,0.0:.:.:.', '2:0.0,1.0,0.0:.:.:.'])
self.assertEqual(rows[1].contig, 'ref1')
self.assertEqual(rows[1].pos + 1, 7)
self.assertEqual(rows[1].ref, 'C')
self.assertEqual(rows[1].alt, 'T')
self.assertEqual(rows[1][0], '0:0.0:.:.')
self.assertEqual(rows[1][1], '1:0.8:1,1:1.0,1.0')
self.assertEqual(rows[1][0], '0:0.0:.:.:.')
self.assertEqual(rows[1][1], '1:0.8:200:1,1:1.0,1.0')

def test_snp_past_end_of_some_consensus(self):
# Some sample contains SNP beyond the end of the consensus
Expand All @@ -592,8 +620,8 @@ def test_snp_past_end_of_some_consensus(self):
self.assertEqual(rows[0].pos + 1, 2)
self.assertEqual(rows[0].ref, 'T')
self.assertEqual(rows[0].alt, 'G')
self.assertEqual(rows[0][0], '0:0.3:1,1:1.0,1.0')
self.assertEqual(rows[0][1], '.:.:.:.')
self.assertEqual(rows[0][0], '0:0.3:200:1,1:1.0,1.0')
self.assertEqual(rows[0][1], '.:.:.:.:.')

def test_deletion_within_insertion(self):
# sample assembly has insertion against reference and isnv deletes from it
Expand Down

0 comments on commit e45c1ba

Please sign in to comment.