Skip to content

Commit

Permalink
Merge pull request #112 from broadinstitute/irwin-stats
Browse files Browse the repository at this point in the history
Irwin stats - closes #108, closes #109
  • Loading branch information
dpark01 committed Mar 12, 2015
2 parents 5c38693 + 2eb22e2 commit 0f21fae
Show file tree
Hide file tree
Showing 16 changed files with 647 additions and 133 deletions.
2 changes: 1 addition & 1 deletion interhost.py
Original file line number Diff line number Diff line change
Expand Up @@ -14,7 +14,7 @@
except ImportError :
from itertools import izip_longest as zip_longest
import tools.muscle, tools.snpeff
import util.cmd, util.file, util.vcf, util.misc
import util.cmd, util.file, util.vcf
from collections import OrderedDict, Sequence

log = logging.getLogger(__name__)
Expand Down
25 changes: 13 additions & 12 deletions intrahost.py
Original file line number Diff line number Diff line change
Expand Up @@ -11,7 +11,7 @@
import Bio.AlignIO, Bio.SeqIO, Bio.Data.IUPACData
import pysam
import util.cmd, util.file, util.vcf, util.misc
from util.misc import mean, median, fisher_exact, chi2_contingency
from util.stats import mean, median, fisher_exact, chi2_contingency
from interhost import CoordMapper
from tools.vphaser2 import Vphaser2Tool
from tools.samtools import SamtoolsTool
Expand Down Expand Up @@ -98,8 +98,8 @@ def vphaser_one_sample(inBam, inConsFasta, outTab, vphaserNumThreads = None,
sequence/chrom name, and library counts and p-values appended to
the counts for each allele.
'''
if minReadsEach != None and minReadsEach <= 0:
raise Exception('minReadsEach must be at least 1.')
if minReadsEach != None and minReadsEach < 0:
raise Exception('minReadsEach must be at least 0.')
variantIter = Vphaser2Tool().iterate(inBam, vphaserNumThreads)
filteredIter = filter_strand_bias(variantIter, minReadsEach, maxBias)
libraryFilteredIter = compute_library_bias(filteredIter, inBam, inConsFasta)
Expand All @@ -120,8 +120,9 @@ def filter_strand_bias(isnvs, minReadsEach = None, maxBias = None) :
front = row[:alleleCol]
for fieldInd in range(len(row) - 1, alleleCol - 1, -1) :
f, r = AlleleFieldParser(row[fieldInd]).strand_counts()
if not (int(f)>=minReadsEach and int(r)>=minReadsEach
and maxBias >= (float(f)/float(r)) >= 1.0/maxBias) :
if (int(f)<minReadsEach or int(r)<minReadsEach
or (minReadsEach > 0 and not
(maxBias >= (float(f)/float(r)) >= 1.0/maxBias))) :
del row[fieldInd]
if len(row) > alleleCol + 1:
row[alleleCol:] = sorted(row[alleleCol:],
Expand Down Expand Up @@ -171,13 +172,14 @@ def compute_library_bias(isnvs, inBam, inConsFasta) :
contingencyTable = [
[ countsRow[alleleInd] for countsRow in countsMatrix],
[sum(countsRow) - countsRow[alleleInd] for countsRow in countsMatrix]]
if len(libCounts) < 2 :
rowSums = map(sum, contingencyTable)
dofs = len(libCounts) - 1
if dofs < 1 :
pval = 1.0
elif len(libCounts) == 2 :
elif min(rowSums) ** dofs / dofs < 10000 :
# At this cutoff, fisher_exact should take <~ 0.1 sec
pval = fisher_exact(contingencyTable)
else :
# The following is not recommended if any of the counts or
# expected counts are less than 5.
pval = chi2_contingency(contingencyTable)
row[alleleCol + alleleInd] = str(AlleleFieldParser(None,
*(row[alleleCol + alleleInd].split(':') +
Expand Down Expand Up @@ -261,11 +263,10 @@ def parser_vphaser_one_sample(parser = argparse.ArgumentParser()) :
parser.add_argument("--vphaserNumThreads", type = int, default = None,
help="Number of threads in call to V-Phaser 2.")
parser.add_argument("--minReadsEach", type = int, default = defaultMinReads,
help = """Minimum number of reads on each strand (default: %(default)s).
Must be at least 1.""")
help = "Minimum number of reads on each strand (default: %(default)s).")
parser.add_argument("--maxBias", type = int, default = defaultMaxBias,
help = """Maximum allowable ratio of number of reads on the two strands
(default: %(default)s).""")
(default: %(default)s). Ignored if minReadsEach = 0.""")
util.cmd.common_args(parser, (('loglevel', None), ('version', None)))
util.cmd.attach_main(parser, vphaser_one_sample, split_args = True)
return parser
Expand Down
2 changes: 1 addition & 1 deletion reports.py
Original file line number Diff line number Diff line change
Expand Up @@ -10,7 +10,7 @@
import util.cmd, util.file, util.misc
import tools.samtools
import assembly
from util.misc import mean, median
from util.stats import mean, median

log = logging.getLogger(__name__)

Expand Down
Binary file removed test/input/TestPerSample/in.2libs.bam.bti
Binary file not shown.
Binary file added test/input/TestPerSample/in.3libs.bam
Binary file not shown.
Binary file added test/input/TestPerSample/in.3libs.bam.bai
Binary file not shown.
Binary file removed test/input/TestPerSample/in.bam.bti
Binary file not shown.
Binary file added test/input/TestPerSample/in.indels.bam
Binary file not shown.
Binary file added test/input/TestPerSample/in.indels.bam.bai
Binary file not shown.
317 changes: 317 additions & 0 deletions test/input/TestPerSample/ref.indels.fasta

Large diffs are not rendered by default.

1 change: 1 addition & 0 deletions test/input/TestPerSample/ref.indels.fasta.fai
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
Seed-stock-137_S2_L001_001 18950 28 60 61
15 changes: 15 additions & 0 deletions test/input/TestPerSample/vphaser_one_sample_3libs_expected.txt
Original file line number Diff line number Diff line change
@@ -0,0 +1,15 @@
chr1 55 T C 0.8156 snp 16.1017 C:65:34:45:21:11:8:9:5:0.425 T:13:6:10:6:2:0:1:0:0.425
chr1 483 T C 0.5142 snp 23.6111 C:34:21:24:11:5:5:5:5:0.9192 T:8:9:5:7:1:1:2:1:0.9192
chr1 573 T C 0.5073 snp 13.7405 C:66:47:50:29:12:10:4:8:0.1632 T:12:6:6:3:3:2:3:1:0.1632
chr1 660 C T 0.2772 snp 44.2724 T:109:71:79:43:18:14:12:14:0.6371 C:78:65:55:48:13:11:10:6:0.6371
chr1 999 T C 0.9182 snp 12.8478 C:527:538:349:356:81:97:97:85:0.3398 T:80:77:60:52:10:15:10:10:0.3398
chr1 1117 A G 0.838 snp 15.0547 G:449:482:304:327:64:81:81:74:0.7364 A:84:81:57:50:13:14:14:17:0.7364
chr1 1293 T C 0.09971 snp 17.2697 C:258:245:183:172:32:37:43:36:0.3501 T:63:42:42:29:10:10:11:3:0.3501
chr2 8766 T C 0.6568 snp 8.33333 C:132:99:96:67:15:15:21:18:0.5642 T:13:8:8:5:3:1:2:2:0.5642
chr2 8808 T C 0.1318 snp 16.8421 C:130:107:93:74:15:17:22:16:0.7091 T:32:16:22:10:6:0:4:6:0.7091
chr2 9123 T C 0.6938 snp 14.5161 C:150:168:100:113:25:21:25:34:0.7732 T:24:30:18:21:3:4:3:5:0.7732
chr2 9151 T C 0.4294 snp 15 C:142:147:98:100:20:18:24:31:0.801 T:22:29:15:20:4:4:3:5:0.801
chr2 9272 G A 0.2018 snp 17.5214 A:82:111:53:75:14:12:15:24:0.8461 G:28:13:22:5:3:4:4:4:0.8461
chr2 9687 T C 0.7036 snp 11.8182 C:67:127:43:85:14:19:10:23:0.9523 T:8:18:6:11:1:4:1:3:0.9523
chr2 9912 T C 0.1715 snp 24.8744 C:104:195:74:123:18:34:12:38:0.9697 T:42:57:27:39:7:9:8:9:0.9697
chr2 9960 C T 0.3394 snp 11.9481 T:120:219:80:144:22:32:18:43:0.5227 C:13:33:8:22:2:8:3:3:0.5227
Original file line number Diff line number Diff line change
@@ -0,0 +1,2 @@
Seed-stock-137_S2_L001_001 6911 IA d 0.1453 lp 5.04492 d:1059:315:1132:328:1 IA:56:10:56:10:1 IAA:6:0:6:0:1 IAAA:1:0:1:0:1
Seed-stock-137_S2_L001_001 13181 D1 i 0.3567 lp 5.71688 i:879:160:939:166:1 D1:56:7:56:7:1
74 changes: 53 additions & 21 deletions test/unit/test_intrahost.py
Original file line number Diff line number Diff line change
Expand Up @@ -28,9 +28,8 @@ def __init__(self):
self.isnvs = {}
self.chroms = []
def add_snp(self, chrom, pos, acounts, libinfo=None):
''' Add an iSNP at this chrom,pos. acounts is a list of triples:
(allele, fwd count, rev count).
'''
# Add an iSNP at this chrom,pos. acounts is a list of triples:
# (allele, fwd count, rev count).
assert type(pos) == int and pos>0 and len(acounts)>1
for a,f,r in acounts:
assert a in ('A','C','G','T')
Expand Down Expand Up @@ -128,20 +127,18 @@ def test_single_strand_bias_hard_filter(self):
self.assertEqual(output[0][7:], expected[7:])

def test_vphaser_one_sample(self):
"""
Files here were created as follows:
- in.bam was copied from input directory for TestVPhaser2; see notes
there on how it was created.
- ref.fasta was created by making two identical chromosomes, chr1
and chr2, with the sequence from West Nile virus isolate
WNV-1/US/BID-V7821/2011. That genome is a guess for the reference
of the V-Phaser 2 test file because BLAST matches the reads to
West Nile virus and that isolate has the size reported in the bam file.
Note that ref.fasta is not exactly the consensus of the reads in in.bam;
for example, pos 660 is C in ref.fasta, but more reads have T there
than C in in.bam. So we are actually testing the case that
V-Phaser 2 consensus != our consensus.
"""
# Files here were created as follows:
# - in.bam was copied from input directory for TestVPhaser2; see notes
# there on how it was created.
# - ref.fasta was created by making two identical chromosomes, chr1
# and chr2, with the sequence from West Nile virus isolate
# WNV-1/US/BID-V7821/2011. That genome is a guess for the reference
# of the V-Phaser 2 test file because BLAST matches the reads to
# West Nile virus and that isolate has the size reported in the bam file.
# Note that ref.fasta is not exactly the consensus of the reads in in.bam;
# for example, pos 660 is C in ref.fasta, but more reads have T there
# than C in in.bam. So we are actually testing the case that
# V-Phaser 2 consensus != our consensus.
myInputDir = util.file.get_test_input_path(self)
inBam = os.path.join(myInputDir, 'in.bam')
refFasta = os.path.join(myInputDir, 'ref.fasta')
Expand All @@ -151,11 +148,29 @@ def test_vphaser_one_sample(self):
expected = os.path.join(myInputDir, 'vphaser_one_sample_expected.txt')
self.assertEqualContents(outTab, expected)

def test_vphaser_one_sample_indels(self):
    # Provenance of the input files:
    #   ref.indels.fasta is Seed-stock-137_S2_L001_001.fasta.
    #   in.indels.bam was derived from Seed-stock-137_S2_L001_001.mapped.bam:
    #     two .sam files were made with samtools view over the ranges
    #     6811-7011 and 13081-13281; paired reads were removed by dropping
    #     the second occurrence of each read name and AND-ing the flag
    #     fields with 16; a random 90% of reads were then discarded, keeping
    #     any read containing the indel variants at 6911 and 13181; finally
    #     the two files were combined.
    inputDir = util.file.get_test_input_path(self)
    bamFile = os.path.join(inputDir, 'in.indels.bam')
    fastaFile = os.path.join(inputDir, 'ref.indels.fasta')
    outFile = util.file.mkstempfname('.txt')
    intrahost.vphaser_one_sample(bamFile, fastaFile, outFile,
                                 vphaserNumThreads = 4, minReadsEach = 0)
    expectedFile = os.path.join(inputDir, 'vphaser_one_sample_indels_expected.txt')
    self.assertEqualContents(outFile, expectedFile)

def test_vphaser_one_sample_2libs(self):
"""
in.2libs.bam was created by "manually" editing in.bam and moving about
1/3 of the reads to ReadGroup2.
"""
# in.2libs.bam was created by "manually" editing in.bam and moving about
# 1/3 of the reads to ReadGroup2.
myInputDir = util.file.get_test_input_path(self)
inBam = os.path.join(myInputDir, 'in.2libs.bam')
refFasta = os.path.join(myInputDir, 'ref.fasta')
Expand All @@ -165,6 +180,23 @@ def test_vphaser_one_sample_2libs(self):
expected = os.path.join(myInputDir, 'vphaser_one_sample_2libs_expected.txt')
self.assertEqualContents(outTab, expected)

def test_vphaser_one_sample_3libs_and_chi2(self):
    # Besides exercising 3 libraries, this tests the chi2_contingency
    # approximation to fisher_exact: the 4th, 5th, and 6th rows have a
    # minor allele count large enough that their p-values come from the
    # chi2 approximation, while the remaining rows exercise the 2 x 3
    # case of fisher_exact.
    # in.3libs.bam was made by "manually" editing in.2libs.bam and moving
    # roughly half of the ReadGroup2 reads into ReadGroup3.
    inputDir = util.file.get_test_input_path(self)
    bamFile = os.path.join(inputDir, 'in.3libs.bam')
    fastaFile = os.path.join(inputDir, 'ref.fasta')
    outFile = util.file.mkstempfname('.txt')
    intrahost.vphaser_one_sample(bamFile, fastaFile, outFile,
                                 vphaserNumThreads = 4, minReadsEach = 6, maxBias = 3)
    expectedFile = os.path.join(inputDir, 'vphaser_one_sample_3libs_expected.txt')
    self.assertEqualContents(outFile, expectedFile)

@unittest.skip('not implemented')
def test_single_lib(self):
data = MockVphaserOutput()
Expand Down
98 changes: 0 additions & 98 deletions util/misc.py
Original file line number Diff line number Diff line change
@@ -1,8 +1,6 @@
'''A few miscellaneous tools. '''
from __future__ import division # Division of integers with / should never round!
from math import exp, log, pi
import itertools
# import scipy # Commented out until installation on Travis is working

__author__ = "[email protected]"

Expand Down Expand Up @@ -59,28 +57,6 @@ def intervals(i, n, l):
stop = l
return (start,stop)


try:
    # Python 3.4+ ships these in the standard library.
    from statistics import mean, median
except ImportError:
    # Python <3.4 fallback: avoid a numpy dependency when these two
    # functions are all we really need.
    def mean(l):
        """Return the arithmetic mean of a non-empty sequence.

        Raises Exception on an empty input (matching the historical
        behavior of this module, not statistics.StatisticsError).
        """
        if len(l) == 0:
            raise Exception("empty list for mean")
        return float(sum(l)) / len(l)

    def median(l):
        """Return the median of a non-empty sequence; the mean of the two
        middle values is returned for even-length input.

        Fix: operate on a sorted copy instead of calling l.sort(), which
        mutated the caller's list in place — a surprising side effect and
        inconsistent with statistics.median on Python 3.4+.
        Raises Exception on an empty input.
        """
        if len(l) == 0:
            raise Exception("empty list for median")
        ordered = sorted(l)  # do not mutate the caller's sequence
        half = len(ordered) // 2
        if len(ordered) % 2 == 0:
            return (ordered[half - 1] + ordered[half]) / 2.0
        else:
            return ordered[half]

# from http://stackoverflow.com/a/312467
def batch_iterator(iterator, batch_size) :
"""Returns lists of length batch_size.
Expand All @@ -100,77 +76,3 @@ def batch_iterator(iterator, batch_size) :
yield item
item = list(itertools.islice(it, batch_size))

def chi2_contingency(contingencyTable) :
    """ Return two-tailed p-value for an n x m contingency table using the
    chi-square distribution. Not recommended if any of the counts or
    expected counts are less than 5. Input is a sequence of n sequences
    of size m.

    This replaces a stub that unconditionally returned 1.0 (left over from
    when scipy could not be installed); it computes Pearson's chi-squared
    statistic and its p-value using only the standard library.
    NOTE(review): no Yates continuity correction is applied, whereas
    scipy.stats.chi2_contingency applies one for 2 x 2 tables by default —
    confirm which behavior callers expect.
    """
    # Local import: lgamma is not among this module's top-of-file math imports.
    from math import exp, log, lgamma

    def regularized_gamma_q(s, x) :
        # Regularized upper incomplete gamma function Q(s, x) for s > 0,
        # x >= 0: power series for P(s, x) when x < s + 1, modified Lentz
        # continued fraction for Q otherwise (Numerical Recipes, sec. 6.2).
        if x <= 0.0 :
            return 1.0
        logPrefix = -x + s * log(x) - lgamma(s)
        if x < s + 1.0 :
            term = 1.0 / s
            total = term
            denom = s
            while abs(term) > abs(total) * 1e-16 :
                denom += 1.0
                term *= x / denom
                total += term
            return 1.0 - total * exp(logPrefix)
        tiny = 1e-300
        b = x + 1.0 - s
        c = 1.0 / tiny
        d = 1.0 / b
        h = d
        for i in range(1, 1000) :
            an = -i * (i - s)
            b += 2.0
            d = an * d + b
            if abs(d) < tiny :
                d = tiny
            c = b + an / c
            if abs(c) < tiny :
                c = tiny
            d = 1.0 / d
            delta = d * c
            h *= delta
            if abs(delta - 1.0) < 1e-16 :
                break
        return exp(logPrefix) * h

    rowSums = [sum(row) for row in contingencyTable]
    colSums = [sum(col) for col in zip(*contingencyTable)]
    total = sum(rowSums)
    dof = (len(rowSums) - 1) * (len(colSums) - 1)
    if total == 0 or dof < 1 :
        # Degenerate table: no evidence of association.
        return 1.0
    chi2 = 0.0
    for rowSum, row in zip(rowSums, contingencyTable) :
        for colSum, observed in zip(colSums, row) :
            expected = rowSum * colSum / float(total)
            if expected > 0 :
                chi2 += (observed - expected) ** 2 / expected
    # p-value is the survival function of the chi-square distribution
    # with dof degrees of freedom: Q(dof/2, chi2/2).
    return regularized_gamma_q(dof / 2.0, chi2 / 2.0)

def log_stirling(n) :
    """Stirling's approximation to log(n!), with correction terms through
    the n^7 term.  Rounds to the exact value of n! for 16! and below and is
    correct to about 10 significant digits for 5! and above."""
    n_sq = n * n
    n_cu = n * n_sq
    n_5th = n_cu * n_sq
    n_7th = n_5th * n_sq
    # Leading Stirling terms: n log n - n + (1/2) log(2 pi n).
    leading = n * log(n) - n + 0.5 * log(2 * pi * n)
    # Asymptotic correction series in odd reciprocal powers of n.
    correction = (1 / 12.0 / n - 1 / 360.0 / n_cu
                  + 1 / 1260.0 / n_5th - 1 / 1680.0 / n_7th)
    return leading + correction

def fisher_exact(contingencyTable2x2) :
    """ Return the two-tailed p-value of Fisher's exact test for a 2 x 2
    contingency table.  Input is a sequence of 2 sequences of size 2.
    """
    # Pure-python implementation, accurate to around 1e-11 and much faster
    # than the scipy equivalent.  Should scipy ever be preferred, the call
    # would be: scipy.stats.fisher_exact(contingencyTable)[1]

    topRow, bottomRow = contingencyTable2x2[0], contingencyTable2x2[1]
    a, b = topRow[0], topRow[1]
    c, d = bottomRow[0], bottomRow[1]

    def logChoose(n, k) :
        # log(n choose k): exact summation for small k, Stirling otherwise.
        k = min(k, n - k)
        if k <= 10 :
            acc = 0.0
            for ii in range(1, k + 1) :
                acc += log(n - ii + 1) - log(ii)
            return acc
        return log_stirling(n) - log_stirling(k) - log_stirling(n - k)

    rowTotal = a + b       # size s of the first subset
    colTotal = a + c       # size t of the second subset
    grandTotal = a + b + c + d
    logChooseNT = logChoose(grandTotal, colTotal)

    def hypergeomProb(overlap) :
        # Probability that the intersection of subsets of sizes rowTotal
        # and colTotal of a grandTotal-element set has size exactly overlap.
        if max(0, rowTotal + colTotal - grandTotal) <= overlap <= min(rowTotal, colTotal) :
            return exp(logChoose(rowTotal, overlap)
                       + logChoose(grandTotal - rowTotal, colTotal - overlap)
                       - logChooseNT)
        return 0

    p0 = hypergeomProb(a)
    result = p0

    # Two-tailed p: add the probability of every table at least as extreme,
    # scanning up from 0 and down from min(rowTotal, colTotal); each scan
    # stops as soon as a table more probable than the observed one appears.
    for smaller in range(a) :
        prob = hypergeomProb(smaller)
        if prob > p0 :
            break
        result += prob
    for larger in range(min(rowTotal, colTotal), a, -1) :
        prob = hypergeomProb(larger)
        if prob > p0 :
            break
        result += prob

    return result

Loading

0 comments on commit 0f21fae

Please sign in to comment.