Skip to content

Commit

Permalink
Merge pull request #112 from broadinstitute/irwin-stats
Browse files Browse the repository at this point in the history
Irwin stats - closes #108, closes #109
  • Loading branch information
dpark01 committed Mar 12, 2015
2 parents 5c38693 + 2eb22e2 commit 0f21fae
Show file tree
Hide file tree
Showing 16 changed files with 647 additions and 133 deletions.
2 changes: 1 addition & 1 deletion interhost.py
Original file line number Diff line number Diff line change
Expand Up @@ -14,7 +14,7 @@
except ImportError :
from itertools import izip_longest as zip_longest
import tools.muscle, tools.snpeff
import util.cmd, util.file, util.vcf, util.misc
import util.cmd, util.file, util.vcf
from collections import OrderedDict, Sequence

log = logging.getLogger(__name__)
Expand Down
25 changes: 13 additions & 12 deletions intrahost.py
Original file line number Diff line number Diff line change
Expand Up @@ -11,7 +11,7 @@
import Bio.AlignIO, Bio.SeqIO, Bio.Data.IUPACData
import pysam
import util.cmd, util.file, util.vcf, util.misc
from util.misc import mean, median, fisher_exact, chi2_contingency
from util.stats import mean, median, fisher_exact, chi2_contingency
from interhost import CoordMapper
from tools.vphaser2 import Vphaser2Tool
from tools.samtools import SamtoolsTool
Expand Down Expand Up @@ -98,8 +98,8 @@ def vphaser_one_sample(inBam, inConsFasta, outTab, vphaserNumThreads = None,
sequence/chrom name, and library counts and p-values appended to
the counts for each allele.
'''
if minReadsEach != None and minReadsEach <= 0:
raise Exception('minReadsEach must be at least 1.')
if minReadsEach != None and minReadsEach < 0:
raise Exception('minReadsEach must be at least 0.')
variantIter = Vphaser2Tool().iterate(inBam, vphaserNumThreads)
filteredIter = filter_strand_bias(variantIter, minReadsEach, maxBias)
libraryFilteredIter = compute_library_bias(filteredIter, inBam, inConsFasta)
Expand All @@ -120,8 +120,9 @@ def filter_strand_bias(isnvs, minReadsEach = None, maxBias = None) :
front = row[:alleleCol]
for fieldInd in range(len(row) - 1, alleleCol - 1, -1) :
f, r = AlleleFieldParser(row[fieldInd]).strand_counts()
if not (int(f)>=minReadsEach and int(r)>=minReadsEach
and maxBias >= (float(f)/float(r)) >= 1.0/maxBias) :
if (int(f)<minReadsEach or int(r)<minReadsEach
or (minReadsEach > 0 and not
(maxBias >= (float(f)/float(r)) >= 1.0/maxBias))) :
del row[fieldInd]
if len(row) > alleleCol + 1:
row[alleleCol:] = sorted(row[alleleCol:],
Expand Down Expand Up @@ -171,13 +172,14 @@ def compute_library_bias(isnvs, inBam, inConsFasta) :
contingencyTable = [
[ countsRow[alleleInd] for countsRow in countsMatrix],
[sum(countsRow) - countsRow[alleleInd] for countsRow in countsMatrix]]
if len(libCounts) < 2 :
rowSums = map(sum, contingencyTable)
dofs = len(libCounts) - 1
if dofs < 1 :
pval = 1.0
elif len(libCounts) == 2 :
elif min(rowSums) ** dofs / dofs < 10000 :
# At this cutoff, fisher_exact should take <~ 0.1 sec
pval = fisher_exact(contingencyTable)
else :
# The following is not recommended if any of the counts or
# expected counts are less than 5.
pval = chi2_contingency(contingencyTable)
row[alleleCol + alleleInd] = str(AlleleFieldParser(None,
*(row[alleleCol + alleleInd].split(':') +
Expand Down Expand Up @@ -261,11 +263,10 @@ def parser_vphaser_one_sample(parser = argparse.ArgumentParser()) :
parser.add_argument("--vphaserNumThreads", type = int, default = None,
help="Number of threads in call to V-Phaser 2.")
parser.add_argument("--minReadsEach", type = int, default = defaultMinReads,
help = """Minimum number of reads on each strand (default: %(default)s).
Must be at least 1.""")
help = "Minimum number of reads on each strand (default: %(default)s).")
parser.add_argument("--maxBias", type = int, default = defaultMaxBias,
help = """Maximum allowable ratio of number of reads on the two strands
(default: %(default)s).""")
(default: %(default)s). Ignored if minReadsEach = 0.""")
util.cmd.common_args(parser, (('loglevel', None), ('version', None)))
util.cmd.attach_main(parser, vphaser_one_sample, split_args = True)
return parser
Expand Down
2 changes: 1 addition & 1 deletion reports.py
Original file line number Diff line number Diff line change
Expand Up @@ -10,7 +10,7 @@
import util.cmd, util.file, util.misc
import tools.samtools
import assembly
from util.misc import mean, median
from util.stats import mean, median

log = logging.getLogger(__name__)

Expand Down
Binary file removed test/input/TestPerSample/in.2libs.bam.bti
Binary file not shown.
Binary file added test/input/TestPerSample/in.3libs.bam
Binary file not shown.
Binary file added test/input/TestPerSample/in.3libs.bam.bai
Binary file not shown.
Binary file removed test/input/TestPerSample/in.bam.bti
Binary file not shown.
Binary file added test/input/TestPerSample/in.indels.bam
Binary file not shown.
Binary file added test/input/TestPerSample/in.indels.bam.bai
Binary file not shown.
317 changes: 317 additions & 0 deletions test/input/TestPerSample/ref.indels.fasta

Large diffs are not rendered by default.

1 change: 1 addition & 0 deletions test/input/TestPerSample/ref.indels.fasta.fai
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
Seed-stock-137_S2_L001_001 18950 28 60 61
15 changes: 15 additions & 0 deletions test/input/TestPerSample/vphaser_one_sample_3libs_expected.txt
Original file line number Diff line number Diff line change
@@ -0,0 +1,15 @@
chr1 55 T C 0.8156 snp 16.1017 C:65:34:45:21:11:8:9:5:0.425 T:13:6:10:6:2:0:1:0:0.425
chr1 483 T C 0.5142 snp 23.6111 C:34:21:24:11:5:5:5:5:0.9192 T:8:9:5:7:1:1:2:1:0.9192
chr1 573 T C 0.5073 snp 13.7405 C:66:47:50:29:12:10:4:8:0.1632 T:12:6:6:3:3:2:3:1:0.1632
chr1 660 C T 0.2772 snp 44.2724 T:109:71:79:43:18:14:12:14:0.6371 C:78:65:55:48:13:11:10:6:0.6371
chr1 999 T C 0.9182 snp 12.8478 C:527:538:349:356:81:97:97:85:0.3398 T:80:77:60:52:10:15:10:10:0.3398
chr1 1117 A G 0.838 snp 15.0547 G:449:482:304:327:64:81:81:74:0.7364 A:84:81:57:50:13:14:14:17:0.7364
chr1 1293 T C 0.09971 snp 17.2697 C:258:245:183:172:32:37:43:36:0.3501 T:63:42:42:29:10:10:11:3:0.3501
chr2 8766 T C 0.6568 snp 8.33333 C:132:99:96:67:15:15:21:18:0.5642 T:13:8:8:5:3:1:2:2:0.5642
chr2 8808 T C 0.1318 snp 16.8421 C:130:107:93:74:15:17:22:16:0.7091 T:32:16:22:10:6:0:4:6:0.7091
chr2 9123 T C 0.6938 snp 14.5161 C:150:168:100:113:25:21:25:34:0.7732 T:24:30:18:21:3:4:3:5:0.7732
chr2 9151 T C 0.4294 snp 15 C:142:147:98:100:20:18:24:31:0.801 T:22:29:15:20:4:4:3:5:0.801
chr2 9272 G A 0.2018 snp 17.5214 A:82:111:53:75:14:12:15:24:0.8461 G:28:13:22:5:3:4:4:4:0.8461
chr2 9687 T C 0.7036 snp 11.8182 C:67:127:43:85:14:19:10:23:0.9523 T:8:18:6:11:1:4:1:3:0.9523
chr2 9912 T C 0.1715 snp 24.8744 C:104:195:74:123:18:34:12:38:0.9697 T:42:57:27:39:7:9:8:9:0.9697
chr2 9960 C T 0.3394 snp 11.9481 T:120:219:80:144:22:32:18:43:0.5227 C:13:33:8:22:2:8:3:3:0.5227
Original file line number Diff line number Diff line change
@@ -0,0 +1,2 @@
Seed-stock-137_S2_L001_001 6911 IA d 0.1453 lp 5.04492 d:1059:315:1132:328:1 IA:56:10:56:10:1 IAA:6:0:6:0:1 IAAA:1:0:1:0:1
Seed-stock-137_S2_L001_001 13181 D1 i 0.3567 lp 5.71688 i:879:160:939:166:1 D1:56:7:56:7:1
74 changes: 53 additions & 21 deletions test/unit/test_intrahost.py
Original file line number Diff line number Diff line change
Expand Up @@ -28,9 +28,8 @@ def __init__(self):
self.isnvs = {}
self.chroms = []
def add_snp(self, chrom, pos, acounts, libinfo=None):
''' Add an iSNP at this chrom,pos. acounts is a list of triples:
(allele, fwd count, rev count).
'''
# Add an iSNP at this chrom,pos. acounts is a list of triples:
# (allele, fwd count, rev count).
assert type(pos) == int and pos>0 and len(acounts)>1
for a,f,r in acounts:
assert a in ('A','C','G','T')
Expand Down Expand Up @@ -128,20 +127,18 @@ def test_single_strand_bias_hard_filter(self):
self.assertEqual(output[0][7:], expected[7:])

def test_vphaser_one_sample(self):
"""
Files here were created as follows:
- in.bam was copied from input directory for TestVPhaser2; see notes
there on how it was created.
- ref.fasta was created by making two identical chromosomes, chr1
and chr2, with the sequence from West Nile virus isolate
WNV-1/US/BID-V7821/2011. That genome is a guess for the reference
of the V-Phaser 2 test file because BLAST matches the reads to
West Nile virus and that isolate has the size reported in the bam file.
Note that ref.fasta is not exactly the consensus of the reads in in.bam;
for example, pos 660 is C in ref.fasta, but more reads have T there
than C in in.bam. So we are actually testing the case that
V-Phaser 2 consensus != our consensus.
"""
# Files here were created as follows:
# - in.bam was copied from input directory for TestVPhaser2; see notes
# there on how it was created.
# - ref.fasta was created by making two identical chromosomes, chr1
# and chr2, with the sequence from West Nile virus isolate
# WNV-1/US/BID-V7821/2011. That genome is a guess for the reference
# of the V-Phaser 2 test file because BLAST matches the reads to
# West Nile virus and that isolate has the size reported in the bam file.
# Note that ref.fasta is not exactly the consensus of the reads in in.bam;
# for example, pos 660 is C in ref.fasta, but more reads have T there
# than C in in.bam. So we are actually testing the case that
# V-Phaser 2 consensus != our consensus.
myInputDir = util.file.get_test_input_path(self)
inBam = os.path.join(myInputDir, 'in.bam')
refFasta = os.path.join(myInputDir, 'ref.fasta')
Expand All @@ -151,11 +148,29 @@ def test_vphaser_one_sample(self):
expected = os.path.join(myInputDir, 'vphaser_one_sample_expected.txt')
self.assertEqualContents(outTab, expected)

def test_vphaser_one_sample_indels(self):
    # Provenance of the input files:
    #   ref.indels.fasta is Seed-stock-137_S2_L001_001.fasta.
    #   in.indels.bam was derived from Seed-stock-137_S2_L001_001.mapped.bam:
    #     two .sam files were made with samtools view over the ranges
    #     6811-7011 and 13081-13281; paired reads were removed by dropping
    #     the second occurrence of each read name and AND-ing the flag
    #     fields with 16; a random 90% of reads were then discarded, keeping
    #     any read containing the indel variants at 6911 and 13181; finally
    #     the two files were combined.
    inputDir = util.file.get_test_input_path(self)
    bamFile = os.path.join(inputDir, 'in.indels.bam')
    fastaFile = os.path.join(inputDir, 'ref.indels.fasta')
    outFile = util.file.mkstempfname('.txt')
    intrahost.vphaser_one_sample(bamFile, fastaFile, outFile,
                                 vphaserNumThreads = 4, minReadsEach = 0)
    expectedFile = os.path.join(inputDir, 'vphaser_one_sample_indels_expected.txt')
    self.assertEqualContents(outFile, expectedFile)

def test_vphaser_one_sample_2libs(self):
"""
in.2libs.bam was created by "manually" editing in.bam and moving about
1/3 of the reads to ReadGroup2.
"""
# in.2libs.bam was created by "manually" editing in.bam and moving about
# 1/3 of the reads to ReadGroup2.
myInputDir = util.file.get_test_input_path(self)
inBam = os.path.join(myInputDir, 'in.2libs.bam')
refFasta = os.path.join(myInputDir, 'ref.fasta')
Expand All @@ -165,6 +180,23 @@ def test_vphaser_one_sample_2libs(self):
expected = os.path.join(myInputDir, 'vphaser_one_sample_2libs_expected.txt')
self.assertEqualContents(outTab, expected)

def test_vphaser_one_sample_3libs_and_chi2(self):
    # Besides exercising 3 libraries, this tests the chi2_contingency
    # approximation to fisher_exact: the 4th, 5th, and 6th rows have a
    # minor allele count large enough that their p-values come from the
    # chi2 approximation, while the remaining rows exercise the 2 x 3
    # case of fisher_exact.
    # in.3libs.bam was made by "manually" editing in.2libs.bam and moving
    # roughly half of the ReadGroup2 reads into ReadGroup3.
    inputDir = util.file.get_test_input_path(self)
    bamFile = os.path.join(inputDir, 'in.3libs.bam')
    fastaFile = os.path.join(inputDir, 'ref.fasta')
    outFile = util.file.mkstempfname('.txt')
    intrahost.vphaser_one_sample(bamFile, fastaFile, outFile,
                                 vphaserNumThreads = 4, minReadsEach = 6, maxBias = 3)
    expectedFile = os.path.join(inputDir, 'vphaser_one_sample_3libs_expected.txt')
    self.assertEqualContents(outFile, expectedFile)

@unittest.skip('not implemented')
def test_single_lib(self):
data = MockVphaserOutput()
Expand Down
98 changes: 0 additions & 98 deletions util/misc.py
Original file line number Diff line number Diff line change
@@ -1,8 +1,6 @@
'''A few miscellaneous tools. '''
from __future__ import division # Division of integers with / should never round!
from math import exp, log, pi
import itertools
# import scipy # Commented out until installation on Travis is working

__author__ = "[email protected]"

Expand Down Expand Up @@ -59,28 +57,6 @@ def intervals(i, n, l):
stop = l
return (start,stop)


try:
    # Python 3.4+ ships these in the standard library.
    from statistics import mean, median
except ImportError:
    # Python <3.4 fallback: avoid a numpy dependency when these two
    # functions are all we really need.
    def mean(l):
        """Return the arithmetic mean of a non-empty sequence.

        Raises Exception on an empty input (matching the historical
        behavior of this module, not statistics.StatisticsError).
        """
        if len(l) == 0:
            raise Exception("empty list for mean")
        return float(sum(l)) / len(l)

    def median(l):
        """Return the median of a non-empty sequence; the mean of the two
        middle values is returned for even-length input.

        Fix: operate on a sorted copy instead of calling l.sort(), which
        mutated the caller's list in place — a surprising side effect and
        inconsistent with statistics.median on Python 3.4+.
        Raises Exception on an empty input.
        """
        if len(l) == 0:
            raise Exception("empty list for median")
        ordered = sorted(l)  # do not mutate the caller's sequence
        half = len(ordered) // 2
        if len(ordered) % 2 == 0:
            return (ordered[half - 1] + ordered[half]) / 2.0
        else:
            return ordered[half]

# from http://stackoverflow.com/a/312467
def batch_iterator(iterator, batch_size) :
"""Returns lists of length batch_size.
Expand All @@ -100,77 +76,3 @@ def batch_iterator(iterator, batch_size) :
yield item
item = list(itertools.islice(it, batch_size))

def chi2_contingency(contingencyTable) :
    """ Return two-tailed p-value for an n x m contingency table using the
    chi-square distribution. Not recommended if any of the counts or
    expected counts are less than 5. Input is a sequence of n sequences
    of size m.

    This replaces a stub that unconditionally returned 1.0 (left over from
    when scipy could not be installed); it computes Pearson's chi-squared
    statistic and its p-value using only the standard library.
    NOTE(review): no Yates continuity correction is applied, whereas
    scipy.stats.chi2_contingency applies one for 2 x 2 tables by default —
    confirm which behavior callers expect.
    """
    # Local import: lgamma is not among this module's top-of-file math imports.
    from math import exp, log, lgamma

    def regularized_gamma_q(s, x) :
        # Regularized upper incomplete gamma function Q(s, x) for s > 0,
        # x >= 0: power series for P(s, x) when x < s + 1, modified Lentz
        # continued fraction for Q otherwise (Numerical Recipes, sec. 6.2).
        if x <= 0.0 :
            return 1.0
        logPrefix = -x + s * log(x) - lgamma(s)
        if x < s + 1.0 :
            term = 1.0 / s
            total = term
            denom = s
            while abs(term) > abs(total) * 1e-16 :
                denom += 1.0
                term *= x / denom
                total += term
            return 1.0 - total * exp(logPrefix)
        tiny = 1e-300
        b = x + 1.0 - s
        c = 1.0 / tiny
        d = 1.0 / b
        h = d
        for i in range(1, 1000) :
            an = -i * (i - s)
            b += 2.0
            d = an * d + b
            if abs(d) < tiny :
                d = tiny
            c = b + an / c
            if abs(c) < tiny :
                c = tiny
            d = 1.0 / d
            delta = d * c
            h *= delta
            if abs(delta - 1.0) < 1e-16 :
                break
        return exp(logPrefix) * h

    rowSums = [sum(row) for row in contingencyTable]
    colSums = [sum(col) for col in zip(*contingencyTable)]
    total = sum(rowSums)
    dof = (len(rowSums) - 1) * (len(colSums) - 1)
    if total == 0 or dof < 1 :
        # Degenerate table: no evidence of association.
        return 1.0
    chi2 = 0.0
    for rowSum, row in zip(rowSums, contingencyTable) :
        for colSum, observed in zip(colSums, row) :
            expected = rowSum * colSum / float(total)
            if expected > 0 :
                chi2 += (observed - expected) ** 2 / expected
    # p-value is the survival function of the chi-square distribution
    # with dof degrees of freedom: Q(dof/2, chi2/2).
    return regularized_gamma_q(dof / 2.0, chi2 / 2.0)

def log_stirling(n) :
    """Stirling's approximation to log(n!), with correction terms through
    the n^7 term.  Rounds to the exact value of n! for 16! and below and is
    correct to about 10 significant digits for 5! and above."""
    n_sq = n * n
    n_cu = n * n_sq
    n_5th = n_cu * n_sq
    n_7th = n_5th * n_sq
    # Leading Stirling terms: n log n - n + (1/2) log(2 pi n).
    leading = n * log(n) - n + 0.5 * log(2 * pi * n)
    # Asymptotic correction series in odd reciprocal powers of n.
    correction = (1 / 12.0 / n - 1 / 360.0 / n_cu
                  + 1 / 1260.0 / n_5th - 1 / 1680.0 / n_7th)
    return leading + correction

def fisher_exact(contingencyTable2x2) :
    """ Return the two-tailed p-value of Fisher's exact test for a 2 x 2
    contingency table.  Input is a sequence of 2 sequences of size 2.
    """
    # Pure-python implementation, accurate to around 1e-11 and much faster
    # than the scipy equivalent.  Should scipy ever be preferred, the call
    # would be: scipy.stats.fisher_exact(contingencyTable)[1]

    topRow, bottomRow = contingencyTable2x2[0], contingencyTable2x2[1]
    a, b = topRow[0], topRow[1]
    c, d = bottomRow[0], bottomRow[1]

    def logChoose(n, k) :
        # log(n choose k): exact summation for small k, Stirling otherwise.
        k = min(k, n - k)
        if k <= 10 :
            acc = 0.0
            for ii in range(1, k + 1) :
                acc += log(n - ii + 1) - log(ii)
            return acc
        return log_stirling(n) - log_stirling(k) - log_stirling(n - k)

    rowTotal = a + b       # size s of the first subset
    colTotal = a + c       # size t of the second subset
    grandTotal = a + b + c + d
    logChooseNT = logChoose(grandTotal, colTotal)

    def hypergeomProb(overlap) :
        # Probability that the intersection of subsets of sizes rowTotal
        # and colTotal of a grandTotal-element set has size exactly overlap.
        if max(0, rowTotal + colTotal - grandTotal) <= overlap <= min(rowTotal, colTotal) :
            return exp(logChoose(rowTotal, overlap)
                       + logChoose(grandTotal - rowTotal, colTotal - overlap)
                       - logChooseNT)
        return 0

    p0 = hypergeomProb(a)
    result = p0

    # Two-tailed p: add the probability of every table at least as extreme,
    # scanning up from 0 and down from min(rowTotal, colTotal); each scan
    # stops as soon as a table more probable than the observed one appears.
    for smaller in range(a) :
        prob = hypergeomProb(smaller)
        if prob > p0 :
            break
        result += prob
    for larger in range(min(rowTotal, colTotal), a, -1) :
        prob = hypergeomProb(larger)
        if prob > p0 :
            break
        result += prob

    return result

Loading

0 comments on commit 0f21fae

Please sign in to comment.