Merge pull request #635 from broadinstitute/is-new-add-utils

added various file-handling utils, and tests for them
broadinstitute · Jun 21, 2017 · 123401f · 123401f
2 parents 2f9f312 + f04aa1c
commit 123401f
Show file tree

Hide file tree

Showing 5 changed files with 167 additions and 8 deletions.
diff --git a/test/unit/test_tools_samtools.py b/test/unit/test_tools_samtools.py
@@ -3,9 +3,10 @@
 __author__ = "[email protected]"
 
 import unittest
-import os
+import os, os.path
 import tempfile
 import shutil
+import Bio.SeqIO, Bio.SeqRecord, Bio.Seq
 import util
 import util.file
 import tools
@@ -67,4 +68,16 @@ def test_filterByCigarString(self):
         # '^((?:[0-9]+[ID]){1}(?:[0-9]+[MNIDSHPX=])+)|((?:[0-9]+[MNIDSHPX=])+(?:[0-9]+[ID]){1})$'
         samtools.filterByCigarString(in_sam, out_bam)
 
-        assert samtools.count(out_bam)==39, "Output read count does not match the expected count."
+        assert samtools.count(out_bam)==39, "Output read count does not match the expected count."
+
+    def test_bam2fa(self):
+        samtools = tools.samtools.SamtoolsTool()
+        sam = os.path.join(util.file.get_test_input_path(self), 'simple.sam')
+
+        with samtools.bam2fa_tmp(sam) as (fa1, fa2):
+            for fa in (fa1, fa2):
+                assert len(list(Bio.SeqIO.parse(fa, 'fasta')))==1
+
+        assert not os.path.isfile(fa1) and not os.path.isfile(fa2)
+
+
diff --git a/test/unit/test_util_file.py b/test/unit/test_util_file.py
@@ -0,0 +1,50 @@
+# Unit tests for util.file.py
+
+__author__ = "[email protected]"
+
+import os, os.path
+import pytest
+import util.file
+
+def testTempFiles():
+    '''Test creation of tempfiles using context managers, as well as dump_file/slurp_file routines'''
+    tmp_fns = []
+    sfx='tmp-file-test'
+    with util.file.tempfname(sfx) as my_tmp_fn:
+        tmp_dir_listing = os.listdir( os.path.dirname( my_tmp_fn ) )
+        assert os.path.basename( my_tmp_fn ) in tmp_dir_listing
+
+        with util.file.tempfnames((sfx+'-A',sfx+'-B')) as my_tmp_fns:
+
+            assert len(my_tmp_fns)==2
+            for fn in my_tmp_fns:
+                assert os.path.dirname(fn) == os.path.dirname(my_tmp_fn)
+                assert os.path.basename(fn) not in tmp_dir_listing
+
+            for fn in my_tmp_fns:
+
+                assert sfx in fn
+                assert os.path.isfile(fn)
+                assert os.path.getsize(fn)==0
+                assert os.access(fn, os.R_OK | os.W_OK)
+
+                fileValue='my\ntest\ndata\n' + fn + '\n'
+                util.file.dump_file( fname=fn, value=fileValue )
+                assert os.path.isfile(fn)
+                assert os.path.getsize(fn) == len(fileValue)
+                assert util.file.slurp_file(fn) == fileValue
+
+                util.file.make_empty(fn)
+                assert os.path.getsize(fn)==0
+
+                tmp_fns.append(fn)
+
+        assert os.path.isfile(my_tmp_fn) and not os.path.isfile(my_tmp_fns[0]) and not os.path.isfile(my_tmp_fns[1])
+
+        largeString = 'A' * (2*1024*1024)
+        util.file.dump_file(fname=my_tmp_fn, value=largeString)
+        with pytest.raises(RuntimeError):
+            util.file.slurp_file(my_tmp_fn, maxSizeMb=1)
+
+    assert not os.path.isfile(my_tmp_fn)
+
diff --git a/tools/picard.py b/tools/picard.py
@@ -8,6 +8,7 @@
 import tempfile
 import shutil
 import subprocess
+import contextlib
 from decimal import *
 
 import pysam
@@ -101,19 +102,50 @@ class SamToFastqTool(PicardTools):
     subtoolName = 'SamToFastq'
     illumina_clipping_attribute = 'XT'
 
-    def execute(self, inBam, outFastq1, outFastq2, picardOptions=None, JVMmemory=None):    # pylint: disable=W0221
+    def execute(self, inBam, outFastq1, outFastq2, outFastq0=None,
+                illuminaClipping=False,
+                picardOptions=None, JVMmemory=None):    # pylint: disable=W0221
+        '''Write paired reads from `inBam` to `outFastq1` and `outFastq1`.  If `outFastq0` is given, write
+        any unpaired reads from `inBam` there, else ignore them.  If `illuminaClipping` is True, 
+        trim reads at the clipping position specified by the Illumina clipping attribute
+        (which is defined by the class variable SamToFastqTool.illumina_clipping_attribute).'''
+
         if tools.samtools.SamtoolsTool().isEmpty(inBam):
             # Picard SamToFastq cannot deal with an empty input BAM file
-            with open(outFastq1, 'wt') as outf:
-                pass
-            with open(outFastq2, 'wt') as outf:
-                pass
+            for f in (outFastq1, outFastq2, outFastq0):
+                if f: util.file.make_empty(f)
         else:
             picardOptions = picardOptions or []
+
             opts = [
                 'FASTQ=' + outFastq1, 'SECOND_END_FASTQ=' + outFastq2, 'INPUT=' + inBam, 'VALIDATION_STRINGENCY=SILENT'
             ]
+            if outFastq0: 
+                assert outFastq2, "outFastq0 option only applies in paired-end output mode"
+                opts += [ 'UNPAIRED_FASTQ=' + outFastq0 ]
+
+            if illuminaClipping: 
+                opts += PicardTools.dict_to_picard_opts({
+                    'CLIPPING_ATTRIBUTE': tools.picard.SamToFastqTool.illumina_clipping_attribute,
+                    'CLIPPING_ACTION': 'X'
+                })
+
             PicardTools.execute(self, self.subtoolName, opts + picardOptions, JVMmemory)
+
+    @contextlib.contextmanager
+    def execute_tmp(self, inBam, sfx='', includeUnpaired=False, **kwargs):
+        '''Output reads from `inBam` to temp fastq files, and yield their filenames as a tuple.
+        If `includeUnpaired` is True, output unpaired reads to a third temp fastq file and yield it as a third
+        element of the tuple.
+        '''
+        if not includeUnpaired:
+            with util.file.tempfnames((sfx+'.1.fq', sfx+'.2.fq')) as (outFastq1, outFastq2):
+                self.execute(inBam, outFastq1, outFastq2, **kwargs)
+                yield outFastq1, outFastq2
+        else:
+            with util.file.tempfnames((sfx+'.1.fq', sfx+'.2.fq', sfx+'.0.fq')) as (outFastq1, outFastq2, outFastq0):
+                self.execute(inBam, outFastq1, outFastq2, outFastq0=outFastq0, **kwargs)
+                yield outFastq1, outFastq2, outFastq0
 
     def per_read_group(self, inBam, outDir, picardOptions=None, JVMmemory=None):
         if tools.samtools.SamtoolsTool().isEmpty(inBam):

diff --git a/tools/samtools.py b/tools/samtools.py
@@ -22,6 +22,7 @@
 import os.path
 import subprocess
 import tempfile
+import contextlib
 from collections import OrderedDict
 from decimal import *
 
@@ -73,6 +74,26 @@ def bam2fq(self, inBam, outFq1, outFq2=None):
         else:
             self.execute('bam2fq', ['-1', outFq1, '-2', outFq2, inBam])
 
+    def bam2fa(self, inBam, outFa1, outFa2=None, outFa0=None):
+        args=['-1', outFa1]
+        if outFa2: args += ['-2', outFa2]
+        if outFa0: args += ['-0', outFa0]
+        args += [inBam]
+
+        self.execute('fasta', args)
+
+    @contextlib.contextmanager
+    def bam2fq_tmp(self, inBam):
+        with util.file.tempfnames(('.1.fq', '.2.fq')) as (reads1, reads2):
+            self.bam2fq(inBam, reads1, reads2)
+            yield reads1, reads2
+
+    @contextlib.contextmanager
+    def bam2fa_tmp(self, inBam):
+        with util.file.tempfnames(('.1.fa', '.2.fa')) as (reads1, reads2):
+            self.bam2fa(inBam, reads1, reads2)
+            yield reads1, reads2
+
     def sort(self, inFile, outFile, args=None, threads=None):
         # inFile can be .sam, .bam, .cram
         # outFile can be .sam, .bam, .cram

diff --git a/util/file.py b/util/file.py
@@ -7,7 +7,7 @@
 __date__ = "PLACEHOLDER"
 
 import contextlib
-import os
+import os, os.path
 import gzip
 import tempfile
 import shutil
@@ -94,6 +94,27 @@ def mkstempfname(suffix='', prefix='tmp', directory=None, text=False):
     return fn
 
 
+@contextlib.contextmanager
+def tempfname(*args, **kwargs):
+    '''Create a tempfile name on context entry, delete the file (if it exists) on context exit.'''
+    fn = mkstempfname(*args, **kwargs)
+    try:
+        yield fn
+    finally:
+        if os.path.isfile(fn): os.unlink(fn)
+
+
+@contextlib.contextmanager
+def tempfnames(suffixes, *args, **kwargs):
+    '''Create a set of tempfile names on context entry, delete the files (if they exist) on context exit.'''
+    fns = [mkstempfname(sfx, *args, **kwargs) for sfx in suffixes]
+    try:
+        yield fns
+    finally:
+        for fn in fns: 
+            if os.path.isfile(fn): 
+                os.unlink(fn)
+
 def set_tmp_dir(name):
     proposed_prefix = ['tmp']
     if name:
@@ -473,3 +494,25 @@ def line_count(infname):
 def touch(fname, times=None):
     with open(fname, 'a'):
         os.utime(fname, times)
+
+def make_empty(fname):
+    '''Make `fname` a zero-length file with the current timestamp.'''
+    with open(fname, 'w'):
+        pass
+
+def dump_file(fname, value):
+    """store string in file"""
+    with open(fname, 'w')  as out: 
+        out.write(str(value))
+
+def slurp_file(fname, maxSizeMb=50):
+    """Read entire file into one string.  If file is gzipped, uncompress it on-the-fly.  If file is larger
+    than `maxSizeMb` megabytes, throw an error; this is to encourage proper use of iterators for reading
+    large files.  If `maxSizeMb` is None or 0, file size is unlimited."""
+    fileSize = os.path.getsize(fname)
+    if maxSizeMb  and  fileSize > maxSizeMb*1024*1024: 
+        raise RuntimeError('Tried to slurp large file {} (size={}); are you sure?  Increase `maxSizeMb` param if yes'.
+                           format(fname, fileSize))
+    with open_or_gzopen(fname) as f:
+        return f.read()
+