Skip to content

Commit

Permalink
Merge pull request #635 from broadinstitute/is-new-add-utils
Browse files Browse the repository at this point in the history
added various file-handling utils, and tests for them
  • Loading branch information
notestaff authored Jun 21, 2017
2 parents 2f9f312 + f04aa1c commit 123401f
Show file tree
Hide file tree
Showing 5 changed files with 167 additions and 8 deletions.
17 changes: 15 additions & 2 deletions test/unit/test_tools_samtools.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,9 +3,10 @@
__author__ = "[email protected]"

import unittest
import os
import os, os.path
import tempfile
import shutil
import Bio.SeqIO, Bio.SeqRecord, Bio.Seq
import util
import util.file
import tools
Expand Down Expand Up @@ -67,4 +68,16 @@ def test_filterByCigarString(self):
# '^((?:[0-9]+[ID]){1}(?:[0-9]+[MNIDSHPX=])+)|((?:[0-9]+[MNIDSHPX=])+(?:[0-9]+[ID]){1})$'
samtools.filterByCigarString(in_sam, out_bam)

assert samtools.count(out_bam)==39, "Output read count does not match the expected count."
assert samtools.count(out_bam)==39, "Output read count does not match the expected count."

def test_bam2fa(self):
samtools = tools.samtools.SamtoolsTool()
sam = os.path.join(util.file.get_test_input_path(self), 'simple.sam')

with samtools.bam2fa_tmp(sam) as (fa1, fa2):
for fa in (fa1, fa2):
assert len(list(Bio.SeqIO.parse(fa, 'fasta')))==1

assert not os.path.isfile(fa1) and not os.path.isfile(fa2)


50 changes: 50 additions & 0 deletions test/unit/test_util_file.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,50 @@
# Unit tests for util.file.py

__author__ = "[email protected]"

import os, os.path
import pytest
import util.file

def testTempFiles():
'''Test creation of tempfiles using context managers, as well as dump_file/slurp_file routines'''
tmp_fns = []
sfx='tmp-file-test'
with util.file.tempfname(sfx) as my_tmp_fn:
tmp_dir_listing = os.listdir( os.path.dirname( my_tmp_fn ) )
assert os.path.basename( my_tmp_fn ) in tmp_dir_listing

with util.file.tempfnames((sfx+'-A',sfx+'-B')) as my_tmp_fns:

assert len(my_tmp_fns)==2
for fn in my_tmp_fns:
assert os.path.dirname(fn) == os.path.dirname(my_tmp_fn)
assert os.path.basename(fn) not in tmp_dir_listing

for fn in my_tmp_fns:

assert sfx in fn
assert os.path.isfile(fn)
assert os.path.getsize(fn)==0
assert os.access(fn, os.R_OK | os.W_OK)

fileValue='my\ntest\ndata\n' + fn + '\n'
util.file.dump_file( fname=fn, value=fileValue )
assert os.path.isfile(fn)
assert os.path.getsize(fn) == len(fileValue)
assert util.file.slurp_file(fn) == fileValue

util.file.make_empty(fn)
assert os.path.getsize(fn)==0

tmp_fns.append(fn)

assert os.path.isfile(my_tmp_fn) and not os.path.isfile(my_tmp_fns[0]) and not os.path.isfile(my_tmp_fns[1])

largeString = 'A' * (2*1024*1024)
util.file.dump_file(fname=my_tmp_fn, value=largeString)
with pytest.raises(RuntimeError):
util.file.slurp_file(my_tmp_fn, maxSizeMb=1)

assert not os.path.isfile(my_tmp_fn)

42 changes: 37 additions & 5 deletions tools/picard.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,6 +8,7 @@
import tempfile
import shutil
import subprocess
import contextlib
from decimal import *

import pysam
Expand Down Expand Up @@ -101,19 +102,50 @@ class SamToFastqTool(PicardTools):
subtoolName = 'SamToFastq'
illumina_clipping_attribute = 'XT'

def execute(self, inBam, outFastq1, outFastq2, picardOptions=None, JVMmemory=None): # pylint: disable=W0221
def execute(self, inBam, outFastq1, outFastq2, outFastq0=None,
illuminaClipping=False,
picardOptions=None, JVMmemory=None): # pylint: disable=W0221
'''Write paired reads from `inBam` to `outFastq1` and `outFastq1`. If `outFastq0` is given, write
any unpaired reads from `inBam` there, else ignore them. If `illuminaClipping` is True,
trim reads at the clipping position specified by the Illumina clipping attribute
(which is defined by the class variable SamToFastqTool.illumina_clipping_attribute).'''

if tools.samtools.SamtoolsTool().isEmpty(inBam):
# Picard SamToFastq cannot deal with an empty input BAM file
with open(outFastq1, 'wt') as outf:
pass
with open(outFastq2, 'wt') as outf:
pass
for f in (outFastq1, outFastq2, outFastq0):
if f: util.file.make_empty(f)
else:
picardOptions = picardOptions or []

opts = [
'FASTQ=' + outFastq1, 'SECOND_END_FASTQ=' + outFastq2, 'INPUT=' + inBam, 'VALIDATION_STRINGENCY=SILENT'
]
if outFastq0:
assert outFastq2, "outFastq0 option only applies in paired-end output mode"
opts += [ 'UNPAIRED_FASTQ=' + outFastq0 ]

if illuminaClipping:
opts += PicardTools.dict_to_picard_opts({
'CLIPPING_ATTRIBUTE': tools.picard.SamToFastqTool.illumina_clipping_attribute,
'CLIPPING_ACTION': 'X'
})

PicardTools.execute(self, self.subtoolName, opts + picardOptions, JVMmemory)

@contextlib.contextmanager
def execute_tmp(self, inBam, sfx='', includeUnpaired=False, **kwargs):
'''Output reads from `inBam` to temp fastq files, and yield their filenames as a tuple.
If `includeUnpaired` is True, output unpaired reads to a third temp fastq file and yield it as a third
element of the tuple.
'''
if not includeUnpaired:
with util.file.tempfnames((sfx+'.1.fq', sfx+'.2.fq')) as (outFastq1, outFastq2):
self.execute(inBam, outFastq1, outFastq2, **kwargs)
yield outFastq1, outFastq2
else:
with util.file.tempfnames((sfx+'.1.fq', sfx+'.2.fq', sfx+'.0.fq')) as (outFastq1, outFastq2, outFastq0):
self.execute(inBam, outFastq1, outFastq2, outFastq0=outFastq0, **kwargs)
yield outFastq1, outFastq2, outFastq0

def per_read_group(self, inBam, outDir, picardOptions=None, JVMmemory=None):
if tools.samtools.SamtoolsTool().isEmpty(inBam):
Expand Down
21 changes: 21 additions & 0 deletions tools/samtools.py
Original file line number Diff line number Diff line change
Expand Up @@ -22,6 +22,7 @@
import os.path
import subprocess
import tempfile
import contextlib
from collections import OrderedDict
from decimal import *

Expand Down Expand Up @@ -73,6 +74,26 @@ def bam2fq(self, inBam, outFq1, outFq2=None):
else:
self.execute('bam2fq', ['-1', outFq1, '-2', outFq2, inBam])

def bam2fa(self, inBam, outFa1, outFa2=None, outFa0=None):
args=['-1', outFa1]
if outFa2: args += ['-2', outFa2]
if outFa0: args += ['-0', outFa0]
args += [inBam]

self.execute('fasta', args)

@contextlib.contextmanager
def bam2fq_tmp(self, inBam):
with util.file.tempfnames(('.1.fq', '.2.fq')) as (reads1, reads2):
self.bam2fq(inBam, reads1, reads2)
yield reads1, reads2

@contextlib.contextmanager
def bam2fa_tmp(self, inBam):
with util.file.tempfnames(('.1.fa', '.2.fa')) as (reads1, reads2):
self.bam2fa(inBam, reads1, reads2)
yield reads1, reads2

def sort(self, inFile, outFile, args=None, threads=None):
# inFile can be .sam, .bam, .cram
# outFile can be .sam, .bam, .cram
Expand Down
45 changes: 44 additions & 1 deletion util/file.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,7 +7,7 @@
__date__ = "PLACEHOLDER"

import contextlib
import os
import os, os.path
import gzip
import tempfile
import shutil
Expand Down Expand Up @@ -94,6 +94,27 @@ def mkstempfname(suffix='', prefix='tmp', directory=None, text=False):
return fn


@contextlib.contextmanager
def tempfname(*args, **kwargs):
'''Create a tempfile name on context entry, delete the file (if it exists) on context exit.'''
fn = mkstempfname(*args, **kwargs)
try:
yield fn
finally:
if os.path.isfile(fn): os.unlink(fn)


@contextlib.contextmanager
def tempfnames(suffixes, *args, **kwargs):
'''Create a set of tempfile names on context entry, delete the files (if they exist) on context exit.'''
fns = [mkstempfname(sfx, *args, **kwargs) for sfx in suffixes]
try:
yield fns
finally:
for fn in fns:
if os.path.isfile(fn):
os.unlink(fn)

def set_tmp_dir(name):
proposed_prefix = ['tmp']
if name:
Expand Down Expand Up @@ -473,3 +494,25 @@ def line_count(infname):
def touch(fname, times=None):
with open(fname, 'a'):
os.utime(fname, times)

def make_empty(fname):
'''Make `fname` a zero-length file with the current timestamp.'''
with open(fname, 'w'):
pass

def dump_file(fname, value):
"""store string in file"""
with open(fname, 'w') as out:
out.write(str(value))

def slurp_file(fname, maxSizeMb=50):
"""Read entire file into one string. If file is gzipped, uncompress it on-the-fly. If file is larger
than `maxSizeMb` megabytes, throw an error; this is to encourage proper use of iterators for reading
large files. If `maxSizeMb` is None or 0, file size is unlimited."""
fileSize = os.path.getsize(fname)
if maxSizeMb and fileSize > maxSizeMb*1024*1024:
raise RuntimeError('Tried to slurp large file {} (size={}); are you sure? Increase `maxSizeMb` param if yes'.
format(fname, fileSize))
with open_or_gzopen(fname) as f:
return f.read()

0 comments on commit 123401f

Please sign in to comment.