Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Samtools #101

Open
wants to merge 8 commits into
base: master
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
152 changes: 152 additions & 0 deletions bio_bits/cigar.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,152 @@
from functools import partial
from itertools import dropwhile, takewhile, starmap, combinations, imap, ifilter
import itertools
from toolz.itertoolz import accumulate, first, last, drop, mapcat, take
from toolz.functoolz import complement, compose
from operator import itemgetter, add
import fn
import sh
import re
import multiprocessing as mp
import sys

#TODO: in order to *totally* recreate the reads, you need to account for the MD (mismatching positions) tag.
#This requires running bwa with different options. http://bio-bwa.sourceforge.net/bwa.shtml

#def ilen(seq): sum(1 for _ in seq)
#indel_pos = compose(ilen, partial(takewhile, '-'.__ne__))

# adding gaps is dependent on there being a difference between ref and pos.

def transform(seqs, positions):
''' given ref & query and positions and offsets, builds a sort of alignment by filling in
gaps appropriately with '-'.
:param tuple seqs reference, query strings
:param tuple positions
:return tuple of reference, query strings '''
(ref, query), ((rpos, qpos), (a, b)) = seqs, positions
def fill(s, pos): return s[:pos] + '-'*abs(a-b) + s[pos:]
if a < b: ref = fill(ref, rpos)
elif a > b: query = fill(query, qpos)
return ref, query

no_add = lambda x, y : x
dx_dy={
'M' : (add, add), 'X' : (add, add), '=' : (add, add),
'D' : (add, no_add), 'add' : (add, no_add),
'I' : (no_add, add), 'S' : (no_add, add),
'H' : (no_add, no_add)
}
def next_cigar_position(positions, cigarval, withtype=False):
'''compute the next position for the reference and position by "applying"
the position change that letter indicates, using `dx_dy` defined above.
if withtype is True, also returns the letter itself, and expects the last count
and last letter..'''
if withtype: (count, sym), (refpos, qpos, oldcount, oldsym) = cigarval, positions
else: (count, sym), (refpos, qpos) = cigarval, positions
f = dx_dy[sym]
if withtype: return f[0](refpos, count), f[1](qpos, count), count, sym
return f[0](refpos, count), f[1](qpos, count)

split_cig = re.compile(r'(?:([0-9]+)([A-Z]))').findall
cig_diffs = partial(map, partial(next_cigar_position, (0, 0)))
get_cig = lambda C: map(lambda x: (int(x[0]), x[1]), split_cig(C))
_gen_cigar = lambda c: accumulate(next_cigar_position, c, (0, 0))
gen_cigar = compose(_gen_cigar, get_cig)

def undo_cigar(x, y, z):
cig = get_cig(z)
return reduce(transform, zip(_gen_cigar(cig), cig_diffs(cig)), (x, y))

def get_insert_quals(ref, pos, bases):

'''separate i/o and destructuring from logic.
make generic for indel/snp.'''
view = sh.samtools('view', "{ref}:{pos}-{pos}")
#split tabs, etc.
has_insert = lambda l: 'I' in l[5]
# etc.
#samtools calcmd?

assert undo_cigar('AAGC', 'AGTT', '1M1D1M1X1I') == ('AAGC-', 'A-GTT')
assert list(starmap(undo_cigar, [
["AGG", "AG" ,"1M1D1M"],
["AGG", "AG" ,"1M1D1M"],
["TT" , "TAT" , "1M1I1M"],
["TA" , "TATAA", "1M3I1M"],
['AAGC', 'AGTT', '1M1D1M1X1I']])) == \
[('AGG', 'A-G'), ('AGG', 'A-G'), ('T-T', 'TAT'), ('T---A', 'TATAA'), ('AAGC-', 'A-GTT')]
assert list(gen_cigar("1M2D1M")) == [(0, 0), (1, 1), (3, 1), (4, 2)]

def intersection(pred, seq): return last(takewhile(complement(pred), seq))
def firstwhere(pred, seq): return first(dropwhile(complement(pred), seq))

assert intersection(lambda x: x[0] > 2, gen_cigar("1M2D1M")) == (1, 1)
assert firstwhere(lambda x: x[0] >= 2, gen_cigar("1M1D1M")) == (2, 1)


assert reduce(next_cigar_position, \ #corresponds to ('AAGC-', 'A-GTT')
takewhile(lambda x: x[1] != 'I', get_cig('1M1D1M1X1I')), (0, 0)) == (4, 3)

def mutpos(cig_str, types):
''':return (refpos, mutpos'''
assert any(map(types.__contains__, cig_str)), "Cigar string %s does not have mutation of type %s" % (cig_str, types)
return reduce(next_cigar_position, takewhile(lambda x: x[1] not in types, get_cig(cig_str)), (0, 0))

indelpos = partial(mutpos, types='DI')
query_indel_pos = compose(last, indelpos)
ref_indel_pos = compose(first, indelpos)

ref_del_pos = compose(first, partial(mutpos, types='D'))
ref_ins_pos = compose(first, partial(mutpos, types='I'))
ref_snp_pos = compose(first, partial(mutpos, types='X'))

assert ref_ins_pos('1M1D1M1X1I') == 4

def mutations(cig_str):
return drop(1, accumulate(partial(next_cigar_position, withtype=True), get_cig(cig_str), (0, 0, 0, 0)))

#TODO: Unfortnuately, need to use the MD tag, because bwa does not provide
# the 'X' tag for mismatches.
snps = compose(partial(filter, lambda x: x[-1] == 'X'), mutations)

def snp_rpos_base(qstring, cig_str):
return compose(partial(mapcat, partial(rpos_base, qstring)), snps)(cig_str)

def rpos_base(qstring, (rpos, qpos, count, type)):
''':return refpos, snp'''
return ((qstring[(qpos-count)+i], rpos +i) for i in xrange(0, count))

try:
mutpos('1M1D1M', 'X')
except AssertionError:
pass

def get_info(samrow):
#TODO: extract query_string and cigar_string from file
qname, mpos, cigar, qstring = itemgetter(0, 3, 5, 9)(samrow.split('\t'))
mpos = int(mpos)
_snps = snp_rpos_base(qstring, cigar)
#have to add map positions to computed rpos's.
return map(lambda (b,i):(b, i+mpos), _snps)

''' :lhn int minimum shared mutations in order to mark a group of reads as a local haplotype cluster.
:ghn int minimum shared mutations in order to mark a group of reads as a GLOBAL haplotype cluster. '''
if __name__ == '__main__':
'''split by RG (read group), then get_info, want to keep track of qname to prob.
Find all local and global haplotype clusters.'''
lhn = ghn = 3
shareCount = compose(len, lambda (x,y): x & y)
is_hap = lambda n: lambda x: shareCount(x) > n
_fn = sys.argv[1]
p = mp.Pool()
shortReads = imap(get_info, take(3, sh.samtools.view(_fn, _iter=True)))
shortReads = imap(set, shortReads)
cartProduct = itertools.product(*list(itertools.tee(shortReads)))
shortReads = ifilter(is_hap(lhn), cartProduct)
# shortReads should become the cluster of local haplotypes.
for r in shortReads:
if r: print r

#ghs = filter(is_hap(ghn), combinations(longReads, longReads))
#lhs = filter(is_hap(lhn), combinations(shortReads, shortReads))
6 changes: 3 additions & 3 deletions bio_bits/vcfcat_main.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,7 @@
Command-line utility for querying VCF files. By default, outputs a full vcf file matching the query.

Usage:
vcfcat filter <FILE1> ( --tag=<TAG> (--ge | --le | --gt | --lt | --eq | --neq) <VALUE> ) [-c]
vcfcat filter <FILE1> ( --tag=<TAG> (--ge | --le | --gt | --lt | --eq | --ne) <VALUE> ) [-c]
vcfcat exists <FILE1> (--tag=<TAG> ) [-c]
vcfcat ambiguous <FILE1> [-c]
vcfcat vcall <FILE1> [--csv | -c ]
Expand All @@ -19,7 +19,7 @@
--lt Get records Less Thaan <VALUE>
--leq Get records Less than or equal to <VALUE>
--eq Get records Exactly Equal to <VALUE>
--neq Get records Not Equal to <VALUE>
--ne Get records Not Equal to <VALUE>

Arguments:
filter: print only vcf records matching the filter as a new vcf file.
Expand All @@ -42,7 +42,7 @@
import sys
import vcf
#TODO: Find a better way to dispatch commandline apps
ops = ['--ge', '--le', '--gt' , '--lt' , '--eq' , '--neq']
ops = ['--ge', '--le', '--gt' , '--lt' , '--eq' , '--ne']
def validate_value(val):
if val is None or not val.isdigit():
return val
Expand Down
6 changes: 0 additions & 6 deletions tests/test_deprecation.py
Original file line number Diff line number Diff line change
Expand Up @@ -14,9 +14,3 @@ def test_can_import_bio_pieces(self, mock_serr):
actual_msg = mock_serr.write.call_args[0][0]
self.assertIn(dep_msg, actual_msg)

def test_can_import_sub_module(self, mock_serr):
from bio_pieces import version
r = version.get_release()
self.assertEqual(1.0, r)
actual_msg = mock_serr.write.call_args[0][0]
self.assertIn(dep_msg, actual_msg)