-
Notifications
You must be signed in to change notification settings - Fork 2
/
weeder.py
119 lines (109 loc) · 4.97 KB
/
weeder.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
#################################################################
# @Program: weeder.py #
# @Version: 1 #
# @Author: Chris Plaisier #
# @Sponsored by: #
# Nitin Baliga, ISB #
# Institute for Systems Biology #
# 401 Terry Ave North #
# Seattle, Washington 98109-5234 #
# (216) 732-2139 #
# @Also Sponsored by: #
# Luxembourg Systems Biology Grant #
# #
# If this program is used in your analysis please mention who #
# built it. Thanks. :-) #
# #
# Copyrighted by Chris Plaisier 5/17/2012 #
#################################################################
def runWeeder(i):
weeder(seqFile=clusterFileNames[i], bgFile=weederVars['bgFile'], size=weederVars['size'], enriched=weederVars['enriched'], revComp=weederVars['revComp'])
# Run weeder and parse its output
# First weederTFBS -W 6 -e 1, then weederTFBS -W 8 -e 2, and finally adviser
def weeder(seqFile=None, bgFile='HS', size='small', enriched='T50', revComp=False):
print seqFile
if not os.path.exists('tmp/weeder'):
os.makedirs('tmp/weeder')
# First run weederTFBS
weederArgs = str(seqFile)+' '+str(bgFile)+' '+str(size)+' '+str(enriched)
if revComp==True:
weederArgs += ' S'
#errOut = open('tmp/weeder/stderr.out','w')
#weederProc = Popen("weederlauncher " + weederArgs, shell=True,stdout=PIPE,stderr=errOut)
#output = weederProc.communicate()
# Now parse output from weeder
PSSMs = []
output = open(str(seqFile)+'.wee','r')
outLines = [line for line in output.readlines() if line.strip()]
hitBp = {}
# Get top hit of 6bp look for "1)"
while 1:
outLine = outLines.pop(0)
if not outLine.find('1) ') == -1:
break
hitBp[6] = outLine.strip().split(' ')[1:]
# Scroll to where the 8bp reads wll be
while 1:
outLine = outLines.pop(0)
if not outLine.find('Searching for motifs of length 8') == -1:
break
# Get top hit of 8bp look for "1)"
while 1:
outLine = outLines.pop(0)
if not outLine.find('1) ') == -1:
break
hitBp[8] = outLine.strip().split(' ')[1:]
# Scroll to where the 8bp reads wll be
while 1:
outLine = outLines.pop(0)
if not outLine.find('Your sequences:') == -1:
break
# Get into the highest ranking motifs
seqDict = {}
while 1:
outLine = outLines.pop(0)
if not outLine.find('**** MY ADVICE ****') == -1:
break
splitUp = outLine.strip().split(' ')
seqDict[splitUp[1]] = splitUp[3].lstrip('>')
# Get into the highest ranking motifs
while 1:
outLine = outLines.pop(0)
if not outLine.find('Interesting motifs (highest-ranking)') == -1:
break
motifId = 1
biclusterId = str((seqFile.split('_')[1]).split('.')[0])
while 1:
if revComp==True:
name = outLines.pop(0).strip() # Get match
name += '_'+outLines.pop(0).strip()
if revComp==False:
name = outLines.pop(0).strip() # Get match
if not name.find('(not highest-ranking)') == -1:
break
# Get redundant motifs
outLines.pop(0)
redMotifs = [i for i in outLines.pop(0).strip().split(' ') if not i=='-']
outLines.pop(0)
outLines.pop(0)
line = outLines.pop(0)
instances = []
while line.find('Frequency Matrix') == -1:
splitUp = [i for i in line.strip().split(' ') if i]
instances.append({'gene':seqDict[splitUp[0]], 'strand':splitUp[1], 'site':splitUp[2], 'start':splitUp[3], 'match':splitUp[4].lstrip('(').rstrip(')') })
line = outLines.pop(0)
# Read in Frequency Matrix
outLines.pop(0)
outLines.pop(0)
matrix = []
col = outLines.pop(0)
while col.find('======') == -1:
nums = [i for i in col.strip().split('\t')[1].split(' ') if i]
colSum = 0
for i in nums:
colSum += int(i.strip())
matrix += [[ float(nums[0])/float(colSum), float(nums[1])/float(colSum), float(nums[2])/float(colSum), float(nums[3])/float(colSum)]]
col = outLines.pop(0)
PSSMs += [pssm(biclusterName=str(biclusterId)+'_motif'+str(motifId)+'_weeder',nsites=instances,eValue=hitBp[len(matrix)][1],pssm=matrix,genes=redMotifs,de_novo_method='weeder')]
motifId += 1
weederResults[int((seqFile.split('_')[1]).split('.')[0])] = PSSMs