paring.py

import math
#Word length; should probably stay the same as in alignment
wordLength = 5
#Lower bound for a word to count as high scoring; lower than in alignment to allow for variance
highScoreThreshold = 2.0
#Maximum distance (in words) between two high-scoring words for them to be grouped into one segment
segPairWordMaxDist = 10
#How many columns around a high-scoring segment should be included as a buffer
paringBufferLength = 2
#Uniform probability column used to pad means at their edges
blankProb = [0.25,0.25,0.25,0.25]
#How large a standard deviation in mean alignment length should trigger paring
paringDevAllowance = 1.0
#Alignments within this distance of the mode length are removed along with it
paringLengthSimilarity = 1
#Takes a sequence (or mean) and returns the list of words of length wordLength
#Used lots, copied here for access to wordLength
def wordify (seq):
seqWords = []
for i in range((len(seq)-wordLength+1)):
seqWords += [seq[i:(i+wordLength)]]
return seqWords
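
#A minimal sketch of wordify on hypothetical toy data: a mean is a list of
#probability columns, so wordify slides a wordLength-wide window across it.
# >>> toyMean = [[0.7,0.1,0.1,0.1]] * 6
# >>> len(wordify(toyMean))   #6 - wordLength + 1 = 2 windows
# 2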
def scoreMeanWord(meanWord):
score = 0
for probArray in meanWord:
score += max(probArray)
return score
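
#Toy example (hypothetical numbers): a word whose columns each peak at 0.7
#scores about 5 * 0.7 = 3.5, comfortably above highScoreThreshold = 2.0.
# >>> scoreMeanWord([[0.7,0.1,0.1,0.1]] * wordLength)   #~3.5, up to float rounding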
#Perhaps account for small gaps in a single motif here?
#Returns means from the paring data, modified for targetLength addition
def processSegments(segments, mean):
    print segments
    newMeans = []
    for (score,i,j) in segments:
        #the buffered window is [i - paringBufferLength, j + paringBufferLength);
        #clamp it to the mean and pad any overflow with blank columns
        overflowL = paringBufferLength - i
        overflowR = j + paringBufferLength - len(mean)
        left = [blankProb] * max(0, overflowL)
        right = [blankProb] * max(0, overflowR)
        newMean = left + mean[max(0, i - paringBufferLength):min(len(mean), j + paringBufferLength)] + right
        newMeans += [(newMean,-1,-1)]
    # print str(newMeans) + '\n'
    return newMeans
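
#Sketch of the padding behaviour on a hypothetical toy segment: with
#paringBufferLength = 2, a segment starting at column 0 has no room for a
#real left buffer, so two blankProb columns are prepended instead.
# >>> toy = [[1.0,0.0,0.0,0.0]] * 8
# >>> len(processSegments([(0,0,4)], toy)[0][0])   #2 blanks + columns 0..5
# 8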
#Somewhat similar to alignment, generates high scoring fragments of centroids and branches them off as means
#Modified for use with (sequence, score) means - unboxing and return type
def originalPare((mean,targetLength,targetIndex)):
meanWords = wordify(mean)
numMeanWords = len(meanWords)
scores = []
for m in range(numMeanWords):
score = scoreMeanWord(meanWords[m])
scores += [score]
segments = []
i = 0
while i < numMeanWords:
currSegScore = scores[i]
if currSegScore > highScoreThreshold:
            j = 1 # want to start at the next word
            #FIX THIS - what if there is a non-high score word in the middle of a motif? (actually, that's pretty unlikely)
            while j < segPairWordMaxDist and i+j < numMeanWords and scores[i+j] > highScoreThreshold:
                #word i+j joins the run, so add only its newly covered column
                currSegScore += max(mean[i+j+wordLength-1])
                j += 1
            #Stores the segment as (score, first column, last column exclusive);
            #the run covers words i..i+j-1, i.e. columns i..i+j+wordLength-2
            #Score currently isn't used, but may be
            segments += [(currSegScore,i,i+j+wordLength-1)]
            i += j
i += 1
#Segments need to turn into means that account for buffer distances and the length of meanWords
return processSegments(segments, mean)
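
#Usage sketch (someMean is a hypothetical list of probability columns):
#originalPare takes one boxed mean and returns zero or more pared means,
#each re-boxed as (newMean, -1, -1) for the next clustering round.
# >>> for (newMean,_,_) in originalPare((someMean,-1,-1)): print len(newMean)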
#Returns a mean from a selected alignment datum
def processAlignment((i,m,score,length), mean):
print (i,m,score,length)
i1 = max(0, m - paringBufferLength)
i2 = min(len(mean),m+length+paringBufferLength)
return mean[i1:i2]
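
#Toy example of the clamping: an alignment at m = 1 with length 4 in a
#10-column mean, with buffer 2, is clipped to columns 0..6 on the left.
# >>> len(processAlignment((0,1,0.0,4), [[0.25]*4]*10))
# 7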
#Used iteratively in the paring function to guess new means,
#Essentially just a variance function
def lengthVar(lengths):
    numLengths = float(len(lengths))
    avgLength = sum(lengths)/numLengths
    sumVar = 0
    for length in lengths:
        sumVar += (avgLength - float(length))**2
    lengthVariance = sumVar / numLengths
    return lengthVariance
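
#Sanity check with a classic example: [2,4,4,4,5,5,7,9] has mean 5 and
#population variance 4, so pare() would see a standard deviation of 2.
# >>> lengthVar([2,4,4,4,5,5,7,9])
# 4.0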
#BELOW: SERIOUS BUGCHECK MAY BE NECESSARY, as with originalPare()
#Somewhat similar to alignment, generates high scoring fragments of centroids and branches them off as means
#Now uses mean alignment length to guess length and position of new means
#For alignments that combine length and conservation information
def pare((mean, targetLength, targetIndex), lengthAlignments):
#Odd artifact here, the targetLength is from alignments on the previous clustering,
# so they don't match the new score. Take out the extra alignmentMatrix generation?
#FIXED with summing new memoized alignments, but may want to check this
numPeaks = len(lengthAlignments)
if numPeaks <= 0:
return []
lengths = []
for (i,m,score,length) in lengthAlignments:
lengths += [length]
stdDev = math.sqrt(lengthVar(lengths))
newMeans = []
#Currently this DOESN'T check or utilize TARGETLENGTH
    if stdDev <= paringLengthSimilarity:
        modeLength = max(set(lengths), key=lengths.count)
        #take the first alignment with the mode length and stop there
        for l in range(numPeaks):
            (i,m,score,length) = lengthAlignments[l]
            if length == modeLength:
                newMeans += [(processAlignment(lengthAlignments[l],mean),-1)]
                break
        return newMeans
while stdDev >= paringDevAllowance:
#finds the most common length, the mode
modeLength = max(set(lengths), key=lengths.count)
#Temp value, selectedAlignment should always be changed, as something has the mode length
selectedAlignment = (0,0,0,0)
k = 0
while k < len(lengthAlignments):
#length = lengths[i]
(i,m,score,length) = lengthAlignments[k]
if length == modeLength:
#removes the values for a second iteration
selectedAlignment = lengthAlignments.pop(k)
lengths.pop(k)
            elif modeLength - paringLengthSimilarity <= length <= modeLength + paringLengthSimilarity:
#removes close lengths too
lengthAlignments.pop(k)
lengths.pop(k)
else:
k += 1
newMeans += [(processAlignment(selectedAlignment,mean),-1)]
if len(lengthAlignments) <= 0:
return newMeans
stdDev = math.sqrt(lengthVar(lengths))
return newMeans
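
#The mode is found with the max-over-set idiom used above (toy list):
# >>> lengths = [5,5,9,9,9,12]
# >>> max(set(lengths), key=lengths.count)
# 9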
#Takes a sequence (or mean) and a length parameter
# and returns the list of words of that length
#Used lots; copied here so paring can choose its own word length
def parWordify (seq,length):
    seqWords = []
    #the len(seq)-length+1 is necessary to count every word:
    # for len(seq) = 4 and length = 3, len(seq)-length would give only i=0,
    # i.e. one word when two are needed
    for i in range(len(seq)-length+1):
        seqWords += [seq[i:(i+length)]]
    return seqWords
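
#The comment's own example; strings slice the same way as column lists:
# >>> parWordify('ACGT', 3)
# ['ACG', 'CGT']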
#Somewhat similar to alignment, finds the highest scoring fragment of length targetLength + buffer
# and generates a new mean from that segment
#IMPORTANT: assumes that the lengthAlignments' lengths and targetLength are <= len(mean)
def segPare((mean, targetLength, targetIndex), lengthAlignments):
#Odd artifact here, the targetLength is from alignments on the previous clustering,
# so they don't match the new score. Use lengthAlignments?
numPeaks = len(lengthAlignments)
if numPeaks <= 0:
print []
return []
lengths = []
for (i,m,score,length) in lengthAlignments:
lengths += [length]
newMeans = []
targetLength = sum(lengths)/float(numPeaks)
newLength = int(targetLength + 2*paringBufferLength)
#now deal with newLength > len(mean),
#assumes that it will never be > len(mean) + 2*paringBufferLength
if newLength > len(mean):
mean = ([blankProb] * paringBufferLength) + mean + ([blankProb] * paringBufferLength)
# print mean
meanWords = parWordify(mean,newLength)
# print newLength
# print meanWords
segments = []
for word in meanWords:
segments += [(scoreMeanWord(word),word)]
segments.sort(key = lambda seg: seg[0], reverse=True)
(score, meanWord) = segments[0]
print (score, targetLength)
newMeans += [(meanWord,targetLength,paringBufferLength)]
return newMeans
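
#Sketch of the window selection (hypothetical inputs): with alignments of
#mean length 6 and paringBufferLength = 2, every 10-column window of the
#(possibly padded) mean is scored and the best one becomes the new mean;
#the returned offset is always paringBufferLength.
# >>> segPare((someMean,-1,-1), someLengthAlignments)[0][2]
# 2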
# #Currently the algorithm pares every mean every iteration, maybe too much?
# def paredMeans(means,varAlignmentMatrix):
# newMeans = []
# for j in range(len(means)):
# #make sure the pare function doesn't change the alignment matrix
# newMeans += pare(means[j],list(varAlignmentMatrix[j]))
# return newMeans
#Currently the algorithm pares every mean every iteration, maybe too much?
def paredMeans(means,varAlignmentMatrix):
newMeans = []
for j in range(len(means)):
#make sure the pare function doesn't change the alignment matrix
newMeans += segPare(means[j],list(varAlignmentMatrix[j]))
return newMeans
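
#Driver sketch (hypothetical names): means holds boxed means and
#varAlignmentMatrix[j] holds the (i,m,score,length) alignments for mean j;
#list() hands segPare a copy of each row so the shared matrix stays intact.
# >>> means = paredMeans(means, varAlignmentMatrix)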
# # Uses originalPare, but unboxes
# def paredMeans(means,alignmentMatrix):
# newMeans = []
# for mean in means:
# newMeans += originalPare(mean)
# return newMeans