-
Notifications
You must be signed in to change notification settings - Fork 0
/
main.py
167 lines (149 loc) · 6.69 KB
/
main.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
import align
import cluster
import seed
import matrixSeed
import paring
import math
import copy #need to make deep copies anywhere?
import scipy
from scipy import stats
#Welch's t-test p-value threshold: the clustering loop repeats while the
#test's p-value is below this, and terminates once it reaches it.
#Tune this to change how eagerly clustering stops.
probabilityThreshold = .96
#Heuristic for the starting number of cluster means: one mean for every 15
#peaks, on top of a constant floor of 5.
#(Future idea: use an asymptotic function seeded by the Welch's t-test
#p-value, and tie this to subsampling.)
def guessInitMeans (peaks):
	base = 5
	perPeakShare = len(peaks) // 15
	return base + perPeakShare
#Guesses how many additional means will be needed on the next iteration.
#Currently a fixed increment regardless of the peaks, current means, or
#p-value (the arguments are accepted for interface stability).
def guessNewMeans (peaks, means, p_val):
	fixedStep = 6
	return fixedStep
#Guesses how many more means will be needed when an expected k is known:
#the gap between the current mean count and k, plus one extra.
def guessNewMeansSetK (peaks, means, p_val, expectedK):
	shortfall = abs(expectedK - len(means))
	return shortfall + 1
#Now using varAlignmentMatrix
def variance (cluster, meanNum, varAlignmentMatrix):
	"""Return the variance of cluster number `meanNum`.

	Distance of a member from the mean is taken as (mean length - alignment
	score), and the variance is the average of those squared distances over
	the cluster's members (the mean itself, element 0, is excluded).

	cluster            -- [mean, member, member, ...]; element 0 is the mean
	varAlignmentMatrix -- per-mean rows of (i, m, score, length) alignments;
	                      row `meanNum` covers this cluster's members

	Returns 0 when nothing besides the mean was clustered here.
	"""
	sumVar = 0
	meanLength = len(cluster[0])
	for (i, m, score, length) in varAlignmentMatrix[meanNum]:
		# This definition of distance is kinda inconsistent -
		# alignment gives word and segment scores, not whole sequence alignments
		sumVar += (meanLength - score) ** 2
	n = len(cluster) - 1  # members, excluding the mean itself
	if n != len(varAlignmentMatrix[meanNum]):
		print("the number of peaks in the cluster doesn't match the number of alignments - invariant invalidity")
	if n > 0:
		# BUG FIX: in Python 2, int / int floor-divides, silently truncating
		# the variance; force true division so the result is a float.
		return sumVar / float(n)
	# Nothing clustered with this mean: define its variance as 0.
	else:
		return 0
# def variance (cluster,meanNum,alignmentMatrix):
# sumVar = 0
# meanLength = len(cluster[0])
# for peak in cluster[1:]:
# peakNum = peak[1]
# (i,m,score,length) = alignmentMatrix[peakNum][meanNum]
# #This definition of distance is kinda inconsistent -
# # alignment gives word and segment scores, not whole sequence alignments
# sumVar += (meanLength - score)**2
# n = len(cluster) - 1
# if n > 0:
# #Integer division is "//" while "/" does floating point
# return (sumVar / n)
# #What do you do if NOTHING gets clustered with the mean?
# else:
# return 0
# Returns the standard deviation of a cluster, measured by alignment score
# from the mean.
# BUG FIX: the original called variance(cluster) with a single argument,
# but variance takes three, so every call raised TypeError; the missing
# arguments are now passed through.
def stdDev (cluster, meanNum, varAlignmentMatrix):
	return math.sqrt(variance(cluster, meanNum, varAlignmentMatrix))
#New with lower memoization costs (only visible in variance calculation)
def welchTest (currClusters, alignmentMatrix, prevClusterVariances):
	"""Run Welch's t-test between the previous and current cluster variances.

	currClusters[0] is the outlier bucket, so variances are computed for
	clusters 1..end, paired with rows 0..end-1 of alignmentMatrix.

	Returns (p_val, currClusterVariances) so the caller can carry the new
	variances into the next iteration.
	"""
	currClusterVariances = [variance(currClusters[c + 1], c, alignmentMatrix)
	                        for c in range(len(currClusters) - 1)]
	# equal_var=False selects Welch's variant (unequal population variances).
	(t_stat, p_val) = scipy.stats.ttest_ind(prevClusterVariances, currClusterVariances, equal_var = False)
	print(p_val)
	return (p_val, currClusterVariances)
#Temporary initial step for the Welch's test (?): wrap every mean in its own
#singleton list so each can stand in as a one-element cluster.
def clustrifyMeans (means):
	# The nested brackets matter: a single level would just rebuild `means`.
	return [[m] for m in means]
#SO MUCH MEMOIZATION
# print a line when the variance goes back up between means
# Requires a list of peaks where the sequence is the first element of the peak
def main (peaks):
	"""Cluster `peaks` without a known k.

	Seeds an initial set of means, clusters once, then repeatedly pares the
	means, reseeds new ones from the outliers, and re-clusters until Welch's
	t-test reports the per-cluster variances have stabilized
	(p_val >= probabilityThreshold).

	Returns the final clusters; clusters[0] is the outlier bucket.
	"""
	# NOTE: Py2-only `print x` statements replaced with single-argument
	# `print(x)` calls, which behave identically under Python 2 and are
	# valid Python 3; dead commented-out seeding variants removed.
	means = matrixSeed.pickInitSeeds(peaks, guessInitMeans(peaks))
	print(peaks[0])
	# The extra list at the beginning is for outliers, initialized with all peaks.
	clusters = [peaks] + clustrifyMeans(means)
	alignmentMatrix = align.generate_align_matrix(peaks, means)
	# Placeholder variances so the first Welch's test can't trigger termination.
	clusterVariances = [0] * 5
	print('first runthrough of clustering')
	(means, clusters) = cluster.cluster(peaks, means, alignmentMatrix)
	varAlignmentMatrix = align.generate_var_align_matrix(clusters)
	print("starting welch's t-test clustering with centroid means")
	(p_val, clusterVariances) = welchTest(clusters, varAlignmentMatrix, clusterVariances)
	while p_val < probabilityThreshold:
		means = paring.paredMeans(means, varAlignmentMatrix)
		numNewMeans = guessNewMeans(peaks, means, p_val)
		# Currently no correlation between how many means were duplicated or
		# dropped in paring and how many (or from where) new ones are added.
		means += seed.pickNewMeansOutliersToRandom(clusters, numNewMeans)
		alignmentMatrix = align.generate_align_matrix(peaks, means)
		(means, clusters) = cluster.cluster(peaks, means, alignmentMatrix)
		varAlignmentMatrix = align.generate_var_align_matrix(clusters)
		(p_val, clusterVariances) = welchTest(clusters, varAlignmentMatrix, clusterVariances)
		print('finished clustering of subsequent k guess')
	return clusters
#Added for guessed-k functionality
# print a line when the variance goes back up between means?
# Requires a list of peaks where the sequence is the first element of the peak
def mainSetK (peaks, expectedK):
	"""Cluster `peaks` toward a known/expected number of clusters `expectedK`.

	Same loop as main(), but seeds exactly `expectedK` initial means and
	sizes each round's new-mean guess by the distance from the current mean
	count to `expectedK` (guessNewMeansSetK).

	Returns the final clusters; clusters[0] is the outlier bucket.
	"""
	# NOTE: Py2-only `print x` statements replaced with single-argument
	# `print(x)` calls, which behave identically under Python 2 and are
	# valid Python 3; dead commented-out seeding variants removed.
	means = matrixSeed.pickInitSeeds(peaks, expectedK)
	print(peaks[0])
	# The extra list at the beginning is for outliers, initialized with all peaks.
	clusters = [peaks] + clustrifyMeans(means)
	alignmentMatrix = align.generate_align_matrix(peaks, means)
	# Placeholder variances so the first Welch's test can't trigger termination.
	clusterVariances = [0] * 5
	print('first runthrough of clustering')
	(means, clusters) = cluster.cluster(peaks, means, alignmentMatrix)
	varAlignmentMatrix = align.generate_var_align_matrix(clusters)
	print("starting welch's t-test clustering with centroid means")
	(p_val, clusterVariances) = welchTest(clusters, varAlignmentMatrix, clusterVariances)
	while p_val < probabilityThreshold:
		means = paring.paredMeans(means, varAlignmentMatrix)
		numNewMeans = guessNewMeansSetK(peaks, means, p_val, expectedK)
		# Currently no correlation between how many means were duplicated or
		# dropped in paring and how many (or from where) new ones are added.
		means += seed.pickNewMeansOutliersToRandom(clusters, numNewMeans)
		alignmentMatrix = align.generate_align_matrix(peaks, means)
		(means, clusters) = cluster.cluster(peaks, means, alignmentMatrix)
		varAlignmentMatrix = align.generate_var_align_matrix(clusters)
		(p_val, clusterVariances) = welchTest(clusters, varAlignmentMatrix, clusterVariances)
		print('finished clustering of subsequent k guess')
	return clusters