-
Notifications
You must be signed in to change notification settings - Fork 1
/
Copy pathBA2D.py
120 lines (94 loc) · 2.24 KB
/
BA2D.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
import math
def profile_most_probable_kmer(dna, k, profile):
dic = profile
kmers = []
n = len(dna)
for i in range(n-k+1):
kmers.append(dna[i:i+k])
ans = []
dic_kmer = {}
for kmer in kmers:
small_ans = 1.0
for i, base in enumerate(kmer):
small_ans *= (dic[base][i])
ans.append(small_ans)
dic_kmer[kmer] = small_ans
mn = max(ans)
for k in dic_kmer:
if dic_kmer[k] == max(ans):
return k
def create_profile(motifs):
k = len(motifs[0])
profile = {
'A': [0]*k,
'C': [0]*k,
'G': [0]*k,
'T': [0]*k,
}
for i in range(len(motifs)):
for j, base in enumerate(motifs[i]):
profile[base][j] += 1
return profile
def consensus_string(motifs):
c_s = ""
for i in range(len(motifs[0])):
dic = {
'A': 0,
'C': 0,
'G': 0,
'T': 0
}
for motif in motifs:
dic[motif[i]] += 1
mx = max(dic.values())
for k in dic:
if mx == dic[k]:
c_s += k
break
return c_s
def hamming_distance(dna, p):
cnt = 0
for i in range(len(p)):
if not(dna[i] == p[i]):
cnt += 1
return cnt
def score(consensus_motif, motifs):
total_score = 0.0
for motif in motifs:
d = hamming_distance(consensus_motif, motif)
total_score += d
return total_score
def greedy_motif_search(dnas, k, t):
best_motifs = []
best_score = math.inf
for dna in dnas:
best_motifs.append(dna[:k])
first_dna = dnas[0]
n = len(first_dna)
for i in range(n-k+1):
motifs = []
motifs.append(first_dna[i:i+k])
for j in range(1, t):
profile = create_profile(motifs)
frequency_profile = profile.copy()
for key in frequency_profile:
frequency_profile[key] = [val/t for val in frequency_profile[key]]
next_choice = profile_most_probable_kmer(dnas[j], k, frequency_profile)
motifs.append(next_choice)
consensus_motif = consensus_string(motifs)
score_motif = score(consensus_motif, motifs)
if score_motif < best_score:
best_motifs = motifs
best_score = score_motif
return best_motifs
DNA = [
"GGCGTTCAGGCA",
"AAGAATCAGTCA",
"CAAGGAGTTCGC",
"CACGTCAATCAC",
"CAATAATATTCG"
]
k, t = 3, 5
motifs = greedy_motif_search(DNA, k, t)
for motif in motifs:
print(motif)