-
Notifications
You must be signed in to change notification settings - Fork 1
/
BA2E.py
112 lines (96 loc) · 2.44 KB
/
BA2E.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
import math
def hamming_distance(text1, text2):
cnt = 0
for i in range(len(text1)):
if not(text1[i] == text2[i]):
cnt += 1
return cnt
def score(motifs, consensus):
total_score = 0
for motif in motifs:
total_score += hamming_distance(motif, consensus)
return total_score
def consensus_string(motifs):
k = len(motifs[0])
c_s = ""
for i in range(k):
dic = {
'A': 0,
'C': 0,
'G': 0,
'T': 0
}
for motif in motifs:
ch = motif[i]
dic[ch] += 1
mx = max(dic.values())
for key in dic:
if mx == dic[key]:
c_s += key
break
return c_s
def get_profile_most_probable_kmer(profile, dna, k, t):
frequency_matrix = profile.copy()
for key in frequency_matrix:
frequency_matrix[key] = [(val+1)/(t+4) for val in frequency_matrix[key]]
n = len(dna)
kmer_list = []
max_score = -1.0
for i in range(n-k+1):
score = 1.0
kmer = dna[i:i+k]
for j, ch in enumerate(kmer):
score *= frequency_matrix[ch][j]
max_score = max(max_score, score)
kmer_list.append((kmer, score))
for kmer in kmer_list:
if max_score == kmer[1]:
return kmer[0]
def create_profile(motifs, k):
matrix = {
'A': [0]*k,
'C': [0]*k,
'G': [0]*k,
'T': [0]*k
}
for motif in motifs:
for i, ch in enumerate(motif):
matrix[ch][i] += 1
return matrix
def greedy_motif_search_with_pseudocount(dnas, k, t):
best_motifs = []
best_score = math.inf
for dna in dnas:
best_motifs.append(dna[:k])
n = len(dnas[0])
first_dna = dnas[0]
for i in range(n-k+1):
motifs = [first_dna[i:i+k]]
for j in range(1, t):
profile = create_profile(motifs, k)
profile_most_probable_kmer = get_profile_most_probable_kmer(profile, dnas[j], k, t)
motifs.append(profile_most_probable_kmer)
consensus = consensus_string(motifs)
score_motif = score(motifs, consensus)
if score_motif < best_score:
best_score = score_motif
best_motifs = motifs
return best_motifs
if __name__ == "__main__":
# k, t = 3, 5
# dnas = [
# "GGCGTTCAGGCA",
# "AAGAATCAGTCA",
# "CAAGGAGTTCGC",
# "CACGTCAATCAC",
# "CAATAATATTCG"
# ]
with open('rosalind_ba2e.txt') as file:
f = file.read().split("\n")
first = f[0].split()
k = int(first[0])
t = int(first[1])
dnas = f[1:]
motifs = greedy_motif_search_with_pseudocount(dnas, k, t)
for motif in motifs:
print(motif)