-
Notifications
You must be signed in to change notification settings - Fork 0
/
Sayers_Problem_Four.py
126 lines (108 loc) · 3.04 KB
/
Sayers_Problem_Four.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
#Kevin Sayers
#1/15/2017
from Bio.Seq import Seq
import networkx as nx #used to plot the network graph
import matplotlib.pyplot as plt
#readfile: reads data from file
#input: filename
#output: none
#return: list of lines from the file
def readfile(infile):
data = []
with open(infile) as f:
data = f.read().splitlines()
return data
#reverse_comp: returns the reverse complement of a sequence
#input: a DNA sequence as a string
#output: none
#return: a string representing the reverse complement
def reverse_comp(seq):
return str(Seq(seq).reverse_complement())
#getelements: makes a set of kmers and their reverse complement
#input: list of kmers
#output: none
#return: a set of all kmers and their reverse complement
def getelements(inlist):
result_set = set()
for i in inlist:
result_set.add(i)
result_set.add(reverse_comp(i))
return result_set
#getedges: returns the edges for each node
#input: kmers, and k value
#output:none
#return: a list of tuples for the edges
def getedges(elements,k):
result = []
for i in elements:
result.append((i[0:k-1],i[1:k]))
return result
#getcyclic: finds the cyclic superstring. It finds the edges needed for each kmer once only.
#this is then passed to recurseedges which traverses the nodes creating the superstring.
#input: edges, kmers, and k value
#output: none
#return: a shortest cyclic superstring
def getcyclic(edges,kmers,k):
visited = []
sups = []
superstr = ""
finaledges = []
for i in kmers:
if (i[:k-1],i[1:]) in edges:
finaledges.append((i[:k-1],i[1:]))
return recurseedges(finaledges)
#recurseedges: this function traverses the linked nodes creating the shortest superstring.
#input: edges
#output: none
#return: a shortest cyclic superstring
def recurseedges(edges):
curi = 0
superstr = ""
#The algorithm works by removing visited edges as it traverses.
#it is initialized at an index of 0
while len(edges) > 0:
current = edges.pop(curi)
superstr += current[1][-1]
for i in range(0,len(edges)):
if edges[i][0] == current[1]:
curi = i
return (superstr)
#plotgraph: This function uses networkx module to plot the nodes and edges.
#input: nodes, edges, k
#output: a matplotlib graph of the network
#return: none
def plotgraph(nodes,edges,k):
finaledges = []
for i in nodes:
if (i[:k-1],i[1:]) in edges:
finaledges.append((i[:k-1],i[1:]))
newnodes = []
finaledges = sorted(finaledges)
edgelabels = []
labeldict = {}
for p in finaledges:
newnodes.append(p[0])
labeldict[p] = p[1][-1]
g = nx.DiGraph()
g.add_nodes_from(newnodes)
g.add_edges_from(finaledges)
pos = nx.circular_layout(g)
# labels = nx.draw_networkx_edge_labels(g,pos,labeldict)
nx.draw(g, with_labels=True)
plt.show()
def main():
input = readfile("n4.txt")
k = len(input[0])
elements = getelements(input)
result = getedges(elements,k)
final = getcyclic(result,input,k)
out = open("q4.txt","w")
out.write(final)
plotgraph(input,result,k)
# result = sorted(result)
# for i in result:
# print (i)
# out.write(str(i).replace("'","")+"\n")
# out.close()
if __name__ == "__main__":
main()