-
Notifications
You must be signed in to change notification settings - Fork 0
/
project1.py
174 lines (142 loc) · 6.2 KB
/
project1.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
import networkx as nx
import csv
import numpy as np
import math
import time
import matplotlib
def read(file): # turns file into two arrays: a list of the variables of each node and a list of rows of data points
data = []
with open(file, newline='\n') as f:
reader = csv.reader(f, delimiter=',')
nodes = list(next(reader))
for row in reader:
line = [int(x) for x in row]
data.append(line)
return nodes, data
def get_num_instantiations(node, nodes, data): # return the number of instantiations that a variable can take on
ind = nodes.index(node)
col = []
for i in range(len(data)):
col.append(data[i][ind])
return max(col)
def get_parents(node, graph): # return the parents of a node in list form
if not graph.predecessors(node):
return []
else:
return list(graph.predecessors(node))
def p_instantiations(parents, nodes, data): # return list of lists of parental instantiation permutations in order
num_parents = len(parents)
instants, matrix = [], []
for i in range(num_parents):
instants.append(get_num_instantiations(parents[i], nodes, data))
# base case: only one parent; return instantiations
if num_parents == 1:
for i in range(1, instants[0] + 1):
row = [i]
matrix.append(row)
return matrix
else:
remaining_p_instants = p_instantiations(parents[1:], nodes, data) # get instants for remaining parents
for i in range(1, instants[0] + 1): # iterate through instants of first parent
for j in remaining_p_instants: # iterate through instants of remaining parents
matrix.append([i] + j) # combine val of first parent with list of instants for remaining parents
return matrix
def fill_matrix(node, nodes, graph, data): # return the n x m counts matrix for a single node
instants = get_num_instantiations(node, nodes, data)
parents = get_parents(node, graph)
rows, j = 1, 0 # if the node has no parents, we want a 1 x number_instantiations matrix and j = 0 because we only have one row
if len(parents) > 0:
p_instants = p_instantiations(parents, nodes, data)
rows = len(p_instants)
count = np.zeros((rows, instants), dtype='i')
for i in range(len(data)):
if len(parents) > 0:
p_vals = []
for p in range(len(parents)): # check if parents have correct instantiations
p_vals.append(data[i][nodes.index(parents[p])])
j = p_instants.index(p_vals)
data_point = data[i][nodes.index(node)]
count[j][data_point - 1] += 1
return count
# return a prior (matrix of all 1s) with the same shape as the n x m matrix of counts for that node
def prior(node, nodes, graph, data):
instants = get_num_instantiations(node, nodes, data)
parents = get_parents(node, graph)
rows = 1
if len(parents) > 0:
p_instants = p_instantiations(parents, nodes, data)
rows = len(p_instants)
p = np.ones((rows, instants), dtype='i') # creates an all-ones matrix with the correct dimensions
return p
def final_m(nodes, data, graph): # returns list of count matrices, one for each node
final = []
for i in range(len(nodes)):
final.append(fill_matrix(nodes[i], nodes, graph, data))
return final
def final_a(nodes, data, graph): # returns list of prior matrices, one for each node
final = []
for i in range(len(nodes)):
final.append(prior(nodes[i], nodes, graph, data))
return final
def bayesian_score_component_r(j, M, a): # computes the rightmost component of the bayes score algorithm (sum ri)
sum_r = 0
for k in range(a.shape[1]): # COLS
sum_r += (math.lgamma(a[j][k] + M[j][k]) - math.lgamma(a[j][k]))
return sum_r
def bayesian_score_component_q(M, a): # computes the rest of the bayes score algorithm (sum qi)
sum_q = 0
for j in range(a.shape[0]): # ROWS
sum_q += ((math.lgamma(sum(a[j])) - math.lgamma(sum(a[j]) + sum(M[j]))) + bayesian_score_component_r(j, M, a))
return sum_q
def bayes_score(nodes, data, graph): # computes final bayes score, adding up the score for each node (sum n)
M, a = final_m(nodes, data, graph), final_a(nodes, data, graph)
n = len(nodes)
score = 0
for i in range(n):
score += bayesian_score_component_q(M[i], a[i])
return score
def valid_edge(child, graph, max_parents): # ensures that adding an edge between two nodes does not create a cyclic graph or exceed a maximum number of parents
if nx.is_directed_acyclic_graph(graph) and len(get_parents(child, graph)) <= max_parents:
return True
return False
def k2_search(nodes, data, max_parents): # standard K2 search algorithm
G = nx.DiGraph() # with n number of nodes
G.add_nodes_from(nodes)
for k in range(1, len(nodes)):
score = bayes_score(nodes, data, G)
while True:
best_score = -10e10
best_node = 0
for j in range(0, k):
if not G.has_edge(nodes[j], nodes[k]):
G.add_edge(nodes[j], nodes[k])
if valid_edge(nodes[k], G, max_parents):
score_temp = bayes_score(nodes, data, G)
if score_temp > best_score:
best_score = score_temp
best_node = nodes[j]
G.remove_edge(nodes[j], nodes[k])
else:
G.remove_edge(nodes[j], nodes[k])
if best_score > score:
score = best_score
G.add_edge(best_node, nodes[k])
else:
break
return G
def write_gph(dag, filename):
with open(filename, 'w') as f:
for edge in dag.edges():
f.write("{}, {}\n".format(edge[0], edge[1]))
def main():
start_time = time.time()
nodes, data = read("data/large.csv")[0], read("data/large.csv")[1]
G = k2_search(nodes, data, 4)
print(bayes_score(nodes, data, G))
#nx.draw_networkx(G, with_labels=True)
#matplotlib.pyplot.show()
end_time = time.time()
print(end_time-start_time)
return write_gph(G, "large.gph")
if __name__ == '__main__':
main()