-
Notifications
You must be signed in to change notification settings - Fork 1
/
nbc.py
executable file
·113 lines (94 loc) · 3.4 KB
/
nbc.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
"""
# Naive Bayes Classifier
Classifier using Naive Density Estimator
Command: `python nbc.py car.data car.test`
The output lists, one line per test row, the attribute values followed by the predicted class and whether the prediction was correct; the last two lines report the accuracy and the execution time:
```
{'LugBoot': 'big', 'Maint': 'low', 'Persons': 'more', 'Safety': 'low', 'Doors': '5more', 'Buying': 'low'} Quality: unacc (True)
{'LugBoot': 'big', 'Maint': 'low', 'Persons': 'more', 'Safety': 'med', 'Doors': '5more', 'Buying': 'low'} Quality: acc (False)
{'LugBoot': 'big', 'Maint': 'low', 'Persons': 'more', 'Safety': 'high', 'Doors': '5more', 'Buying': 'low'} Quality: vgood (True)
Accuracy: 63.7426900585%
Execution Time: 0.186897993088s
```
@author [email protected]
"""
import numpy as np
import math
import sys
import time
# NBC class
class NBC(object):
    """Naive Bayes classifier for categorical attributes.

    Class priors and conditional probabilities are plain relative
    frequencies counted from the training rows (no smoothing).

    Attributes:
        attrs: sequence of attribute names; position ``i`` names column ``i``
            of ``rows``.
        rows: 2-D array of training records. It is indexed as
            ``rows[:, col]`` and filtered with boolean masks, so it must
            support NumPy-style indexing.
        coly: index of the column in ``rows`` holding the class label.
        disy: dict mapping each class label to its prior probability.
        p: cache of conditional probabilities, keyed by the tuple
            ``(attribute, value, class_label)``.
    """

    def __init__(self, attrs, rows, coly):
        """Store the training data and compute the class priors."""
        self.attrs = attrs
        self.rows = rows
        self.coly = coly
        self.disy = self.distributions(self.rows[:, self.coly])
        self.p = {}  # probability cache

    def predict(self, values):
        """Return the most probable class label for one record.

        Args:
            values: dict mapping attribute name -> observed value.

        Returns:
            The class label maximising ``P(y) * prod(P(attr = value | y))``.
            If every class scores zero (for example, a value never seen in
            training), the class with the highest prior is returned instead
            of ``None``. ``None`` is returned only when there is no training
            data at all.

        Raises:
            ValueError: if ``values`` contains an unknown attribute name.
        """
        best_label = None
        best_score = 0.0
        for valy, prior in self.disy.items():
            score = prior
            for c, val in values.items():
                # p(c = val | y = valy)
                score *= self.probability(c, val, valy)
            # strict '>' keeps the first class seen on ties
            if score > best_score:
                best_score = score
                best_label = valy
        if best_label is None and self.disy:
            # Without smoothing a single unseen value zeroes every class;
            # the prior is the best information left.
            best_label = max(self.disy, key=self.disy.get)
        return best_label

    def probability(self, c, val, valy):
        """Return the estimate of ``P(c = val | class = valy)``.

        Args:
            c: attribute name (must be present in ``self.attrs``).
            val: attribute value.
            valy: class label.

        Returns:
            Fraction of training rows labelled ``valy`` whose attribute ``c``
            equals ``val``; ``0.0`` when no training row has label ``valy``.

        Raises:
            ValueError: if ``c`` is not one of ``self.attrs``.
        """
        # A tuple key cannot collide the way 'c#val#valy' string
        # concatenation does when a component itself contains '#'.
        cache_key = (c, val, valy)
        if cache_key in self.p:
            return self.p[cache_key]

        # rows where coly = valy
        rowsvaly = self.rows[self.rows[:, self.coly] == valy]
        # list() works for both ndarrays and plain sequences
        indexc = list(self.attrs).index(c)
        total = len(rowsvaly)
        if total == 0:
            # unknown class label: avoid dividing by zero
            res = 0.0
        else:
            # rowsvaly where c = val / len(rowsvaly)
            res = len(rowsvaly[rowsvaly[:, indexc] == val]) / float(total)
        self.p[cache_key] = res
        return res

    @staticmethod
    def distributions(arr):
        """Return the relative frequency of each distinct element of ``arr``.

        Occurrences are counted as integers and divided once, so the result
        does not accumulate floating-point error from repeated ``1/n`` sums.

        Args:
            arr: iterable of hashable items with a length.

        Returns:
            dict mapping element -> frequency in ``[0, 1]``; empty for empty
            input.
        """
        counts = {}
        for x in arr:
            counts[x] = counts.get(x, 0) + 1
        n = float(len(arr))
        return dict((key, count / n) for key, count in counts.items())
# main program
if __name__ == "__main__":
    # Script entry point: train on argv[1], evaluate on argv[2], then print
    # one line per test record followed by accuracy and execution time.
    execution_time = -time.time()

    # read training and testing data paths from argv
    if len(sys.argv) < 3:
        sys.exit('Usage: python nbc.py <train_file> <test_file>')
    train_file = sys.argv[1]
    test_file = sys.argv[2]

    # read input file; `with` closes the handle, and blank lines are skipped
    # so a trailing newline cannot produce a ragged (non-rectangular) array
    with open(train_file, 'r') as train_handle:
        raw = [line.strip().split(',') for line in train_handle if line.strip()]
    if not raw:
        sys.exit('Training file is empty: {}'.format(train_file))
    data = np.array(raw)

    # setup NBC
    coly = -1  # the class label is the last column
    attrs = data[0, :coly]  # attributes (header row without the label)
    rows = data[1:, :]  # datasets (including results)
    yclass = data[0, coly]  # name of the class column, used in the output
    nbc = NBC(attrs, rows, coly)

    # read test file (same format as the training file, header included)
    with open(test_file, 'r') as test_handle:
        rawtest = [line.strip().split(',') for line in test_handle if line.strip()]

    # predict every record after the header row
    records = rawtest[1:]
    total_correct = 0
    for record in records:
        values = {}
        for j, attr in enumerate(attrs):
            values[attr] = record[j]
        prediction = nbc.predict(values)
        correct = (prediction == record[coly])
        total_correct += 1 if correct else 0
        print('{} {}: {} ({})'.format(values, yclass, prediction, correct))

    # a header-only test file must not divide by zero
    if records:
        accuracy = total_correct / float(len(records)) * 100
    else:
        accuracy = 0.0
    execution_time += time.time()
    print('Accuracy: {}%'.format(accuracy))
    print('Execution Time: {}s'.format(execution_time))