-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathNaiveBayesClassifier.py
128 lines (72 loc) · 2.61 KB
/
NaiveBayesClassifier.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
import os
from sklearn.model_selection import cross_val_score
from sklearn.naive_bayes import GaussianNB
from importChatFile import getFile
from posTagging import posTagsFromObject, posTagsFromFile, posTrigrams
from pprint import pprint
import pylangacq as pla
from sklearn import metrics
import numpy as np
from pprint import pprint
from sklearn.preprocessing import OneHotEncoder
# Read every training transcript matched by the glob below.
train_files = pla.read_chat("/Users/sandeep/Google Drive/Sandeep/College/4th Semester/Computational Linguistics/Computation-Linguistics-Final-Project/Data/Train/*.cha")

# Numeric class id keyed by the first two characters of a transcript's filename.
types = {'SE' : 3, 'LP' : 1, 'SD' : 2}

# X_train holds one posTagsFromFile() result per file; y_train holds the
# matching class id at the same position.
X_train, y_train = [], []
for chat_path in train_files.filenames():
    pos_tags = posTagsFromFile(chat_path)
    prefix = os.path.basename(chat_path)[:2]
    # An unknown prefix raises KeyError before either list is extended.
    y_train.append(types[prefix])
    X_train.append(pos_tags)
# Collect every distinct tag that occurs in the training data, in first-seen
# order. A tag's position in this list later serves as its numeric index.
labels = []
for tag_sequence in X_train:
    for tag in tag_sequence:
        if tag in labels:
            continue
        labels.append(tag)
###Call your function with X_train as param here###
########################
# Read every test transcript matched by the glob below.
test_files = pla.read_chat("/Users/sandeep/Google Drive/Sandeep/College/4th Semester/Computational Linguistics/Computation-Linguistics-Final-Project/Data/Test/*.cha")

# X_test holds one posTagsFromFile() result per file; y_test holds the class id
# looked up in `types` from the first two characters of the filename.
X_test, y_test = [], []
for chat_path in test_files.filenames():
    pos_tags = posTagsFromFile(chat_path)
    prefix = os.path.basename(chat_path)[:2]
    # An unknown prefix raises KeyError before either list is extended.
    y_test.append(types[prefix])
    X_test.append(pos_tags)
###Call your function with X_test as param here###
#######################
# Append tags that occur only in the test data, so that every test tag can be
# found in `labels` when tags are converted to indices.
for tag_sequence in X_test:
    for tag in tag_sequence:
        if tag in labels:
            continue
        labels.append(tag)
# Overwrite each tag, in place, with the string form of its position in
# `labels`. The test sequences are converted first, then the training ones.
for tag_sequence in X_test:
    for position, tag in enumerate(tag_sequence):
        tag_sequence[position] = str(labels.index(tag))

for tag_sequence in X_train:
    for position, tag in enumerate(tag_sequence):
        tag_sequence[position] = str(labels.index(tag))
# Run posTrigrams() over every encoded tag sequence of each split
# (presumably producing POS-trigram features -- confirm against posTagging).
#
# The test features must come from X_test. Building them from X_train makes
# the "test" set a copy of the training features: the model would be scored on
# data it was fitted on, and the number of feature rows would no longer be
# guaranteed to match the number of entries in y_test.
X_train_trigrams = [posTrigrams(tag_sequence) for tag_sequence in X_train]
X_test_trigrams = [posTrigrams(tag_sequence) for tag_sequence in X_test]
def _as_array(rows):
    """Turn each element of *rows* into an array and wrap them in one outer array."""
    return np.asarray([np.array(row) for row in rows])


# Combined train + test data. The concatenation has to happen here, while
# X_train / X_test / y_train / y_test are still plain Python lists.
X_data = X_train + X_test
y_data = y_train + y_test

# Debug output of the test-split trigram features.
print(X_test_trigrams)

X_data = _as_array(X_data)
y_data = _as_array(y_data)

# From here on the train/test names refer to the trigram features, not to the
# encoded tag sequences they held before.
X_train = _as_array(X_train_trigrams)
y_train = _as_array(y_train)
X_test = _as_array(X_test_trigrams)
y_test = _as_array(y_test)
# pprint(X_test)  # debugging aid, left disabled

# Fit a Gaussian naive Bayes classifier on the training features and report
# its score on the test split.
model = GaussianNB().fit(X_train, y_train)
test_score = model.score(X_test, y_test)
print("Score: ", test_score)

# Alternative evaluation (explicit predictions plus 10-fold cross-validation),
# kept disabled inside a string literal.
'''
predictions = model.predict(X_test)
print('Accuracy Score: ', metrics.accuracy_score(y_test, predictions))
cVal = cross_val_score(GaussianNB(), X_train, y_train, scoring='accuracy', cv=10 )
print("Cross-Validated Score: ", cVal.mean())
'''