-
Notifications
You must be signed in to change notification settings - Fork 1
/
Copy pathclassifier.py
140 lines (119 loc) · 6.39 KB
/
classifier.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
# Written by Edmund Hee - 20160618
import os, json, re, sys, numpy
from nltk.stem import WordNetLemmatizer
from optparse import OptionParser
from stopword import Stopword
import cPickle as pickle
from lda import LDA
class Classifier(object):
    """Train a labelled LDA model from a delimited text file and classify text with it.

    Training rows are read from ``file_dir``; each row is lower-cased and split
    on the three-character delimiter ``"|"`` into a text field (index 0) and a
    label field (index 1).  The trained model object is pickled next to the
    training data as ``<model_name>_trained.p``.
    """

    # Suffix appended to a model name to form its pickle file name.
    MODEL_SUFFIX = "_trained.p"
    # Separator between the text field and the label field of a training row.
    FIELD_DELIMITER = '"|"'
    # Words whose lemma contains any digit are discarded.
    _DIGIT = re.compile(r'\d')

    def __init__(self, options):
        """Store the parsed options, ensure the build directory exists, load stop words.

        ``options`` must expose ``K``, ``alpha``, ``beta`` and ``iteration``
        (read by ``train_model``).
        """
        self.options = options
        self.file_dir = "./build/"
        self.labels = []
        self.corpus = []
        self.labelset = []
        self.lda = None
        if not os.path.exists(self.file_dir):
            # Create the directory that file_dir names rather than a second
            # hard-coded spelling of it.
            os.makedirs(self.file_dir)
        self.stopwords = self.get_stopwords()

    def train_model(self, filename, model_name):
        """Build the corpus from ``filename``, fit the model, print topics, save it."""
        self.create_label_corpus(filename)
        self.lda = LDA(self.options.K, self.options.alpha, self.options.beta)
        self.lda.set_corpus(self.labelset, self.corpus, self.labels)
        print("M=%d, V=%d, L=%d, K=%d" % (
            len(self.corpus), len(self.lda.vocas), len(self.labelset), self.options.K))
        # NOTE(review): as received, this loop only reports perplexity; no
        # sampling/inference step is invoked on the model.  Confirm against
        # lda.LDA whether perplexity() advances the sampler or a call is missing.
        for index in range(self.options.iteration):
            sys.stderr.write("-- %d : %.4f\n" % (index, self.lda.perplexity()))
        print("perplexity : %.4f" % self.lda.perplexity())
        phi = self.lda.phi()
        for k, label in enumerate(self.labelset):
            print("\n-- label %d : %s" % (k, label))
            # Words of this topic, highest weight first.
            for w in numpy.argsort(-phi[k]):
                print("%s: %f" % (self.lda.vocas[w], phi[k, w]))
        self.save_model(model_name)

    def lemmatize(self, string):
        """Return the verb lemma of ``string`` according to WordNet."""
        return WordNetLemmatizer().lemmatize(string, pos='v')

    def _select_words(self, words):
        """Return the lemmas of ``words`` that survive the stop-word/length/digit filter.

        A word is kept when neither it nor its lemma is a stop word, the word
        is longer than two characters, and the lemma contains no digit.
        """
        selected = []
        for word in words:
            lemma = self.lemmatize(word)
            if (word not in self.stopwords
                    and len(word) > 2
                    and not self._DIGIT.search(lemma)
                    and lemma not in self.stopwords):
                selected.append(lemma)
        return selected

    @staticmethod
    def _to_text(value):
        """Decode a UTF-8 byte string to text; return text values unchanged."""
        if isinstance(value, bytes):
            return value.decode("utf-8")
        return value

    def create_label_corpus(self, filename):
        """Append the documents and label lists found in ``filename`` to the corpus.

        Populates ``self.corpus`` (one list of filtered lemmas per row),
        ``self.labels`` (one list of label names per row) and ``self.labelset``
        (the distinct label names).  Rows are appended, so calling this twice
        accumulates both files.  Rows without the field delimiter (for example
        blank lines) are skipped instead of raising IndexError.
        """
        with open(os.path.join(self.file_dir, filename)) as model:
            for row in model:
                fields = row.lower().split(self.FIELD_DELIMITER)
                if len(fields) < 2:
                    continue
                label_names = [self._to_text(name)
                               for name in self.filter_split(fields[1])]
                self.corpus.append(self._select_words(self.filter_split(fields[0])))
                self.labels.append(label_names)
        # One pass over all label lists; also well-defined for an empty corpus.
        self.labelset = list({name for names in self.labels for name in names})

    def filter_split(self, label):
        """Replace every run of non-word characters with a space and split on whitespace."""
        return re.sub(r'\W+', ' ', label).split()

    @staticmethod
    def _best_topic(scores):
        """Return the index of the highest score among topics 1..K-1, or None.

        Index 0 is never returned, matching the original rule of taking the
        runner-up whenever topic 0 scored highest (presumably topic 0 is the
        model's shared/common topic; confirm against lda.LDA).  Ties resolve
        to the lowest index.
        """
        if scores.size < 2:
            return None
        return 1 + int(numpy.argmax(scores[1:]))

    def classify(self, model_name, label):
        """Print and return the label name whose topic best matches ``label``.

        Each known word of ``label`` contributes its per-topic weight from the
        model's ``phi()`` matrix; the topic with the largest total wins.

        Returns None when the model file is missing, when ``label`` contains
        no word known to the model, or when the model has no topic besides
        index 0.  Raises ValueError when the winning topic has no entry in the
        model's label map.
        """
        self.lda = self.load_model(model_name)
        if self.lda is None:
            # load_model has already reported the missing file.
            return None
        self.stopwords = self.get_stopwords()

        # Training lower-cases every row, so the query must be lower-cased too
        # or capitalised words would never be found in the vocabulary.
        words = self.filter_split((label or "").lower())
        vocas_id = self.lda.vocas_id
        word_ids = [vocas_id[lemma]
                    for lemma in self._select_words(words)
                    if lemma in vocas_id]
        if not word_ids:
            print("No known words found in label; unable to classify")
            return None

        topic_count = self.lda.K
        phi = numpy.asarray(self.lda.phi())
        result_vector = numpy.zeros(topic_count)
        for word_id in word_ids:
            result_vector += phi[:topic_count, word_id]

        best = self._best_topic(result_vector)
        if best is None:
            return None
        for name, topic in self.lda.labelmap.items():
            if topic == best:
                print(name)
                return name
        raise ValueError("topic %d has no entry in the model's label map" % best)

    def _model_path(self, model_name):
        """Return the path of the pickle file that stores ``model_name``."""
        return os.path.join(self.file_dir, model_name + self.MODEL_SUFFIX)

    def save_model(self, model_name):
        """Pickle the current model to ``<file_dir>/<model_name>_trained.p``."""
        with open(self._model_path(model_name), 'wb') as model_file:
            pickle.dump(self.lda, model_file, protocol=pickle.HIGHEST_PROTOCOL)

    def load_model(self, model_name):
        """Return the unpickled model for ``model_name``, or None if no file exists."""
        path = self._model_path(model_name)
        if not os.path.isfile(path):
            print("Trained model for %s is not found in \"%s\" directory"
                  % (model_name, self.file_dir))
            print("Please train the model")
            return None
        # SECURITY: unpickling can execute arbitrary code stored in the file;
        # only load model files produced by a trusted training run.
        with open(path, 'rb') as model_file:
            return pickle.load(model_file)

    def get_stopwords(self):
        """Return the stop words provided by ``Stopword`` for the build directory."""
        return Stopword(self.file_dir).get_stopwords()
def build_parser():
    """Return the OptionParser describing this script's command-line options."""
    parser = OptionParser()
    parser.add_option("-f", dest="filename", type="string", help="File Name")
    parser.add_option("-m", dest="model_name", type="string", help="Model name")
    parser.add_option("-l", dest="label", type="string", help="Label to pass for classification")
    parser.add_option("--alpha", dest="alpha", type="float", help="parameter alpha", default=0.005)
    parser.add_option("--beta", dest="beta", type="float", help="parameter beta", default=0.005)
    parser.add_option("-k", dest="K", type="int", help="number of topics", default=10)
    parser.add_option("-i", dest="iteration", type="int", help="iteration count", default=100)
    parser.add_option("-t", dest="train", help="train data", default=False, action='store_true')
    parser.add_option("-c", dest="classify", help="Classify data", default=False, action='store_true')
    parser.add_option("--sos", dest="help", help="Help ME!", default=False, action='store_true')
    return parser


# Extended usage text printed for --sos.  Only options that build_parser()
# actually defines are listed, and no default is claimed for -f because none
# is configured.
USAGE_LINES = (
    "--alpha : Set alpha value (default 0.005)",
    "--beta : Set beta value (default: 0.005)",
    "-t : Train with training data in build directory",
    "-c : Get classification result from trained model",
    "-m : Model name of data in build directory",
    "-l : label to predict (e.g -l \"<TITLE>\")",
    "-k : Number of topics (default: 10) * will change according to data set",
    "-i : Number of iterations (default: 100)",
    "-f : Filename of training data in build directory (e.g -f <FILENAME>)",
    "\n",
    "To train data",
    "python classifier.py -m <MODEL_NAME> -f <TEXT_FILE_NAME>.txt -t",
    "To classify",
    "python classifier.py -c -m <MODEL_NAME> -l \"<LABEL TO PASS INTO LDA>\"",
)


def main(argv=None):
    """Parse ``argv`` (default: the process arguments) and run the requested mode.

    -t trains a model, -c classifies a label, --sos prints extended usage.
    The classifier is only constructed for -t and -c, so asking for help does
    not create the build directory or load stop words.  Missing required
    options are reported through the parser instead of failing later with a
    None value.
    """
    parser = build_parser()
    options, _ = parser.parse_args(argv)

    if options.train:
        if not options.filename or not options.model_name:
            parser.error("-t requires -f <FILENAME> and -m <MODEL_NAME>")
        Classifier(options).train_model(options.filename, options.model_name)
    elif options.classify:
        if not options.model_name or not options.label:
            parser.error("-c requires -m <MODEL_NAME> and -l <LABEL>")
        Classifier(options).classify(options.model_name, options.label)
    elif options.help:
        for line in USAGE_LINES:
            print(line)
    else:
        print("Please Select Type: -t for train model -c for classification")


if __name__ == "__main__":
    main()