-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathtrain_model.py
executable file
·164 lines (128 loc) · 5.1 KB
/
train_model.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
###############################################################################
# Written by Ryan Smith
# github.com/ryan597/Precomputation-of-features--classification
###############################################################################
import numpy as np
import h5py
import os
import sys
import getopt
import json
import pickle
from sklearn.utils import shuffle
from sklearn.linear_model import LogisticRegression
# Random seed
SEED = 46
###############################################################################
def parse_config_name(argv):
    """Return the configuration name given with ``-c`` on the command line.

    Args:
        argv: Command-line arguments without the program name
            (i.e. ``sys.argv[1:]``).

    Returns:
        The value passed to ``-c``; the last one wins if it is repeated.

    Exits:
        With status 2 (after printing the usage line) when the options
        cannot be parsed or ``-c`` is missing, and with the default
        success status after printing the help text for ``-h``.
    """
    usage = 'python train_model.py -c conf_file'
    try:
        # "hc:" -> -h is a bare flag, only -c takes a value.  The former
        # "h:c:" spec made a plain "-h" fail with GetoptError.
        opts, _ = getopt.getopt(argv, "hc:")
    except getopt.GetoptError:
        print(usage)
        sys.exit(2)

    configfile = None
    for opt, arg in opts:
        if opt == '-h':
            print('Example usage: python train_model.py ' +
                  '-c conf_mobilenet')
            sys.exit()
        elif opt == '-c':
            configfile = arg

    if configfile is None:
        # Without -c there is no config to load; fail with the usage line
        # rather than a NameError further down.
        print(usage)
        sys.exit(2)
    return configfile


def load_dataset(path, name='dataset_1'):
    """Read the HDF5 dataset ``name`` from ``path`` into a NumPy array.

    The file is opened read-only and is closed again even if reading fails.
    """
    with h5py.File(path, 'r') as h5f:
        return np.array(h5f[name])


if __name__ == '__main__':
    configfile = parse_config_name(sys.argv[1:])

    # load the user configs from ./conf/<configfile>.json
    config_file = os.path.join(os.getcwd(), 'conf', configfile + '.json')
    with open(config_file) as f:
        config = json.load(f)

    # config variables
    features_path = config["features_path"]
    labels_path = config["labels_path"]
    # results and train_path are only referenced by the disabled evaluation
    # code in the triple-quoted string that follows this block.
    results = config["results"]
    train_path = config["train_path"]
    classifier_path = config["classifier_path"]

    # import features and labels
    features = load_dataset(features_path)
    labels = load_dataset(labels_path)

    # verify the shape of features and labels
    print("features shape: {}".format(features.shape))
    print("labels shape: {}".format(labels.shape))

    print("training started...")
    # shuffle features and labels together; no held-out split is made here,
    # every sample is used for fitting
    (trainData, trainLabels) = shuffle(features, labels, random_state=SEED)

    print("splitted train and test data...")
    print("train data  : {}".format(trainData.shape))
    print("train labels: {}".format(trainLabels.shape))

    # ------------------------------------------------------
    # Model ------------------------------------------------
    # use logistic regression as the model
    print("creating model...")
    # alternative configuration, kept for reference:
    # model = LogisticRegression(C=0.5,
    #                            dual=True,
    #                            solver='liblinear',
    #                            random_state=SEED,
    #                            class_weight='balanced',
    #                            max_iter=100)
    model = LogisticRegression(C=0.5, random_state=SEED,
                               class_weight='balanced', max_iter=1000)
    model.fit(trainData, trainLabels)

    # dump classifier to file; the with-block guarantees the handle is
    # flushed and closed
    print("saving model...")
    with open(classifier_path, 'wb') as model_file:
        pickle.dump(model, model_file)
    # Model ------------------------------------------------
    # ------------------------------------------------------
# Disabled evaluation code: the triple-quoted string below is a bare string
# expression, so nothing inside it is executed.  If re-enabled, it would
# score the fitted `model` on the same shuffled training data (rank-1 and
# rank-5 accuracy plus a classification report written to the `results`
# file) and draw a row-normalised confusion-matrix heatmap whose tick labels
# are the sorted entries of `train_path`.
# NOTE(review): `predictions` holds the argsort over all classes and is never
# sliced to five entries, so `label in predictions` does not restrict the
# check to the top five -- confirm before trusting the Rank-5 figure.
# NOTE(review): comparing `label` with `predictions[0]` presumably assumes
# the labels are integer class indices -- TODO confirm.
"""
# Can check model results on training data
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.metrics import classification_report
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix
# use rank-1 and rank-5 predictions
print ("evaluating model...")
f = open(results, "w")
# rank_1 -> true label is the most likely label
rank_1 = 0
# rank_5 -> true label is in top 5 likely labels
rank_5 = 0
# loop over test data
for (label, features) in zip(trainLabels, trainData):
# predict the probability of each class label and
# take the top-5 class labels
predictions = model.predict_proba(np.atleast_2d(features))[0]
predictions = np.argsort(predictions)[::-1]
# rank-1 prediction increment
if label == predictions[0]:
rank_1 += 1
# rank-5 prediction increment
if label in predictions:
rank_5 += 1
# convert accuracies to percentages
rank_1 = (rank_1 / float(len(trainLabels))) * 100
rank_5 = (rank_5 / float(len(trainLabels))) * 100
# write the accuracies to file
f.write("Rank-1: {:.2f}%\n".format(rank_1))
f.write("Rank-5: {:.2f}%\n\n".format(rank_5))
# evaluate the model of test data
preds = model.predict(trainData)
# write the classification report to file
f.write("{}\n".format(classification_report(trainLabels, preds)))
f.close()
# display the confusion matrix
print ("confusion matrix")
# get the list of training lables
labels = sorted(list(os.listdir(train_path)))
##labels =[t for t in labels if not t.endswith('csv')]
# plot the confusion matrix
cm = confusion_matrix(trainLabels, preds)
cm = cm.astype('float') / cm.sum(axis=1)[:, np.newaxis]
sns.heatmap(cm,
annot=True,
cmap = sns.cubehelix_palette(dark=0, light=1, as_cmap=True))
tick_marks = np.arange(len(labels))+.5
plt.xticks(tick_marks, labels, rotation=45,fontsize=5)
plt.yticks(tick_marks, labels,rotation=45, fontsize=5)
"""