"""
File Summary: This file seeks to automate the tasks of cross validation, hyperparameter tuning, and accuracy evaluation.
"""
# Data preprocessing
import pandas as pd
import numpy as np
# Modeling
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import LinearSVC
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
# Scoring and plotting
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import cross_val_predict
from sklearn.metrics import confusion_matrix
from sklearn.metrics import ConfusionMatrixDisplay
import matplotlib.pyplot as plt


class AutomaticClassification:
    """
    Automates cross validation, hyperparameter tuning, and test-set accuracy
    evaluation for a collection of scikit-learn classifiers.
    """
    def __init__(self, trainX, trainY, testX, testY, nFolds=5):
        self.trainX = trainX
        self.trainY = trainY
        self.testX = testX
        self.testY = testY
        self.nFolds = nFolds

    def runCrossValidation(self, model):
        """Return the mean accuracy of nFolds-fold cross validation on the training set."""
        scores = cross_val_score(model, self.trainX, self.trainY, cv=self.nFolds)
        return scores.mean()

    def createModel(self, modelClass, **kwargs):
        """
        Helper function to create the model for the runCrossValidationAcrossHyperparameters method.
        The parameters of the model (including the hyperparameter) should be passed in here.
        """
        return modelClass(**kwargs)

    def runCrossValidationAcrossHyperparameters(self, modelClass, hyperparamStr, hyperparamValList, modelStr, xAxis, **kwargs):
        """
        Runs cross validation across a list of hyperparameter values for a given model.
        The fixed parameters for the model class should be given in kwargs; the values to try
        for the hyperparameter named hyperparamStr are given in hyperparamValList.
        Saves a plot of accuracy versus the hyperparameter and returns the list of accuracies.
        """
        accuracyList = []
        for val in hyperparamValList:
            # Add the hyperparameter value to the keyword dictionary.
            kwargs[hyperparamStr] = val
            model = self.createModel(modelClass, **kwargs)
            accuracy = self.runCrossValidation(model)
            accuracyList.append(accuracy)
        # Plot cross validated accuracy against the hyperparameter values and save the figure.
        plt.figure(figsize=(20, 10))
        plt.plot(hyperparamValList, accuracyList, color='red')
        plt.xlabel(xAxis, fontsize=15)
        plt.ylabel('Accuracy', fontsize=15)
        plt.title('{} {}-Fold Cross Validated Accuracy by Hyperparameter {}'.format(modelStr, self.nFolds, xAxis), fontsize=20)
        plt.savefig(modelStr + ' Cross Validation Plot.png', bbox_inches='tight')
        plt.close()  # Close the figure so repeated calls do not accumulate open figures.
        return accuracyList
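
    # Example call for the method above (a sketch with hypothetical values; assumes an
    # already-constructed instance named ac):
    #   accList = ac.runCrossValidationAcrossHyperparameters(
    #       DecisionTreeClassifier, 'max_depth', [2, 4, 8, 16], 'Decision Tree', 'Max Depth',
    #       criterion='entropy')
    # Extra keyword arguments (here criterion='entropy') are forwarded unchanged to the model class.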

    def getBestHyperparameter(self, hyperparamValList, accuracyList):
        """
        Function returns the best hyperparameter value and top accuracy from CV
        as (hyperparameter, accuracy).
        """
        bestHyper = hyperparamValList[np.argmax(accuracyList)]
        bestAccuracy = np.max(accuracyList)
        return bestHyper, bestAccuracy

    def getAccuracyOnTestSet(self, model):
        """Fits the model on the full training set and returns its accuracy on the test set."""
        model.fit(self.trainX, self.trainY)
        return model.score(self.testX, self.testY)

    def logisticRegression(self):
        print('Now fitting logistic regression')
        hyperparamValList = np.linspace(0.01, 2, 10)
        accList = self.runCrossValidationAcrossHyperparameters(LogisticRegression, 'C', hyperparamValList,
                                                               'Logistic Regression', 'C',
                                                               penalty='l2', max_iter=100, n_jobs=-1)
        bestCVHyper, bestCVAcc = self.getBestHyperparameter(hyperparamValList, accList)
        model = LogisticRegression(penalty='l2', C=bestCVHyper)
        testAcc = self.getAccuracyOnTestSet(model)
        return ['Logistic Regression', 'C', bestCVHyper, bestCVAcc, testAcc]

    def decisionTree(self):
        print('Now fitting decision tree')
        hyperparamValList = list(range(1, 40, 5))
        accList = self.runCrossValidationAcrossHyperparameters(DecisionTreeClassifier, 'max_depth', hyperparamValList,
                                                               'Decision Tree', 'Max Depth')
        bestCVHyper, bestCVAcc = self.getBestHyperparameter(hyperparamValList, accList)
        model = DecisionTreeClassifier(max_depth=bestCVHyper)
        testAcc = self.getAccuracyOnTestSet(model)
        return ['Decision Tree', 'Max depth', bestCVHyper, bestCVAcc, testAcc]

    def kNearestNeighbors(self, nNeighbors=None):
        """
        If the hyperparameter nNeighbors is set to a value, then hyperparameter tuning through CV will not be run.
        """
        print('Now fitting k-nearest-neighbors')
        # Tune hyperparameters if desired
        if nNeighbors is None:
            hyperparamValList = list(range(1, 100, 10))
            accList = self.runCrossValidationAcrossHyperparameters(KNeighborsClassifier, 'n_neighbors',
                                                                   hyperparamValList, 'K Nearest Neighbors', 'N-Neighbors')
            nNeighbors, bestCVAcc = self.getBestHyperparameter(hyperparamValList, accList)
        else:
            bestCVAcc = '-'
        # Calculate accuracy on the test set
        model = KNeighborsClassifier(n_neighbors=nNeighbors)
        testAcc = self.getAccuracyOnTestSet(model)
        return ['KNN', 'n_neighbors', nNeighbors, bestCVAcc, testAcc]

    def supportVectorMachine(self):
        print('Now fitting support vector machine')
        hyperparamValList = np.linspace(0.01, 10, 10)
        accList = self.runCrossValidationAcrossHyperparameters(LinearSVC, 'C', hyperparamValList,
                                                               'Support Vector Machine', 'C')
        bestCVHyper, bestCVAcc = self.getBestHyperparameter(hyperparamValList, accList)
        model = LinearSVC(C=bestCVHyper)
        testAcc = self.getAccuracyOnTestSet(model)
        return ['Support Vector Machine', 'C', bestCVHyper, bestCVAcc, testAcc]

    def randomForest(self):
        """
        Random forests are generally robust to the number of trees: adding more trees
        does not tend to increase overfitting. No hyperparameter tuning via cross
        validation is performed here; the model is fit with 300 trees and scored
        directly on the test set.
        """
        print('Now fitting random forest classifier')
        model = RandomForestClassifier(n_estimators=300)
        testAcc = self.getAccuracyOnTestSet(model)
        return ['Random Forest', 'n_estimators', '-', '-', testAcc]

    def gradientBoosting(self):
        print('Now fitting gradient boosting classifier')
        hyperparamValList = list(range(50, 510, 100))
        accList = self.runCrossValidationAcrossHyperparameters(GradientBoostingClassifier, 'n_estimators', hyperparamValList,
                                                               'Gradient Boosting Machine', 'N-Estimators')
        bestCVHyper, bestCVAcc = self.getBestHyperparameter(hyperparamValList, accList)
        model = GradientBoostingClassifier(n_estimators=bestCVHyper)
        testAcc = self.getAccuracyOnTestSet(model)
        return ['Gradient Boosting', 'n_estimators', bestCVHyper, bestCVAcc, testAcc]

    def main(self, classifierArray=(True, True, True, True, True, True)):
        """
        Runs cross validation, chooses the best hyperparameter, and then evaluates
        each selected model on the test set. It reports the best cross validated
        accuracy and the test accuracy for each model.
        Use the 'classifierArray' parameter to decide which models to run. It is a
        boolean array with one entry per model (True to run it, False to skip it),
        ordered as: logistic regression, decision tree, k-nearest neighbors,
        support vector machine, random forest, gradient boosting.
        """
        # Fit the desired models.
        methodArray = np.array([self.logisticRegression, self.decisionTree, self.kNearestNeighbors,
                                self.supportVectorMachine, self.randomForest, self.gradientBoosting])
        # Convert classifierArray to a boolean NumPy array so it is interpreted as a mask
        # (a plain tuple of booleans is not treated as a boolean mask by NumPy indexing).
        mask = np.array(classifierArray, dtype=bool)
        allRows = []
        for method in methodArray[mask]:
            row = method()
            allRows.append(row)
        # Create a dataframe of the results.
        columns = ['Model', 'Hyperparam', 'Hyperparam val', 'CV Accuracy', 'Test Accuracy']
        reportDf = pd.DataFrame(data=allRows, columns=columns)
        reportDf.set_index('Model', inplace=True)
        return reportDf
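

# A minimal usage sketch for this module. The file path 'data.csv' and the label column
# name 'label' are hypothetical placeholders, not part of this project; substitute your
# own dataset before running.
if __name__ == '__main__':
    from sklearn.model_selection import train_test_split

    # Load a feature matrix and label vector (hypothetical file and column names).
    df = pd.read_csv('data.csv')
    X = df.drop(columns=['label'])
    y = df['label']

    # Hold out a test set for the final accuracy comparison.
    trainX, testX, trainY, testY = train_test_split(X, y, test_size=0.2, random_state=42)

    # Run every classifier with 5-fold cross validation and print the summary table.
    ac = AutomaticClassification(trainX, trainY, testX, testY, nFolds=5)
    reportDf = ac.main(classifierArray=(True, True, True, True, True, True))
    print(reportDf)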