-
Notifications
You must be signed in to change notification settings - Fork 0
/
svm_final.py
90 lines (83 loc) · 3.68 KB
/
svm_final.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
import pandas as pd
import numpy as np
import random as rd
from sklearn.svm import SVC
from sklearn import metrics
from sklearn.metrics import accuracy_score
# Load the CSV, then drop NAs and other [currently] irrelevant (non-numeric) features
def preprocess(filename, max_rows=30000):
    """Load ``<filename>.csv`` and reduce it to the columns kept for modelling.

    Columns named in ``unnecessary`` are removed when present, rows with any
    missing value are dropped, and the result is capped at the first
    ``max_rows`` rows.

    Args:
        filename: Path of the CSV file *without* the ``.csv`` extension.
        max_rows: Maximum number of rows to keep. Defaults to 30000 (the
            previously hard-coded cap); pass ``None`` to keep every row.

    Returns:
        A pandas DataFrame holding the remaining columns and only complete
        rows, in their original order.
    """
    df = pd.read_csv(filename + '.csv', sep=",")
    # removing unnecessary columns. keeping only numbers atm
    unnecessary = [
        'Body', 'ClosedDate', 'CommunityOwnedDate', 'CreationDate', 'Id',
        'LastActivityDate', 'LastEditDate', 'LastEditorUserId',
        'LastEditorDisplayName', 'OwnerDisplayName', 'OwnerUserId',
        'ParentId', 'Tags', 'Title', 'Clean_Text', 'AcceptedAnswerId',
        'Score', 'AnswerCount', 'CommentCount', 'FavoriteCount',
        'PostTypeId', 'ViewCount',
    ]
    # Keyword form is required: pandas 2.0 no longer accepts the axis as a
    # positional argument. errors='ignore' skips listed columns that the CSV
    # does not contain, so no explicit intersection is needed.
    df = df.drop(columns=unnecessary, errors='ignore')
    df = df.dropna()
    if max_rows is not None and df.shape[0] > max_rows:
        df = df.iloc[:max_rows]
    return df
# Split Data Into Train and Test Data
def splitIntoTestAndTrain(dataFrame, splitBy):
    """Randomly partition the rows of ``dataFrame`` into two frames.

    Row positions are shuffled with the module-level ``random`` generator;
    the first ``int(len(dataFrame) * splitBy)`` shuffled positions form the
    training frame and the rest form the test frame.

    Args:
        dataFrame: The pandas DataFrame to split.
        splitBy: Fraction of rows assigned to the training frame.

    Returns:
        A ``(train_df, test_df)`` tuple of DataFrames.
    """
    n_rows = len(dataFrame)
    positions = list(range(n_rows))
    rd.shuffle(positions)
    cut = int(n_rows * splitBy)
    train_part = dataFrame.iloc[positions[:cut], :]
    test_part = dataFrame.iloc[positions[cut:], :]
    return train_part, test_part
#Best SVM Classifier
from sklearn import svm, datasets
from sklearn.model_selection import GridSearchCV
from sklearn import svm, grid_search
def svc_param_selection(X, y, nfolds):
    """Pick an SVC configuration by cross-validated grid search.

    Two grids are searched: an RBF kernel over ``gamma`` in {1e-3, 1e-4} and
    a linear kernel, both over ``C`` in 10**-2 .. 10**1.

    Args:
        X: Training features.
        y: Training labels.
        nfolds: Value passed to ``GridSearchCV`` as ``cv``.

    Returns:
        The ``best_estimator_`` of the fitted search. The SVC is built with
        ``probability=True`` so callers can use ``predict_proba`` on it.
    """
    c_values = [10.**i for i in np.arange(-2, 2)]
    rbf_grid = {'kernel': ['rbf'], 'gamma': [1e-3, 1e-4], 'C': c_values}
    linear_grid = {'kernel': ['linear'], 'C': c_values}
    search = GridSearchCV(
        svm.SVC(probability=True),
        [rbf_grid, linear_grid],
        cv=nfolds,
    )
    search.fit(X, y)
    return search.best_estimator_
def reportCard(y_true, y_pred, y_proba, name):
    """Print evaluation metrics and show the ROC curve for a two-class model.

    Args:
        y_true: True labels.
        y_pred: Predicted labels.
        y_proba: Predicted class probabilities, indexable as
            ``y_proba[:, 1]``; column 1 is used as the score for the ROC
            curve and the AUC.
        name: Label used in the report heading, the plot title and the
            legend entry.

    Returns:
        None. Output goes to stdout and to a matplotlib window.
    """
    # No `%matplotlib inline` here: it is IPython-only syntax and a
    # SyntaxError in a plain .py module. Notebooks should run it in a cell.
    from sklearn import metrics
    import matplotlib.pyplot as plt

    print('\nReport for:', name)
    target_names = ['low', 'high']
    print(metrics.classification_report(y_true, y_pred, target_names=target_names))
    print("Confusion Matrix\n", metrics.confusion_matrix(y_true, y_pred))
    print("F1 Score:", metrics.f1_score(y_true, y_pred))
    print("Accuracy:", metrics.accuracy_score(y_true, y_pred))
    # Log loss is defined on predicted probabilities; feeding it the hard
    # 0/1 predictions only yields a clipped, accuracy-like number.
    print("Log Loss:", metrics.log_loss(y_true, y_proba))

    positive_scores = y_proba[:, 1]
    print("AUC Score:", metrics.roc_auc_score(y_true, positive_scores))
    fpr, tpr, _ = metrics.roc_curve(y_true, positive_scores)

    plt.figure(1)
    plt.plot([0, 1], [0, 1], 'k--')
    # The curve needs a label, otherwise legend() has nothing to show.
    plt.plot(fpr, tpr, label=name)
    plt.xlabel('False positive rate')
    plt.ylabel('True positive rate')
    plt.title('ROC curve for {}'.format(name))
    plt.legend(loc='best')
    plt.show()
def svm_run(filename):
    """Train, select and evaluate an SVM on the dataset ``<filename>.csv``.

    Loads the data with ``preprocess``, makes an 80/20 train/test split,
    selects an SVC with ``svc_param_selection`` using 5 folds, and prints the
    evaluation of the selected model on the test split via ``reportCard``.

    Args:
        filename: Dataset path without the ``.csv`` extension. The data must
            contain a ``ScoreLabel`` column, which is used as the target.

    Returns:
        None. All results are printed or plotted.
    """
    print("\nRunning for", filename)
    df = preprocess(filename)
    print("dataset size is:", df.shape)
    print("columns:", df.columns)
    train, test = splitIntoTestAndTrain(df, 0.8)
    print("\nSVM")

    # Keyword form is required: pandas 2.0 no longer accepts the axis as a
    # positional argument to drop(). Build each feature frame once.
    train_features = train.drop(columns='ScoreLabel')
    test_features = test.drop(columns='ScoreLabel')

    best_svm = svc_param_selection(train_features, train['ScoreLabel'], 5)
    grid_svm_pred = best_svm.predict(test_features)
    grid_svm_pred_prob = best_svm.predict_proba(test_features)

    print("*****BOF: {}*****".format(filename))
    reportCard(test['ScoreLabel'], grid_svm_pred, grid_svm_pred_prob, 'GridSearched SVM')
    print("*****EOF: {}*****".format(filename))
if __name__ == "__main__":
    # The previous body used `clf` and `test`, neither of which exists at
    # module scope, so running the script raised NameError. Drive the
    # pipeline through svm_run() instead, once per dataset named on the
    # command line (names are given without the .csv extension).
    import sys

    dataset_names = sys.argv[1:]
    if not dataset_names:
        sys.exit("usage: python svm_final.py DATASET [DATASET ...]  "
                 "(dataset names without the .csv extension)")
    for dataset_name in dataset_names:
        svm_run(dataset_name)