-
Notifications
You must be signed in to change notification settings - Fork 0
/
Credit_Card_Fraud_Detection_Python.py
635 lines (485 loc) · 25.8 KB
/
Credit_Card_Fraud_Detection_Python.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
# -*- coding: utf-8 -*-
"""PRML_Minor_Project_B21CS076_B21EE012.ipynb
Automatically generated by Colaboratory.
Original file is located at
https://colab.research.google.com/drive/1qGqgWIhQPw8fAv5hkQ0EsGShnD58--QE
# Minor Project : Tarun Raj Singh (B21CS076) and Aryan Himmatlal Prajapati (B21EE012)
<h3> (PROJECT 2) TOPIC : CREDIT CARD FRAUD DETECTION </h3>
"""
# Importing Necessary Libraries and Modules
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import train_test_split
from sklearn.ensemble import AdaBoostClassifier
from sklearn.metrics import classification_report
from imblearn.over_sampling import SMOTE
from xgboost import XGBClassifier
import lightgbm
from sklearn.metrics import confusion_matrix as cm
from sklearn.metrics import f1_score as f1
from sklearn.metrics import recall_score as rs
from sklearn.metrics import precision_score as ps
from sklearn.metrics import ConfusionMatrixDisplay
from sklearn.metrics import accuracy_score as acc
from tabulate import tabulate
import warnings
warnings.filterwarnings('ignore')
# Load the transactions dataset from the local CSV file into a DataFrame.
df = pd.read_csv('creditcard.csv')
df  # bare expression kept from the notebook export (renders the frame in a notebook cell)
# Tally how many rows carry each value of the 'Class' column.
num0 = df['Class'].value_counts()
not_fraud_total = num0[0]
fraud_total = num0[1]
print("Number of Instances of Class 0 ( not fraud ) : ", not_fraud_total)
print("Number of Instances of Class 1 ( fraud ) : ", fraud_total)
print("_______________________________________________________")
# Ratio of class-0 rows to class-1 rows.
print("Highly Imbalanced by ratio : ", not_fraud_total / fraud_total)
"""<h5> Hence we can see that the dataset is highly Imbalanced as Instances in Class 0 >>>> Instances in Class 1 </h5>"""
# Bar chart of the per-class row counts held in `num0`.
x = ["0 : Not Fraud", "1 : Fraud"]
fig, ax = plt.subplots(figsize=(7, 4.50))
bars = ax.bar([0, 1], num0, tick_label=x, alpha=0.75)

# Hide the top and right frame lines and add light grid lines on the y axis.
for side in ('top', 'right'):
    ax.spines[side].set_visible(False)
ax.yaxis.grid(True, color='#EEEEEE')
bar_color = bars[0].get_facecolor()

# Write each bar's height as a label centred just above the bar.
for rect in bars:
    height = rect.get_height()
    ax.text(
        rect.get_x() + rect.get_width() / 2,
        height + 8000,
        round(height, 1),
        horizontalalignment='center',
        color="dimgrey",
        weight='bold',
    )

# Axis labels and title share the same colour and weight.
axis_text = dict(labelpad=15, color='#333333', weight='bold')
ax.set_xlabel('Class', **axis_text)
ax.set_ylabel('Number of Instances', **axis_text)
ax.set_title('Distribution of Data', pad=15, color='#333333', weight='bold')

# Arrow annotations pointing at the small and the large bar.
low_point = (1, 20000)
high_point = (0.18, 290000)
plt.annotate(
    'Extremely Less instances',
    xy=low_point,
    xytext=(1.2, 100000),
    arrowprops=dict(facecolor='red', shrink=0.05),
)
plt.annotate(
    'Extremely High instances',
    xy=high_point,
    xytext=(0.5, 250000),
    arrowprops=dict(facecolor='red', shrink=0.05),
)
"""# If we apply any classifier Algorithm on this highly imbalanced dataset :
<h3> Using Adaboost </h3>
"""
# Labels are the 'Class' column; features are every other column.
y_data = df['Class']
x_data = df.drop(columns=['Class'])
# Train/test split, stratified on the label and seeded.
x_train, x_test, y_train, y_test = train_test_split(
    x_data, y_data, stratify=y_data, random_state=100
)
# Fit AdaBoost on the training split exactly as it is (no resampling).
Ada_clf = AdaBoostClassifier(random_state=42)
Ada_clf.fit(x_train, y_train)
# Predicted labels for the test split.
y_preds = Ada_clf.predict(x_test)
# Per-class precision / recall / F1 report.
class_names = ["0 : Not Fraud", "1 : Fraud"]
print(classification_report(y_test, y_preds, target_names=class_names))
# Confusion matrix for AdaBoost on the imbalanced data.
cf_mat_ada_imb = cm(y_test, y_preds, labels=[0, 1])
disp_ada_imb = ConfusionMatrixDisplay(
    confusion_matrix=cf_mat_ada_imb,
    display_labels=["0 : Not Fraud", "1 : Fraud"],
)
disp_ada_imb.plot()
"""<h5> From the confusion matrix it is clear that the model performs very poorly on the Class 1 (fraud) instances, due to the high imbalance in the data. </h5>
<h3> Using XGBoost </h3>
"""
# Fit an XGBoost classifier on the same imbalanced training split.
XGB_clf = XGBClassifier(random_state=42)
XGB_clf.fit(x_train, y_train)
# Predicted labels for the test split.
y_preds_xgb = XGB_clf.predict(x_test)
# Per-class precision / recall / F1 report.
print(classification_report(y_test, y_preds_xgb, target_names=class_names))
# Confusion matrix for XGBoost on the imbalanced data.
cf_mat_xgb_imb = cm(y_test, y_preds_xgb, labels=[0, 1])
disp_xgb_imb = ConfusionMatrixDisplay(
    confusion_matrix=cf_mat_xgb_imb,
    display_labels=["0 : Not Fraud", "1 : Fraud"],
)
disp_xgb_imb.plot()
"""<h5> Here also we can clearly see from the confusion matrix that the model is very poor in classifying the class 1 (Fraud) Instances </h5>
<h3> Using LGBM </h3>
"""
# LightGBM datasets built from the imbalanced train/test split.
# NOTE(review): `params` sets no 'objective', so LightGBM uses its default
# objective (regression, per the LightGBM parameter docs) - confirm whether
# 'binary' was intended. Behaviour is left unchanged here.
params = {'num_leaves': 50}
train_data_imb = lightgbm.Dataset(x_train, label=y_train)
test_data_imb = lightgbm.Dataset(x_test, label=y_test)
# Train the LightGBM model on the imbalanced training data.
LGBM_imb = lightgbm.train(params, train_data_imb)
# Raw (continuous) model outputs for the test split.
LGBM_preds = LGBM_imb.predict(x_test)
# Map each raw output to a class label in one vectorized step:
# values <= 0.5 become 0, everything else becomes 1.
lgbm_preds_imb = np.where(np.asarray(LGBM_preds) <= 0.5, 0, 1)
# Per-class precision / recall / F1 report.
print(classification_report(y_true=y_test, y_pred=lgbm_preds_imb, target_names=class_names))
# Confusion matrix for LGBM on the imbalanced data.
cf_mat_lgbm_imb = cm(y_test, lgbm_preds_imb, labels=[0, 1])
disp_lgbm_imb = ConfusionMatrixDisplay(
    confusion_matrix=cf_mat_lgbm_imb,
    display_labels=["0 : Not Fraud", "1 : Fraud"],
)
disp_lgbm_imb.plot()
"""<h5> As we can see that the overall Accuracy of all these models using the Imbalanced data is close to 1 , still the Precision , Recall and F1 score for class 1 is not good enough , this is due to the highly imbalanced data which is biased towards predicting class 0 , To improve this , we can apply various Imbalance handling algorithms.</h5>
# Applying Imbalance handling methods using Adaboost :
<h2> 1. UnderSampling </h2>
"""
# Separate the rows of each class.
cl_0_new1 = df[df['Class'] == 0]
cl_1_new1 = df[df['Class'] == 1]
# Undersample the majority class (class 0) down to the class-1 row count.
# The draw is seeded so that, like the split and the classifier below,
# it gives the same subset on every run.
cl_0_new1 = cl_0_new1.sample(n=cl_1_new1.shape[0], random_state=42)
# Stack the undersampled class 0 rows on top of all class 1 rows, giving a
# dataset with the same number of rows in each class.
new_df1 = pd.concat([cl_0_new1, cl_1_new1], axis=0)
# Features / labels and a stratified, seeded train/test split.
x_data1 = new_df1.drop(columns=['Class'])
y_data1 = new_df1['Class']
x_train1, x_test1, y_train1, y_test1 = train_test_split(
    x_data1, y_data1, random_state=42, stratify=y_data1
)
# Fit AdaBoost on the undersampled training split.
Ada_under_1 = AdaBoostClassifier(random_state=100)
Ada_under_1.fit(x_train1, y_train1)
# Predicted labels for the undersampled test split.
y_preds_under1 = Ada_under_1.predict(x_test1)
# Per-class precision / recall / F1 report.
print(classification_report(y_true=y_test1, y_pred=y_preds_under1, target_names=class_names))
# Confusion matrix for AdaBoost on the undersampled data.
cf_mat_under = cm(y_test1, y_preds_under1, labels=[0, 1])
disp = ConfusionMatrixDisplay(
    confusion_matrix=cf_mat_under,
    display_labels=["0 : Not Fraud", "1 : Fraud"],
)
disp.plot()
"""<h2> 2. OverSampling </h2> """
# Separate the rows of each class.
cl_0_new2 = df[df['Class'] == 0]
cl_1_new2 = df[df['Class'] == 1]
# Oversample the minority class (class 1) with replacement up to the class-0
# row count. The draw is seeded so that, like the split and the classifier
# below, it is reproducible from run to run.
cl_new_1 = cl_1_new2.sample(n=cl_0_new2.shape[0], replace=True, random_state=42)
# Stack all class 0 rows on top of the oversampled class 1 rows, giving a
# dataset with the same number of rows in each class.
new_df2 = pd.concat([cl_0_new2, cl_new_1], axis=0)
# Features / labels and a stratified, seeded train/test split.
x_data2 = new_df2.drop(columns=['Class'])
y_data2 = new_df2['Class']
# NOTE(review): class 1 rows are duplicated *before* this split, so copies of
# the same row can end up in both the training and the test portion. Scores on
# x_test2 are therefore optimistic; consider splitting first and oversampling
# only the training portion.
x_train2, x_test2, y_train2, y_test2 = train_test_split(
    x_data2, y_data2, random_state=42, stratify=y_data2
)
# Fit AdaBoost on the oversampled training split.
Ada_over_1 = AdaBoostClassifier(random_state=100)
Ada_over_1.fit(x_train2, y_train2)
# Predicted labels for the oversampled test split.
y_preds_over1 = Ada_over_1.predict(x_test2)
# Per-class precision / recall / F1 report.
print(classification_report(y_true=y_test2, y_pred=y_preds_over1, target_names=class_names))
# Confusion matrix for AdaBoost on the oversampled data.
cf_mat_over = cm(y_test2, y_preds_over1, labels=[0, 1])
disp_over = ConfusionMatrixDisplay(
    confusion_matrix=cf_mat_over,
    display_labels=["0 : Not Fraud", "1 : Fraud"],
)
disp_over.plot()
"""<h2> 3. Synthetic Minority Oversampling Technique (SMOTE) </h2>"""
# Generate synthetic rows for the minority class with SMOTE so both classes
# end up with the same number of rows. The sampler is seeded so that, like
# the split and the classifier below, it is reproducible from run to run.
smote_over = SMOTE(sampling_strategy='minority', random_state=42)
x_syn, y_syn = smote_over.fit_resample(x_data, y_data)
# NOTE(review): resampling runs on the full data *before* the split, so the
# test portion contains synthetic rows derived from rows that may sit in the
# training portion. Scores on x_test_syn are therefore optimistic; consider
# splitting first and resampling only the training portion.
x_train_syn, x_test_syn, y_train_syn, y_test_syn = train_test_split(
    x_syn, y_syn, random_state=42, stratify=y_syn
)
# Fit AdaBoost on the SMOTE-resampled training split.
Ada_syn_over = AdaBoostClassifier(random_state=100)
Ada_syn_over.fit(x_train_syn, y_train_syn)
# Predicted labels for the SMOTE-resampled test split.
y_preds_syn = Ada_syn_over.predict(x_test_syn)
# Per-class precision / recall / F1 report.
print(classification_report(y_true=y_test_syn, y_pred=y_preds_syn, target_names=class_names))
# Confusion matrix for AdaBoost on the SMOTE-resampled data.
cf_mat_syn = cm(y_test_syn, y_preds_syn, labels=[0, 1])
disp_syn = ConfusionMatrixDisplay(
    confusion_matrix=cf_mat_syn,
    display_labels=["0 : Not Fraud", "1 : Fraud"],
)
disp_syn.plot()
"""# Applying Imbalance handling methods using XGBoost :
<h2>1. Under Sampling</h2>
"""
# Fit XGBoost on the undersampled training split.
XGB_under = XGBClassifier(random_state=2)
XGB_under.fit(x_train1, y_train1)
# Predicted labels for the undersampled test split.
XGB_under_preds = XGB_under.predict(x_test1)
# Per-class precision / recall / F1 report.
print(classification_report(y_test1, XGB_under_preds, target_names=class_names))
# Confusion matrix for XGBoost on the undersampled data.
cf_mat_under2 = cm(y_test1, XGB_under_preds, labels=[0, 1])
disp_under2 = ConfusionMatrixDisplay(
    confusion_matrix=cf_mat_under2,
    display_labels=["0 : Not Fraud", "1 : Fraud"],
)
disp_under2.plot()
"""<h2>2. OverSampling </h2>"""
# Fit XGBoost on the oversampled training split.
XGB_over = XGBClassifier(random_state=42)
XGB_over.fit(x_train2, y_train2)
# Predicted labels for the oversampled test split.
XGB_over_preds = XGB_over.predict(x_test2)
# Per-class precision / recall / F1 report.
print(classification_report(y_true=y_test2, y_pred=XGB_over_preds, target_names=class_names))
# Confusion matrix for XGBoost on the oversampled data.
cf_mat_over2 = cm(y_test2, XGB_over_preds, labels=[0, 1])
disp_over2 = ConfusionMatrixDisplay(
    confusion_matrix=cf_mat_over2,
    display_labels=["0 : Not Fraud", "1 : Fraud"],
)
disp_over2.plot()
# Count both kinds of misclassification with boolean masks. The labels are
# converted to an array once, instead of once per row inside a Python loop.
true_over_xgb = np.asarray(y_test2)
cnt = int(np.sum((XGB_over_preds == 1) & (true_over_xgb == 0)))   # not fraud predicted as fraud
cnt2 = int(np.sum((XGB_over_preds == 0) & (true_over_xgb == 1)))  # fraud predicted as not fraud
# Denominators are taken from the test labels themselves so the printed
# totals always match the data being evaluated.
n_not_fraud_over_xgb = int(np.sum(true_over_xgb == 0))
n_fraud_over_xgb = int(np.sum(true_over_xgb == 1))
print(f"Not Fraud Instances Predicted as Fraud (Out of {n_not_fraud_over_xgb:,}) : ", cnt)
print(f"Fraud Instances Predicted as Not Fraud (Out of {n_fraud_over_xgb:,}) : ", cnt2)
print("_________________________________________________________________")
print("")
print(f"Total Misclassified Instances (Out of {true_over_xgb.size:,}) : ", cnt + cnt2)
print("_________________________________________________________________")
"""<h2> 3. Synthetic Minority Oversampling Technique (SMOTE) </h2>"""
# Fit XGBoost on the SMOTE-resampled training split.
XGB_smote = XGBClassifier(random_state=2)
XGB_smote.fit(x_train_syn, y_train_syn)
# Predicted labels for the SMOTE-resampled test split.
XGB_syn_preds = XGB_smote.predict(x_test_syn)
# Per-class precision / recall / F1 report.
print(classification_report(y_true=y_test_syn, y_pred=XGB_syn_preds, target_names=class_names))
# Confusion matrix for XGBoost on the SMOTE-resampled data.
cf_mat_syn2 = cm(y_test_syn, XGB_syn_preds, labels=[0, 1])
disp_syn2 = ConfusionMatrixDisplay(
    confusion_matrix=cf_mat_syn2,
    display_labels=["0 : Not Fraud", "1 : Fraud"],
)
disp_syn2.plot()
# Count both kinds of misclassification with boolean masks. The labels are
# converted to an array once, instead of once per row inside a Python loop.
true_syn_xgb = np.asarray(y_test_syn)
cnt_xg_2 = int(np.sum((XGB_syn_preds == 1) & (true_syn_xgb == 0)))   # not fraud predicted as fraud
cnt2_xg_2 = int(np.sum((XGB_syn_preds == 0) & (true_syn_xgb == 1)))  # fraud predicted as not fraud
# Denominators are taken from the test labels themselves so the printed
# totals always match the data being evaluated.
n_not_fraud_syn_xgb = int(np.sum(true_syn_xgb == 0))
n_fraud_syn_xgb = int(np.sum(true_syn_xgb == 1))
print(f"Not Fraud Instances Predicted as Fraud (Out of {n_not_fraud_syn_xgb:,}) : ", cnt_xg_2)
print(f"Fraud Instances Predicted as Not Fraud (Out of {n_fraud_syn_xgb:,}) : ", cnt2_xg_2)
print("_________________________________________________________________")
print("")
print(f"Total Misclassified Instances (Out of {true_syn_xgb.size:,}) : ", cnt_xg_2 + cnt2_xg_2)
print("_________________________________________________________________")
"""# Applying Imbalance handling methods using LGBM :
<h2>1. Under Sampling</h2>
"""
# LightGBM datasets built from the undersampled train/test split.
# NOTE(review): `params` sets no 'objective', so LightGBM uses its default
# objective (regression, per the LightGBM parameter docs) - confirm whether
# 'binary' was intended. Behaviour is left unchanged here.
params = {'num_leaves': 50}
train_data_under = lightgbm.Dataset(x_train1, label=y_train1)
test_data_under = lightgbm.Dataset(x_test1, label=y_test1)
# Train the LightGBM model on the undersampled training data.
LGBM_under = lightgbm.train(params, train_data_under)
# Raw (continuous) model outputs for the undersampled test split.
LGBM_under_preds = LGBM_under.predict(x_test1)
# Map each raw output to a class label in one vectorized step:
# values <= 0.5 become 0, everything else becomes 1.
lgbm_preds_under_fin = np.where(np.asarray(LGBM_under_preds) <= 0.5, 0, 1)
# Per-class precision / recall / F1 report.
print(classification_report(y_true=y_test1, y_pred=lgbm_preds_under_fin, target_names=class_names))
# Confusion matrix for LGBM on the undersampled data.
cf_mat_under3 = cm(y_test1, lgbm_preds_under_fin, labels=[0, 1])
disp_under3 = ConfusionMatrixDisplay(
    confusion_matrix=cf_mat_under3,
    display_labels=["0 : Not Fraud", "1 : Fraud"],
)
disp_under3.plot()
"""<h2>2. OverSampling </h2>"""
# LightGBM datasets built from the oversampled train/test split.
# NOTE(review): `params` sets no 'objective', so LightGBM uses its default
# objective (regression, per the LightGBM parameter docs) - confirm whether
# 'binary' was intended. Behaviour is left unchanged here.
params = {'num_leaves': 50}
train_data_over = lightgbm.Dataset(x_train2, label=y_train2)
test_data_over = lightgbm.Dataset(x_test2, label=y_test2)
# Train the LightGBM model on the oversampled training data.
LGBM_over = lightgbm.train(params, train_data_over)
# Raw (continuous) model outputs for the oversampled test split.
LGBM_over_preds = LGBM_over.predict(x_test2)
# Map each raw output to a class label in one vectorized step:
# values <= 0.5 become 0, everything else becomes 1.
lgbm_preds_over_fin = np.where(np.asarray(LGBM_over_preds) <= 0.5, 0, 1)
# Per-class precision / recall / F1 report.
print(classification_report(y_true=y_test2, y_pred=lgbm_preds_over_fin, target_names=class_names))
# Confusion matrix for LGBM on the oversampled data.
cf_mat_over3 = cm(y_test2, lgbm_preds_over_fin, labels=[0, 1])
disp_over3 = ConfusionMatrixDisplay(
    confusion_matrix=cf_mat_over3,
    display_labels=["0 : Not Fraud", "1 : Fraud"],
)
disp_over3.plot()
# Count both kinds of misclassification with boolean masks. The labels are
# converted to an array once, instead of once per row inside a Python loop.
true_over_lgbm = np.asarray(y_test2)
cnt_lgbm = int(np.sum((lgbm_preds_over_fin == 1) & (true_over_lgbm == 0)))   # not fraud predicted as fraud
cnt2_lgbm = int(np.sum((lgbm_preds_over_fin == 0) & (true_over_lgbm == 1)))  # fraud predicted as not fraud
# Denominators are taken from the test labels themselves so the printed
# totals always match the data being evaluated.
n_not_fraud_over_lgbm = int(np.sum(true_over_lgbm == 0))
n_fraud_over_lgbm = int(np.sum(true_over_lgbm == 1))
print(f"Not Fraud Instances Predicted as Fraud (Out of {n_not_fraud_over_lgbm:,}) : ", cnt_lgbm)
print(f"Fraud Instances Predicted as Not Fraud (Out of {n_fraud_over_lgbm:,}) : ", cnt2_lgbm)
print("_________________________________________________________________")
print("")
print(f"Total Misclassified Instances (Out of {true_over_lgbm.size:,}) : ", cnt_lgbm + cnt2_lgbm)
print("_________________________________________________________________")
"""<h2> 3. Synthetic Minority Oversampling Technique (SMOTE) </h2>"""
# LightGBM datasets built from the SMOTE-resampled train/test split.
# NOTE(review): `params` sets no 'objective', so LightGBM uses its default
# objective (regression, per the LightGBM parameter docs) - confirm whether
# 'binary' was intended. Behaviour is left unchanged here.
params = {'num_leaves': 50}
train_data_syn = lightgbm.Dataset(x_train_syn, label=y_train_syn)
test_data_syn = lightgbm.Dataset(x_test_syn, label=y_test_syn)
# Train the LightGBM model on the SMOTE-resampled training data.
LGBM_syn = lightgbm.train(params, train_data_syn)
# Raw (continuous) model outputs for the SMOTE-resampled test split.
LGBM_syn_preds = LGBM_syn.predict(x_test_syn)
# Map each raw output to a class label in one vectorized step:
# values <= 0.5 become 0, everything else becomes 1.
lgbm_preds_syn_fin = np.where(np.asarray(LGBM_syn_preds) <= 0.5, 0, 1)
# Per-class precision / recall / F1 report.
print(classification_report(y_true=y_test_syn, y_pred=lgbm_preds_syn_fin, target_names=class_names))
# Confusion matrix for LGBM on the SMOTE-resampled data.
cf_mat_syn3 = cm(y_test_syn, lgbm_preds_syn_fin, labels=[0, 1])
disp_syn3 = ConfusionMatrixDisplay(
    confusion_matrix=cf_mat_syn3,
    display_labels=["0 : Not Fraud", "1 : Fraud"],
)
disp_syn3.plot()
# Count both kinds of misclassification with boolean masks. The labels are
# converted to an array once, instead of once per row inside a Python loop.
true_syn_lgbm = np.asarray(y_test_syn)
cnt_lgbm2 = int(np.sum((lgbm_preds_syn_fin == 1) & (true_syn_lgbm == 0)))   # not fraud predicted as fraud
cnt2_lgbm2 = int(np.sum((lgbm_preds_syn_fin == 0) & (true_syn_lgbm == 1)))  # fraud predicted as not fraud
# Denominators are taken from the test labels themselves so the printed
# totals always match the data being evaluated.
n_not_fraud_syn_lgbm = int(np.sum(true_syn_lgbm == 0))
n_fraud_syn_lgbm = int(np.sum(true_syn_lgbm == 1))
print(f"Not Fraud Instances Predicted as Fraud (Out of {n_not_fraud_syn_lgbm:,}) : ", cnt_lgbm2)
print(f"Fraud Instances Predicted as Not Fraud (Out of {n_fraud_syn_lgbm:,}) : ", cnt2_lgbm2)
print("_________________________________________________________________")
print("")
print(f"Total Misclassified Instances (Out of {true_syn_lgbm.size:,}) : ", cnt_lgbm2 + cnt2_lgbm2)
print("_________________________________________________________________")
"""# Plots for Performance on different model selections"""
# Each list pairs the true test labels with a model's predictions, in the
# order: imbalanced, undersampled, oversampled, SMOTE. For every model the
# four metrics are computed over those pairs and rounded to 3 decimals.

##### For AdaBoost #####
ada_runs = [
    (y_test, y_preds),
    (y_test1, y_preds_under1),
    (y_test2, y_preds_over1),
    (y_test_syn, y_preds_syn),
]
f1_s_ada = np.round(np.array([f1(truth, pred) for truth, pred in ada_runs]), 3)
ps_s_ada = np.round(np.array([ps(truth, pred) for truth, pred in ada_runs]), 3)
rs_s_ada = np.round(np.array([rs(truth, pred) for truth, pred in ada_runs]), 3)
acc_s_ada = np.round(np.array([acc(truth, pred) for truth, pred in ada_runs]), 3)

##### For XGBoost #####
xgb_runs = [
    (y_test, y_preds_xgb),
    (y_test1, XGB_under_preds),
    (y_test2, XGB_over_preds),
    (y_test_syn, XGB_syn_preds),
]
f1_s_xgb = np.round(np.array([f1(truth, pred) for truth, pred in xgb_runs]), 3)
ps_s_xgb = np.round(np.array([ps(truth, pred) for truth, pred in xgb_runs]), 3)
rs_s_xgb = np.round(np.array([rs(truth, pred) for truth, pred in xgb_runs]), 3)
acc_s_xgb = np.round(np.array([acc(truth, pred) for truth, pred in xgb_runs]), 3)

##### For LGBM #####
lgbm_runs = [
    (y_test, lgbm_preds_imb),
    (y_test1, lgbm_preds_under_fin),
    (y_test2, lgbm_preds_over_fin),
    (y_test_syn, lgbm_preds_syn_fin),
]
f1_s_lgbm = np.round(np.array([f1(truth, pred) for truth, pred in lgbm_runs]), 3)
ps_s_lgbm = np.round(np.array([ps(truth, pred) for truth, pred in lgbm_runs]), 3)
rs_s_lgbm = np.round(np.array([rs(truth, pred) for truth, pred in lgbm_runs]), 3)
acc_s_lgbm = np.round(np.array([acc(truth, pred) for truth, pred in lgbm_runs]), 3)
# One figure per metric (F1, precision, recall, accuracy), each showing the
# AdaBoost, XGBoost and LGBM scores across the four data-handling methods.
sampling_methods = ["Imbalanced", "Under Sampled", "Over Sampled", "SMOTE"]
metric_figures = [
    ("Plot of F1 Scores for Different methods", "F1 score", f1_s_ada, f1_s_xgb, f1_s_lgbm),
    ("Plot of Precission Scores for Different methods", "Precission Scores", ps_s_ada, ps_s_xgb, ps_s_lgbm),
    ("Plot of Recall Scores for Different methods", "Recall Scores", rs_s_ada, rs_s_xgb, rs_s_lgbm),
    ("Plot of Accuracy Scores for Different methods", "Accuracy Scores", acc_s_ada, acc_s_xgb, acc_s_lgbm),
]
for fig_title, y_axis_label, ada_scores, xgb_scores, lgbm_scores in metric_figures:
    plt.figure(figsize=(8, 5))
    plt.plot(sampling_methods, ada_scores, marker='o', linestyle='--', label="AdaBoost", color='b')
    plt.plot(sampling_methods, xgb_scores, marker='o', linestyle='--', label="XGBoost", color='g')
    plt.plot(sampling_methods, lgbm_scores, marker='o', linestyle='--', label="LGBM", color='r')
    plt.grid()
    plt.title(fig_title)
    plt.xlabel("Imbalance handling Method", weight='bold')
    plt.ylabel(y_axis_label, weight='bold')
    plt.legend(loc='lower right')
    plt.show()
# Best model graph: overlays accuracy, recall, F1 and precision for XGBoost
# (green) and LGBM (red) on one figure. AdaBoost is deliberately left out,
# as it was in the original (its plot calls were commented out).
plt.figure(figsize=(8, 5))
best_plot_methods = ["Imbalanced", "Under Sampled", "Over Sampled", "SMOTE"]
# The original drew the F1 curves twice and never drew precision; the fourth
# pair below is precision so that every computed metric appears once.
for xgb_scores, lgbm_scores in (
    (acc_s_xgb, acc_s_lgbm),
    (rs_s_xgb, rs_s_lgbm),
    (f1_s_xgb, f1_s_lgbm),
    (ps_s_xgb, ps_s_lgbm),
):
    plt.plot(best_plot_methods, xgb_scores, marker='o', linestyle='--', label="XGBoost", color='g')
    plt.plot(best_plot_methods, lgbm_scores, marker='o', linestyle='--', label="LGBM", color='r')
plt.grid()
plt.title("Plot of Different measures and Scores for different Models")
plt.xlabel("Imbalance handling Method", weight='bold')
plt.ylabel("Scores", weight='bold')
# Two arrows from the same text position, pointing at score 1 for the
# "Over Sampled" and "SMOTE" methods.
low_point = ("Over Sampled", 1)
high_point = ("SMOTE", 1)
plt.annotate('Best Models', xy=low_point, xytext=("Over Sampled", 0.85),
             arrowprops=dict(facecolor='blue', shrink=0.05))
plt.annotate('Best Models', xy=high_point, xytext=("Over Sampled", 0.85),
             arrowprops=dict(facecolor='blue', shrink=0.05))
# No legend is drawn here (the legend call was disabled in the original).
plt.show()
"""<h3>Finally Lets Try to predict the actual Data using our best model and check it's score on the actual data. </h3>"""
# The XGBoost model trained on the oversampled data is taken as the best
# model and is scored against the complete original dataset.
# NOTE(review): XGB_over was fitted on rows drawn from this same dataset, so
# x_data overlaps its training data and the scores below are optimistic.
best_model = XGB_over
y_actual = y_data
y_best_predicted = best_model.predict(x_data)
# Per-class precision / recall / F1 report.
class_names = ["0 : Not Fraud", "1 : Fraud"]
print(classification_report(y_true=y_actual, y_pred=y_best_predicted, target_names=class_names))
# Confusion matrix for the best model on the original data.
act_data_cm = cm(y_actual, y_best_predicted, labels=[0, 1])
disp_act_ = ConfusionMatrixDisplay(
    confusion_matrix=act_data_cm,
    display_labels=["0 : Not Fraud", "1 : Fraud"],
)
disp_act_.plot()
# Count both kinds of misclassification with boolean masks. The labels are
# converted to an array once, instead of once per row inside a Python loop.
actual_labels = np.asarray(y_actual)
# Not fraud predicted as fraud.
cnt_best_model_actual_data = int(np.sum((y_best_predicted == 1) & (actual_labels == 0)))
# Fraud predicted as not fraud.
cnt2_best_model_actual_data = int(np.sum((y_best_predicted == 0) & (actual_labels == 1)))
print("Not Fraud Instances Predicted as Fraud (Out of 2,84,315) : ",cnt_best_model_actual_data)
print("Fraud Instances Predicted as Not Fraud (Out of 492) : ",cnt2_best_model_actual_data)
print("_________________________________________________________________")
print("")
print("Total Misclassified Instances (Out of 2,84,807) : ", cnt_best_model_actual_data+cnt2_best_model_actual_data)
print("_________________________________________________________________")
"""<h4> Our best trained model correctly classifies all the fraud instances of the actual data, hence the model is not biased towards either class. This can be seen from the fact that it misclassified only 10 of the 2,84,315 non-fraud instances as fraud, and did not classify any fraud instance as non-fraud. So every fraud instance is caught with an accuracy of 100%, and every non-fraud instance is detected correctly with an accuracy of 99.996%. Note that the actual data evaluated here includes the rows the model was trained on, so these figures are optimistic rather than a held-out estimate.</h4>"""