-
Notifications
You must be signed in to change notification settings - Fork 0
/
malware_catboost.py
158 lines (119 loc) · 6.66 KB
/
malware_catboost.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
# -*- coding: utf-8 -*-
"""bishwa-catboost.ipynb
Automatically generated by Colaboratory.
Original file is located at
https://colab.research.google.com/drive/189BqES1x3tM3q5wyEZ4BncX7oQAPFto_
"""
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.preprocessing import OneHotEncoder
# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory
import os
# Walk the read-only Kaggle input directory and print every file path, so the
# dataset locations used further down can be verified at a glance.
for base_dir, _, file_names in os.walk('/kaggle/input'):
    for file_name in file_names:
        print(os.path.join(base_dir, file_name))
# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All"
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

# Widen the pandas display limits and enlarge the default figure size: the
# malware dataset is wide, and the defaults would truncate inspection output.
pd.set_option('display.max_rows', 100)
pd.set_option('display.max_columns', 100)
plt.rcParams['figure.figsize'] = (20.0, 10.0)
"""**Loading the train data**"""
data_train = pd.read_csv('../input/malware-detection-tejas/train.csv')
"""**Preprocessing the data**"""
data_train.columns
#OMG!! This is a highly unbalanced data. We're screwed.
print(data_train['HasDetections'].value_counts()/data_train.shape[0])
#Let's check null values in each column
(data_train.isna().sum()/data_train.shape[0]).sort_values(ascending=False)
# PuaMode, Census_ProcessorClass, DefaultBrowsersIdentifier have almost more than 94.8% null values so even if we substitute,
#it will make the feature skewed, so better we drop these columns along with MachineIdentifier
data_train.drop(columns=['PuaMode','Census_ProcessorClass','DefaultBrowsersIdentifier','MachineIdentifier'],inplace=True)
# Inspect the value distribution of every remaining column to build a list of
# further candidates for dropping — e.g. those with very skewed distributions.
for col in data_train.columns:
    print(col)
    print(data_train[col].value_counts() / data_train.shape[0])
# All of these columns are highly skewed, and some of them have too many null
# values, which after imputing will make them skewed:
#   Census_IsFlightingInternal
#   Census_InternalBatteryType
#   Census_ThresholdOptIn
#   Census_IsWIMBootEnabled
# Columns judged too skewed (or too null-heavy to impute meaningfully) based
# on the value_counts inspection above; dropped wholesale from the train set.
lst=['ProductName','IsBeta','RtpStateBitfield','IsSxsPassiveMode','AVProductsEnabled',
'HasTpm','Platform','Processor','OsVer','IsProtected','AutoSampleOptIn','SMode','Firewall',
'UacLuaenable','Census_DeviceFamily','Census_ProcessorManufacturerIdentifier','Census_HasOpticalDiskDrive',
'Census_OSArchitecture','Census_IsPortableOperatingSystem','Census_GenuineStateName','Census_IsFlightsDisabled',
'Census_FlightRing','Census_IsFlightingInternal','Census_ThresholdOptIn','Census_IsWIMBootEnabled',
'Census_InternalBatteryType','Census_PowerPlatformRoleName','Census_IsVirtualDevice','Census_IsTouchEnabled',
'Census_IsPenCapable','Census_IsAlwaysOnAlwaysConnectedCapable']
# BUG FIX: the original line was `print(Total number of columns dropped len(lst))`,
# a SyntaxError — the label must be a quoted string passed alongside the count.
print('Total number of columns dropped', len(lst))
data_train.drop(columns=lst, inplace=True)
# Let's again check null values in each column
(data_train.isna().sum()/data_train.shape[0]).sort_values(ascending=False)
# None of the remaining columns is continuous — every one carries discrete
# values — so nulls can reasonably be filled with each column's mode.
cols_with_nulls = [col for col in data_train.columns
                   if data_train[col].isna().any()]
for col in cols_with_nulls:
    data_train[col].fillna(data_train[col].mode()[0], inplace=True)
# Sanity check: no column should report remaining nulls now.
data_train.isna().any()
"""**Loading the test data and repeating the preprocessing steps over test data**"""
data_test = pd.read_csv('/kaggle/input/malware-detection-tejas/test.csv')
#repeating the same steps in data_test
machine_identifier=data_test['MachineIdentifier']
data_test.drop(columns=['MachineIdentifier'],inplace=True)
drop_cols=['PuaMode','Census_ProcessorClass','DefaultBrowsersIdentifier','ProductName','IsBeta','RtpStateBitfield',
'IsSxsPassiveMode','AVProductsEnabled',
'HasTpm','Platform','Processor','OsVer','IsProtected','AutoSampleOptIn','SMode','Firewall',
'UacLuaenable','Census_DeviceFamily','Census_ProcessorManufacturerIdentifier','Census_HasOpticalDiskDrive',
'Census_OSArchitecture','Census_IsPortableOperatingSystem','Census_GenuineStateName','Census_IsFlightsDisabled',
'Census_FlightRing','Census_IsFlightingInternal','Census_ThresholdOptIn','Census_IsWIMBootEnabled',
'Census_InternalBatteryType','Census_PowerPlatformRoleName','Census_IsVirtualDevice','Census_IsTouchEnabled',
'Census_IsPenCapable','Census_IsAlwaysOnAlwaysConnectedCapable']
print(Total number of columns dropped len(drop_cols))
data_test.drop(columns=drop_cols,inplace=True)
# Remaining null counts in the test data (inspection only).
data_test.isna().sum()
# Impute the nulls in the test data with each column's mode, mirroring the
# treatment applied to the training data.
null_cols = [col for col in data_test.columns if data_test[col].isna().any()]
for col in null_cols:
    data_test[col].fillna(data_test[col].mode()[0], inplace=True)
# Compare the shapes: test should have one column fewer (no target).
print(data_train.shape)
print(data_test.shape)
# Segregate the categorical (object-dtype) columns; CatBoost handles these
# natively when they are declared via `cat_features`.
cat_lists = [col for col in data_train.columns
             if data_train[col].dtype == 'object']
print('No of final categorical columns : ', len(cat_lists))
"""**Model Training**"""
X = data_train.iloc[: , :47]
y = data_train.iloc[: , 47: ]
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state = 0)
print(X_train.shape)
print(y_train.shape)
#using catboost
from catboost import CatBoostClassifier
model=CatBoostClassifier(cat_features=cat_lists,n_estimators=1500)
model.fit(X_train,y_train)
# Plot the per-feature importances learned by the fitted model.
plt.figure(figsize = (5, 18))
importance_series = pd.Series(model.feature_importances_, index=X.columns)
importance_series.plot(kind='barh')
# Have a look at the ROC AUC score on the held-out validation split.
from sklearn.metrics import roc_auc_score
positive_proba = model.predict_proba(X_test)[:, 1]
roc_auc_score(y_test, positive_proba)
# Re-fit the model on the entire training set (not just the 70% split) to
# squeeze out extra efficiency before scoring the submission rows.
model.fit(X, y)
# Probability of the positive class (HasDetections=1) for every test machine.
test_proba = model.predict_proba(data_test)[:, 1]
"""**Creating the submission file**"""
# creating the submission file
submission = pd.DataFrame({'MachineIdentifier' : machine_identifier,
                           'HasDetections' : test_proba})
submission.to_csv('cat_boost_0.7147524388500393.csv', index=False)