# malware_finalsub_xgb.py

# -*- coding: utf-8 -*-
"""finalsub-xgb (2).ipynb

Automatically generated by Colaboratory.

Original file is located at
    https://colab.research.google.com/drive/1JelOKlbzSLt6QO83vOcTaG0EcR-RDvek
"""

# All the packages used for processing
import numpy as np 
import pandas as pd
import matplotlib
import matplotlib.pyplot as plt

import seaborn as sns

# Sampling settings.  NOTE(review): neither name is referenced again in the
# visible part of this script -- presumably left over from a sampling step.
train_sample_fraction = None
train_sample_num = 567730

# Column-pruning thresholds: a column is discarded when its share of missing
# values, or the share of its single most frequent value, exceeds these rates.
na_rate_threshold = 0.9
unbalanced_feature_rate_threshold = 0.9

import os

# Print the full path of every file found under the Kaggle input directory.
for folder, _subdirs, file_names in os.walk('/kaggle/input'):
    for file_name in file_names:
        print(os.path.join(folder, file_name))

# Let pandas display up to 100 columns and 100 rows before truncating output.
for display_option in ('display.max_columns', 'display.max_rows'):
    pd.set_option(display_option, 100)

# Default matplotlib figure size for the notebook.
plt.rcParams['figure.figsize'] = (5, 5)

#  Loading Training data
train_data = pd.read_csv('../input/malware-detection-tejas/train.csv')

# Keep only the columns that are neither mostly missing nor dominated by a
# single value.
#
# The keep-list is built from scratch instead of removing entries from the
# list that is being iterated: list.remove() inside the loop shifts the
# remaining items left, so the column right after every removed one was never
# examined and could slip through the filter unchecked.
total_rows = train_data.shape[0]
trainCols = []

for col in train_data.columns:

    # Fraction of rows in which this column is missing.
    na_rate = train_data[col].isnull().sum() / total_rows

    # Share of rows taken by the most frequent value (NaN counts as a value).
    unbalanced_rate = train_data[col].value_counts(normalize=True, dropna=False).values[0]

    if na_rate > na_rate_threshold or unbalanced_rate > unbalanced_feature_rate_threshold:
        continue

    trainCols.append(col)

# Restrict the training frame to the surviving columns (original order kept).
train_data = train_data[trainCols]

# Column names by dtype: object columns are treated as categorical (minus the
# row identifier, which is not a feature); int64/float64 columns as numerical.
cat_lists = list(train_data.select_dtypes(include=['object']).columns)
cat_lists.remove('MachineIdentifier')
int_lists = list(train_data.select_dtypes(include=['int64', 'float64']).columns)

"""## Machine Learning Modeling and Tuning"""

# Loading Test Data
# Read the same columns that survived the training filter, minus the label.
# The label is excluded by name rather than by position (trainCols[:-1]), so
# the script no longer depends on 'HasDetections' being the last column.
test_cols = [c for c in trainCols if c != 'HasDetections']
test = pd.read_csv('../input/malware-detection-tejas/test.csv', usecols=test_cols)

#  Dropping MachineIdentifier column
train_data = train_data.drop(columns=['MachineIdentifier'])
test = test.drop(columns=['MachineIdentifier'])

"""### Filling NA values with the statistical Mode"""

# DataFrame.mode() returns one row per tied mode (shorter columns are padded
# with NaN), so the frame can have more than one row.  The previous
# np.where(mask, modes[col], column) call only broadcast when there was
# exactly one row; taking the first mode explicitly works in every case.
modes = train_data.mode()

for col in train_data.columns:
    train_data[col] = train_data[col].fillna(modes[col].iloc[0])

# NOTE(review): the test set is imputed with its own modes, not the training
# modes -- kept as in the original script; confirm this is intended.
modes_test = test.mode()

for col in test.columns:
    test[col] = test[col].fillna(modes_test[col].iloc[0])

"""### Concatenate both train_sample and test sets before label encoding"""

# Remember both shapes; the training row count is used later to split the
# combined frame back apart.
train_shape, test_shape = train_data.shape, test.shape

# Stack train on top of test so both are label-encoded with the same mapping.
train_and_test = pd.concat([train_data, test], axis=0, sort=False)

"""### Label Encoding the Categorical features before machine learning modeling"""

from sklearn.preprocessing import LabelEncoder
def Encoder(columnlist,data):
    """Label-encode the given columns of ``data`` in place.

    Each listed column is cast to ``str`` and replaced by the output of a
    freshly created ``LabelEncoder.fit_transform`` call, so every column gets
    its own independent encoding.

    Args:
        columnlist: iterable of column names to encode.
        data: frame-like object supporting ``data[name]`` get/set.

    Returns:
        None. ``data`` is modified in place.
    """
    for column in columnlist:
        as_text = data[column].astype(str)
        data[column] = LabelEncoder().fit_transform(as_text)

# Label-encode every categorical column of the combined frame in place.
Encoder(cat_lists, train_and_test)

"""### Splitting the data back to train and test after Label Encoding"""

# The first train_shape[0] rows are the training rows; everything after them
# came from the test set.
n_train_rows = train_shape[0]
train_data = train_and_test[:n_train_rows]
test = train_and_test[n_train_rows:]

del train_and_test

"""### Removing HasDetections column from test data as it has been added during concatenation"""

test = test.drop(columns=["HasDetections"])

# Features are every column except the label; the label is cast to int.
X = train_data.drop(columns=['HasDetections'])
y = train_data['HasDetections'].astype(int)

from sklearn.model_selection import train_test_split
# Hold out 30% of the rows for validation; the seed is fixed at 42.
xtrain, xvalid, ytrain, yvalid = train_test_split(
    X,
    y,
    test_size=0.3,
    random_state=42,
)

from sklearn.model_selection import train_test_split
from sklearn.metrics import roc_auc_score

import xgboost as xgb

"""### Best HyperParameters :
learning_rate =0.03, 
n_estimators=4000, 
max_depth=11,
min_child_weight=7,
gamma=0.2,
subsample=1,
colsample_bytree=0.4,
reg_alpha=0.6,
reg_lambda = 1,
objective= 'binary:logistic',
nthread=-1
"""

# Hyper-parameters for the gradient-boosted classifier.
xgb_params = dict(
    learning_rate=0.03,
    n_estimators=4000,
    max_depth=11,
    min_child_weight=7,
    gamma=0.2,
    subsample=1,
    colsample_bytree=0.4,
    reg_alpha=0.6,
    reg_lambda=1,
    objective='binary:logistic',
    nthread=-1,
    scale_pos_weight=1,
    seed=42,
)
clf_xgb = xgb.XGBClassifier(**xgb_params)

# Both the training and the validation split are passed as evaluation sets;
# AUC is the evaluation metric and early stopping is set to 100 rounds.
eval_sets = [(xtrain, ytrain), (xvalid, yvalid)]
clf_xgb.fit(
    xtrain,
    ytrain,
    eval_set=eval_sets,
    early_stopping_rounds=100,
    eval_metric='auc',
    verbose=100,
)

# Hard labels and class probabilities on the validation split.
predictions = clf_xgb.predict(xvalid)
predictions_probas = clf_xgb.predict_proba(xvalid)

# Score the validation split with the second probability column.
valid_auc = roc_auc_score(yvalid, predictions_probas[:, 1])
print("roc-auc score", valid_auc)

# Tall figure for the feature-importance chart.
sns.set(rc={'figure.figsize': (5, 18)})
xgb.plot_importance(
    clf_xgb,
    title='Feature importance',
    xlabel='F score',
    ylabel='Features',
)

"""### Predicting the probabilities for HasDetections on Test Data

"""

# Second probability column of the model output for each test row.
test_probas = clf_xgb.predict_proba(test)
predictions_proba_test = test_probas[:, 1]

"""### Prepare Submission File"""

# Re-read only the identifier column of the test file and attach the scores;
# this relies on the row order matching the frame that was scored.
submission = pd.read_csv(
    '../input/malware-detection-tejas/test.csv',
    usecols=['MachineIdentifier'],
)
submission['HasDetections'] = predictions_proba_test
submission.to_csv('xgboost1_0.7168420091487775R.csv', index=False)