-
Notifications
You must be signed in to change notification settings - Fork 0
/
fun_b.py
111 lines (95 loc) · 3.39 KB
/
fun_b.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
# From Say_No_to_Overfitting
import warnings
warnings.filterwarnings("ignore")

# Standard library
import codecs  # this is used for file operations
import math  # needed by entropy(); was missing in the original
import multiprocessing
import os
import pickle
import random as r
import shutil
from multiprocessing import Process  # this is used for multithreading

# Third-party
import matplotlib
matplotlib.use(u'nbAgg')  # must run before pyplot is imported
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import pandas as pd  # duplicate import kept from the original file
import seaborn as sns
from sklearn import preprocessing
from sklearn.calibration import CalibratedClassifierCV
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.manifold import TSNE
from sklearn.metrics import confusion_matrix
from sklearn.metrics import log_loss
from sklearn.model_selection import RandomizedSearchCV
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from xgboost import XGBClassifier
def entropy(p, n):
    """Return the Shannon entropy (natural log) of a binary split.

    Parameters
    ----------
    p : number of positive examples in the node.
    n : number of negative examples in the node.

    Returns
    -------
    float
        -p_ratio*ln(p_ratio) - n_ratio*ln(n_ratio), or 0.0 for a pure or
        empty node (p == 0 or n == 0), where the naive expression would
        raise on log(0) / divide by zero.
    """
    total = p + n
    if total == 0 or p == 0 or n == 0:
        # A pure (or empty) split carries no uncertainty.
        return 0.0
    p_ratio = float(p) / total
    n_ratio = float(n) / total
    return -p_ratio * math.log(p_ratio) - n_ratio * math.log(n_ratio)
def info_gain(p0, n0, p1, n1, p, n):
    """Information gain of splitting a parent node (p, n) into two
    children (p0, n0) and (p1, n1): parent entropy minus the
    size-weighted entropy of each child.
    """
    total = p + n
    parent = entropy(p, n)
    left = float(p0 + n0) / total * entropy(p0, n0)
    right = float(p1 + n1) / total * entropy(p1, n1)
    return parent - left - right
import array
from csv import writer
def read_image(filename):
    """Read a binary file as an 8-bit "image" and flatten it to 1000 features.

    The file is interpreted as rows of 256 bytes; trailing bytes that do
    not fill a complete row are dropped.  The row-major pixel data is
    then cycled/truncated to exactly 1000 values by np.resize (np.resize
    repeats the data when the file is shorter than 1000 bytes).

    Parameters
    ----------
    filename : str
        Path of the file to read.

    Returns
    -------
    list
        1000 numpy.uint8 values.
    """
    width = 256
    ln = os.path.getsize(filename)  # length of file in bytes
    rem = ln % width                # trailing bytes of a partial row
    a = array.array("B")            # unsigned 8-bit buffer
    # `with` guarantees the handle is closed even if fromfile raises;
    # the original leaked the descriptor on the error path.
    with open(filename, 'rb') as f:
        a.fromfile(f, ln - rem)
    g = np.reshape(a, (len(a) // width, width))
    g = np.uint8(g)
    # Cycle/truncate the flattened pixels to a fixed-length feature vector.
    g = np.resize(g, (1000,))
    return list(g)
# Extract byte-file image features (the original comment said "asm" but
# this function processes the .txt byte files).
def extract_byte_image_features(tfiles):
    """Extract 1000 image features per byte file and write them to a CSV.

    Writes 'data/<pid>-test-image-features-byte.csv' with one row per
    input file: the filename stem followed by 1000 byte features from
    read_image().  Designed to run inside a worker process; the output
    name is keyed on os.getpid() so parallel workers don't collide.

    Parameters
    ----------
    tfiles : list of str
        Filenames relative to the module-level `ext_drive` directory;
        only names containing '.txt' are processed.
    """
    byte_files = [i for i in tfiles if '.txt' in i]
    ftot = len(byte_files)
    pid = os.getpid()
    print('Process id:', pid)
    feature_file = 'data/' + str(pid) + '-test-image-features-byte.csv'
    print('feature file:', feature_file)
    outrows = []
    with open(feature_file, 'w') as f:
        fw = writer(f)
        column_names = ['filename'] + [("byte_{:s}".format(str(x))) for x in range(1000)]
        fw.writerow(column_names)
        for idx, fname in enumerate(byte_files):
            file_id = fname.split('.')[0]
            image_data = read_image(ext_drive + fname)
            outrows.append([file_id] + image_data)
            # Flush in batches of 10 to bound memory and report progress.
            if (idx + 1) % 10 == 0:
                print(pid, idx + 1, 'of', ftot, 'files processed.')
                fw.writerows(outrows)
                outrows = []
        # Write whatever is left after the last full batch of 10.
        if len(outrows) > 0:
            fw.writerows(outrows)
            outrows = []
import time as tm
from multiprocessing import Pool
# TRAIN FILES ASM
# Split the byte files into two halves so each half can be handed to its
# own worker process.
start_time = tm.time()
ext_drive = 'byteFiles/'
tfiles = os.listdir(ext_drive)
quart = len(tfiles) // 2  # midpoint of the file list
train1 = tfiles[:quart]
train2 = tfiles[quart:]
print(len(tfiles), quart, (len(train1) + len(train2)))
trains = [train1, train2]
#p = Pool(4)
#p.map(extract_asm_image_features, trains)
#print("Elapsed time: {:.2f} hours.".format((tm.time() - start_time)/3600.0))
def g1_process():
    # Worker entry point: extract features for the first half of the files.
    extract_byte_image_features(train1)
def g2_process():
    # Worker entry point: extract features for the second half of the files.
    extract_byte_image_features(train2)