-
Notifications
You must be signed in to change notification settings - Fork 14
/
chart.py
100 lines (77 loc) · 3.08 KB
/
chart.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
__author__ = 'Davide Tampellini'
__copyright__ = '2015 Davide Tampellini - FabbricaBinaria'
__license__ = 'GNU GPL version 3 or later'
import argparse
import textwrap
import scipy as sp
import numpy as np
import matplotlib.pyplot as plt
from matplotlib.colors import ListedColormap
from sklearn import neighbors
# Command-line interface. The only option is the mandatory list of feature
# columns to plot; parsing runs as soon as the script starts.
parser = argparse.ArgumentParser(
    formatter_class=argparse.RawDescriptionHelpFormatter,
    description=textwrap.dedent('''
        Dump Scraper - Training data chart
        This script will print out a chart showing how the features are distributed
    '''))
parser.add_argument(
    '-c', '--columns',
    help='Comma separated list of training columns that will be displayed',
    required=True)
# argparse prints a usage message and exits here when -c/--columns is missing.
args = parser.parse_args()
# Startup banner: program name, copyright and GPL notice, framed by two rules
# and surrounded by blank lines. Each entry is printed on its own line.
banner_rule = "==============================================================================="
for banner_line in (
        "",
        "Dump Scraper - Training chart",
        "Copyright (C) 2015 FabbricaBinaria - Davide Tampellini",
        banner_rule,
        "Dump Scraper is Free Software, distributed under the terms of the GNU General",
        "Public License version 3 or, at your option, any later version.",
        "This program comes with ABSOLUTELY NO WARRANTY as per sections 15 & 16 of the",
        "license. See http://www.gnu.org/licenses/gpl-3.0.html for details.",
        banner_rule,
        ""):
    print(banner_line)
# Display names of the feature columns, indexed by column number. They follow
# the layout documented where training/features.csv is loaded
# (0 - Trash, 1 - Plain, 2 - Hash).
# NOTE(review): this script used to label column 1 "Hash" and column 2 "Plain",
# which contradicted that layout -- confirm against the CSV header.
column_names = ('Trash', 'Plain', 'Hash')

# Build a real list: on Python 3 map() returns an iterator, which has no len().
try:
    columns = [int(value) for value in args.columns.split(',')]
except ValueError:
    print("Columns should be a comma separated list of integers")
    raise SystemExit(1)

# The chart is two-dimensional, so exactly two feature columns are required.
if len(columns) != 2:
    print("You should pass only two columns at time")
    raise SystemExit(1)

# Axis/title labels, in the same order as the requested columns.
labels = []
for column in columns:
    if not 0 <= column < len(column_names):
        print("Column value should be between 0 and 2")
        # Non-zero status so that calling scripts can detect the failure.
        raise SystemExit(1)
    labels.append(column_names[column])
# Columns of training/features.csv:
#   0 - Trash
#   1 - Plain
#   2 - Hash
#   3 - Label
#   4 - Filename
# The first line of the file is skipped (skip_header=1).
#
# np.genfromtxt is used instead of sp.genfromtxt: SciPy's top-level re-exports
# of NumPy functions are deprecated, and NumPy is already imported here.
features_path = "training/features.csv"

# The two feature columns requested on the command line, one row per sample.
data = np.genfromtxt(features_path, delimiter=",", skip_header=1, usecols=tuple(columns))
# Second-to-last column, i.e. the label under the layout above.
target = np.genfromtxt(features_path, delimiter=",", skip_header=1, usecols=-2)
# Colour maps for the chart: the pale one paints the predicted regions of the
# mesh, the saturated one marks the training samples drawn on top of it.
cmap_bold = ListedColormap(['#FF0000', '#00FF00', '#0000FF'])
cmap_light = ListedColormap(['#FFAAAA', '#AAFFAA', '#AAAAFF'])

# Step size of the mesh on which the classifier is evaluated.
h = 0.02

# k-nearest-neighbours classifier with uniform weights, fitted on the selected
# feature columns and their labels.
n_neighbors = 10
clf = neighbors.KNeighborsClassifier(n_neighbors=n_neighbors, weights='uniform')
clf.fit(data, target)
# Plot the decision boundary. For that, we will assign a color to each
# point in the mesh [x_min, x_max]x[y_min, y_max].
# The mesh extends one unit beyond the data on every side.
x_min, x_max = data[:, 0].min() - 1, data[:, 0].max() + 1
y_min, y_max = data[:, 1].min() - 1, data[:, 1].max() + 1
xx, yy = np.meshgrid(np.arange(x_min, x_max, h), np.arange(y_min, y_max, h))

# Classify every mesh point, then fold the flat predictions back into the
# mesh shape so they can be drawn as the coloured background.
Z = clf.predict(np.c_[xx.ravel(), yy.ravel()]).reshape(xx.shape)

plt.figure()
plt.pcolormesh(xx, yy, Z, cmap=cmap_light)

# Plot also the training points
plt.scatter(data[:, 0], data[:, 1], c=target, cmap=cmap_bold)
plt.xlim(xx.min(), xx.max())
plt.ylim(yy.min(), yy.max())

# Name the axes explicitly: the first requested column is x, the second is y.
plt.xlabel(labels[0])
plt.ylabel(labels[1])
plt.title("%s vs %s (k = %i, weights = '%s')" % (labels[0], labels[1], n_neighbors, 'uniform'))

# print() with a single argument behaves the same on Python 2 and Python 3,
# whereas the print statement is a syntax error on Python 3.
print("Printing chart...")
plt.show()
print("Operation completed")