-
Notifications
You must be signed in to change notification settings - Fork 1
/
url_preproc.py
194 lines (121 loc) · 4.13 KB
/
url_preproc.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
# Purpose: given a dataset of URLs, generate a feature vector that summarizes the core syntax of each URL.
# Can inherit from other vectors (i.e., one based on document-level features).
import math # for calculating entropy
import socket
import pandas as pd
import tldextract
# Input/output configuration: the source CSV is loaded once at import time
# and the augmented table is later written to `output`.
filename = 'pt5000.csv'
output = 'output.csv'
inputdf = pd.read_csv(filename)
# pre-production version
# functions
# Feature 1 - Count periods in URL
def countPeriods(url):
    """Return how many '.' characters appear in ``url``."""
    return url.count('.')
# Feature 2 - URL contains @ or -
def flagSpecialSymbols(url):
    """Return True when ``url`` contains an '@' or a '-' character."""
    return any(symbol in url for symbol in ('@', '-'))
# Feature 3 - Length of URL (raw character count; the >= 74 threshold is not applied here)
def obtainLength(url):
    """Return the number of characters in ``url``."""
    length = len(url)
    return length
# entropy
def shannonEntropy(url):
    """Return the Shannon entropy (in bits per character) of ``url``.

    Leading and trailing whitespace is stripped before measuring. The
    entropy is ``-sum(p * log2(p))`` over the relative frequency ``p`` of
    each distinct character, so the result is always >= 0: it is 0.0 for an
    empty or single-symbol string and grows as characters become more
    varied.

    Args:
        url: The string to measure.

    Returns:
        The entropy as a non-negative float.
    """
    # Local import keeps this function self-contained.
    from collections import Counter

    text = url.strip()
    if not text:
        return 0.0

    total = len(text)
    # One pass to count characters instead of calling str.count per symbol.
    # The minus sign sits inside the sum so a zero result is 0.0, not -0.0.
    return sum(
        -(count / total) * math.log2(count / total)
        for count in Counter(text).values()
    )
def numDigits(url):
    """Return the number of characters in ``url`` for which ``str.isdigit`` is true."""
    return sum(1 for char in url if char.isdigit())
def numParameters(url):
    """Return the number of '&' separators found in ``url``."""
    return url.count('&')
def numQueries(url):
    """Return the number of '?' characters found in ``url``."""
    return url.count('?')
def numAnchors(url):
    """Return the number of '#' fragment markers found in ``url``."""
    return url.count('#')
def hasHttp(url):
    """Return True when the substring 'http:' occurs anywhere in ``url``.

    The match is not anchored to the start, and 'https:' on its own does
    not satisfy it because the colon must directly follow 'http'.
    """
    marker = 'http:'
    return marker in url
def hasHttps(url):
    """Return True when the substring 'https:' occurs anywhere in ``url``.

    The match is not anchored to the start of the string.
    """
    marker = 'https:'
    return marker in url
# get host ip
def get_ip(url):
    """Resolve the host named in ``url`` to an IPv4 address.

    Any leading ``scheme://`` is removed, and credentials, port, path,
    query and fragment are discarded, so only the bare host name is handed
    to the resolver. A scheme-less host that merely begins with "http"
    (for example ``httpbin.org``) is left intact.

    Args:
        url: A URL or bare host name, with or without a scheme.

    Returns:
        The address string returned by ``socket.gethostbyname``, or the
        integer 0 when no host can be extracted or resolution fails.
    """
    # Local import keeps this function self-contained.
    from urllib.parse import urlsplit

    text = url.strip()
    # Only treat "xxx://" as a scheme when "xxx" is a plain token; a "://"
    # that appears later (e.g. inside a query string) must not split the host.
    scheme, separator, remainder = text.partition('://')
    authority = remainder if separator and scheme.isalnum() else text

    try:
        # The '//' prefix makes urlsplit parse the text as a network location.
        hostname = urlsplit('//' + authority.lstrip('/')).hostname
        if not hostname:
            return 0
        return socket.gethostbyname(hostname)
    except (OSError, ValueError):
        # OSError covers resolver failures (socket.gaierror / socket.herror);
        # ValueError covers malformed hosts and IDNA encoding errors.
        return 0
# Per-row feature accumulators. Each list receives one entry per input row
# and becomes a column of the output table. "full" features are computed
# from the 'url' column, "base" features from the 'base_url' column.

# feature 1 - number of periods
base_num_periods_list = []
full_num_periods_list = []

# feature 2 - presence of '@' or '-'
base_spec_symbols_list = []
full_spec_symbols_list = []

# feature 3 - length
base_length_list = []
full_length_list = []

# resolved address of the base URL's host (0 when resolution fails)
ip_list = []

full_anchors_list = []
base_anchors_list = []

full_params_list = []
base_params_list = []

full_queries_list = []
base_queries_list = []

full_digits_list = []
base_digits_list = []

full_entropy_list = []
base_entropy_list = []

# main loop
for index, row in inputdf.iterrows():
    full_url = row['url']
    base_url = row['base_url']
    print('processing ' + full_url)

    # feature 1
    full_num_periods_list.append(countPeriods(full_url))
    base_num_periods_list.append(countPeriods(base_url))

    # feature 2
    full_spec_symbols_list.append(flagSpecialSymbols(full_url))
    base_spec_symbols_list.append(flagSpecialSymbols(base_url))

    # feature 3
    full_length_list.append(obtainLength(full_url))
    base_length_list.append(obtainLength(base_url))

    ip_list.append(get_ip(base_url))

    full_anchors_list.append(numAnchors(full_url))
    base_anchors_list.append(numAnchors(base_url))

    # The base_* parameter and query counts must come from base_url;
    # computing them from the full URL would just duplicate the full_* columns.
    full_params_list.append(numParameters(full_url))
    base_params_list.append(numParameters(base_url))

    full_queries_list.append(numQueries(full_url))
    base_queries_list.append(numQueries(base_url))

    full_digits_list.append(numDigits(full_url))
    base_digits_list.append(numDigits(base_url))

    full_entropy_list.append(shannonEntropy(full_url))
    base_entropy_list.append(shannonEntropy(base_url))

# tie lists to features
inputdf['base_num_periods'] = base_num_periods_list
inputdf['full_num_periods'] = full_num_periods_list
inputdf['base_spec_symbols'] = base_spec_symbols_list
inputdf['full_spec_symbols'] = full_spec_symbols_list
inputdf['base_length'] = base_length_list
inputdf['full_length'] = full_length_list
inputdf['ip_list'] = ip_list
inputdf['full_anchors'] = full_anchors_list
inputdf['base_anchors'] = base_anchors_list
inputdf['full_params'] = full_params_list
inputdf['base_params'] = base_params_list
inputdf['full_queries'] = full_queries_list
inputdf['base_queries'] = base_queries_list
inputdf['full_digits'] = full_digits_list
inputdf['base_digits'] = base_digits_list
inputdf['full_entropy'] = full_entropy_list
inputdf['base_entropy'] = base_entropy_list

# Write the original columns plus the new feature columns, without the index.
inputdf.to_csv(output, encoding='utf-8', index=False)