-
Notifications
You must be signed in to change notification settings - Fork 383
/
Copy pathawspyml.py
309 lines (257 loc) · 9.56 KB
/
awspyml.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
# Copyright 2015 Amazon.com, Inc. or its affiliates. All Rights Reserved.
#
# Licensed under the Amazon Software License (the "License").
# You may not use this file except in compliance with the License.
# A copy of the License is located at
#
# http://aws.amazon.com/asl/
#
# or in the "license" file accompanying this file. This file is distributed
# on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, express
# or implied. See the License for the specific language governing permissions
# and limitations under the License.
"""AWSPyML - Python utilities to help with Amazon Machine Learning.
"""
import boto
import csv
import json
import math
import random
import re
def aml_connection():
    """Connects to the service and validates that credentials are configured properly.

    Opens a connection with ``boto.connect_machinelearning()`` and issues a
    small ``describe_ml_models(limit=1)`` request to confirm that it works.

    Returns:
        The boto Machine Learning connection object.

    Raises:
        RuntimeError: if the validation request fails; the message explains
            how the credentials file should be set up.
    """
    ml = boto.connect_machinelearning()
    try:
        # Check that the connection is configured properly
        ml.describe_ml_models(limit=1)
    except Exception:
        # ``except Exception`` rather than a bare ``except:`` so that
        # KeyboardInterrupt and SystemExit are not turned into RuntimeError.
        # Backslashes in the Windows path are doubled: an unescaped "\U" in a
        # normal string literal is a syntax error on Python 3.  The text that
        # is produced at runtime is unchanged.
        raise RuntimeError("""There was a problem connecting to Amazon Machine Learning.
Be sure your AWS credentials are properly configured.
A credentials file should be in ~/.aws/credentials
(or C:\\Users\\USER_NAME\\.aws\\credentials on Windows)
and look like:
[Credentials]
aws_access_key_id = <your_access_key_here>
aws_secret_access_key = <your_secret_key_here>
""")
    return ml
class Identifiers(object):
    """Generates random identifiers of the form ``<prefix>-<11 characters>``.

    The random part is drawn from ``chars`` (upper-case letters, lower-case
    letters and digits) using the ``random`` module.
    """

    chars = 'ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789'

    @classmethod
    def _new(cls, prefix=None):
        """Return a fresh identifier.

        A truthy ``prefix`` is followed by a dash and then the random part;
        with no prefix only the 11 random characters are returned.
        """
        alphabet = cls.chars
        last_index = len(alphabet) - 1
        # 62^11 -> 65 random bits, which is sufficient to avoid collisions
        # creating 1 entity per second for 200+ years.
        picked = [alphabet[random.randint(0, last_index)] for _ in range(11)]
        head = prefix + '-' if prefix else ''
        return head + ''.join(picked)

    @classmethod
    def new_data_source_id(cls):
        """Return a new identifier prefixed with ``ds-``."""
        return cls._new('ds')

    @classmethod
    def new_ml_model_id(cls):
        """Return a new identifier prefixed with ``ml-``."""
        return cls._new('ml')

    @classmethod
    def new_evaluation_id(cls):
        """Return a new identifier prefixed with ``ev-``."""
        return cls._new('ev')

    @classmethod
    def new_batch_prediction_id(cls):
        """Return a new identifier prefixed with ``bp-``."""
        return cls._new('bp')
class AWSPyMLException(Exception):
    """Base class for the exceptions defined in this module."""


class SchemaException(AWSPyMLException):
    """Base class for schema-related errors."""


class InvalidSchemaException(SchemaException):
    """A schema-related error for schemas that are not valid."""
class JsonConfiguration(object):
    """Base class for JSON Configuration objects.

    The methods here read the JSON-serialisable payload from ``self._obj``,
    which this base class does not set itself.  Every export method calls
    ``validate()`` first.
    """

    def as_obj(self):
        """Validate, then return the underlying payload object (not a copy)."""
        self.validate()
        return self._obj

    def _json_kwargs(self, dense):
        """Return the keyword arguments passed to ``json.dump``/``json.dumps``.

        Dense output uses the json module defaults; otherwise the output is
        indented by two spaces.
        """
        if dense:
            return {}
        return {"indent": 2}

    def validate(self):
        """Raises an exception if any of the contents do not match
        requirements.

        Should be overridden by sub-class.
        """
        pass  # base class does nothing

    def as_json_string(self, dense=False):
        """Validate, then return the payload serialised as a JSON string."""
        self.validate()
        return json.dumps(self._obj, **self._json_kwargs(dense))

    def write_to_file(self, filename, dense=False):
        """Validate, then write the payload as JSON to ``filename``.

        An existing file of that name is overwritten.
        """
        self.validate()
        # The file must be opened for writing (the default mode is
        # read-only), and the ``with`` block closes it even if json.dump
        # raises.
        with open(filename, "w") as f:
            json.dump(self._obj, f, **self._json_kwargs(dense))

    def save_to_s3(self, s3url, dense=False):
        """Validate, then fail: saving to S3 is not implemented.

        Raises:
            NotImplementedError: always (after validation succeeds).
        """
        self.validate()
        # Raising a plain string is itself a TypeError; raise a real
        # exception that says what is going on.
        raise NotImplementedError("Not implemented yet")
class Schema(JsonConfiguration):
    """A Schema object for Amazon Machine Learning.

    Keeps track of variable types.  Helps format the
    JSON used by the API.

    Attributes are stored, in column order, as dicts with the keys
    ``attributeName`` and ``attributeType``.
    """

    VALID_VARIABLE_TYPES = [
        "BINARY",
        "CATEGORICAL",
        "NUMERIC",
        "TEXT",
    ]

    def __init__(self):
        self._obj = {
            "version": "1.0",
            "dataFormat": "CSV",
            "attributes": [],
            "dataFileContainsHeader": True,
            "targetAttributeName": None,
            "excludedAttributeNames": [],  # Optional
            "rowId": None,  # Optional
        }

    def validate(self):
        """Validates that the schema object is properly formed.

        Either returns True or raises an exception.
        Note this is not 100% robust and might not catch all
        problems that the API will catch.

        Raises:
            InvalidSchemaException: if there are no attributes, an attribute
                lacks a name or type, a name is longer than 64 characters,
                or a type is not in VALID_VARIABLE_TYPES.
        """
        if self.num_attributes() == 0:
            raise InvalidSchemaException("no attributes defined")
        for idx, var in enumerate(self.attributes()):
            name = var["attributeName"]
            typ = var["attributeType"]
            if not name or not typ:
                raise InvalidSchemaException(
                    "Variable #%d is not fully defined" % idx)
            if len(name) > 64:
                # Say what is wrong instead of echoing only the name.
                raise InvalidSchemaException(
                    "variable name %s is longer than 64 characters" % name)
            if typ not in self.VALID_VARIABLE_TYPES:
                raise InvalidSchemaException(
                    "variable %s has invalid type %s" % (name, typ))
        return True  # Passes all rules.

    def set_target(self, target_variable_name):
        """Sets the target variable to the variable of the specified name

        Raises:
            SchemaException: if no attribute has that name.
        """
        if self.get_variable_by_name(target_variable_name) is None:
            raise SchemaException("Can't set target to undefined variable %s" %
                                  target_variable_name)
        self._obj['targetAttributeName'] = target_variable_name

    def set_variable_name(self, idx, name):
        """For variable in the position idx, set its name
        """
        self.get_variable_by_idx(idx)['attributeName'] = name

    def set_variable_type(self, idx, attributeType):
        """For variable in the position idx, set its field type.
        """
        self.get_variable_by_idx(idx)['attributeType'] = attributeType

    def num_attributes(self):
        """Return the number of attributes currently in the schema."""
        return len(self._obj['attributes'])

    def attributes(self):
        """Return the list of attribute dicts (the live list, not a copy)."""
        return self._obj["attributes"]

    def get_variable_by_name(self, name):
        """Return the first attribute dict with the given name, or None."""
        for var in self.attributes():
            if var["attributeName"] == name:
                return var
        return None

    def get_variable_by_idx(self, idx):
        """Return the attribute dict at position idx.

        If idx is past the end of the list, the list is first padded with
        blank attributes (name and type both None) up to and including idx.
        """
        missing = 1 + idx - self.num_attributes()
        if missing > 0:
            # The list is too small, so we expand it.  Each placeholder must
            # be its own dict: ``[{...}] * n`` would repeat one shared dict,
            # so naming one new attribute would rename all of them.
            self._obj['attributes'].extend(
                {"attributeName": None, "attributeType": None}
                for _ in range(missing)
            )
        return self._obj['attributes'][idx]

    def set_header_line(self, header_line):
        """Takes a boolean to specify whether or not the
        data file(s) contain a header line with the names
        of the attributes in it
        """
        self._obj["dataFileContainsHeader"] = header_line
class SchemaGuesser(object):
    """Guesses at a good Schema object by looking at sample data.
    """

    def __init__(self):
        self.data = []  # sampled CSV records, one list of fields per record
        self.num_attributes = 0  # number of fields in the first record
        self.schema = Schema()
        self.text_words_threshold = 20  # Heuristic.

    def from_file(self, filename, target_variable=None, header_line='auto',
                  num_lines_to_use=1000):
        """Returns a Schema object that is guessed by looking at the first
        num_lines_to_use records from the given file.

        Can set header_line to true or false if known, otherwise, it guesses.
        If target_variable is given, it is set as the schema's target.

        Each call starts from a fresh Schema and fresh sample data, so one
        guesser can be used for several files.
        """
        self.schema = Schema()
        with open(filename) as f:
            self._load_csv_data(f, num_lines_to_use)
        schema = self._guess_schema_from_data(header_line)
        if target_variable:
            schema.set_target(target_variable)
        return schema

    def _guess_schema_from_data(self, header_line):
        """Name every column and guess its type from the loaded sample.

        header_line is 'auto' (guess from the first record) or a truthy /
        falsy value saying whether the first record is a header.
        """
        if header_line == 'auto':
            header_line = self._guess_if_header_line_present()
        else:
            # Record an explicit answer too; otherwise the schema keeps its
            # default even when the caller said there is no header.
            self.schema.set_header_line(bool(header_line))
        self._name_attributes(header_line)
        rows = self.data[1:] if header_line else self.data
        for i in range(self.num_attributes):  # Loop through columns
            # Records shorter than the first one have no value for this
            # column; skip them instead of failing with an IndexError.
            column = [row[i] for row in rows if i < len(row)]
            self.schema.set_variable_type(i, self._guess_variable_type(column))
        return self.schema

    def _guess_variable_type(self, samples):
        """Return the most likely variable type for a list of string samples.

        Each sample counts as NUMERIC if float() accepts it, as TEXT if it
        splits on whitespace into more than text_words_threshold pieces, and
        as CATEGORICAL otherwise.  The most frequent of the three wins (ties
        go to NUMERIC, then TEXT, then CATEGORICAL).  A NUMERIC result
        becomes BINARY when every numeric sample was 0 or 1.
        """
        counts = {
            "NUMERIC": 0,
            "BINARY": 0,
            "TEXT": 0,
            "CATEGORICAL": 0,
        }
        for sample in samples:
            try:
                num = float(sample)
            except ValueError:
                # Non-numeric
                word_count = len(re.split(r"\s+", sample))
                if word_count > self.text_words_threshold:
                    counts["TEXT"] += 1
                else:
                    counts["CATEGORICAL"] += 1
            else:
                counts["NUMERIC"] += 1
                if num == 0 or num == 1:
                    counts["BINARY"] += 1
        # BINARY samples are also counted as NUMERIC, so BINARY can at best
        # tie NUMERIC.  Comparing all four counts would leave the answer to
        # dict ordering; decide between the three exclusive kinds first.
        best = max(("NUMERIC", "TEXT", "CATEGORICAL"), key=counts.get)
        if (best == "NUMERIC" and counts["NUMERIC"] > 0
                and counts["BINARY"] == counts["NUMERIC"]):
            best = "BINARY"
        return best

    def _name_attributes(self, header_line):
        """Set the attribute names on the schema.

        With a header line the names are the fields of the first record;
        otherwise they are Var1, Var2, ... zero-padded to a common width.
        """
        # Number of decimal digits in the column count.  len(str(n)) gives
        # the same width as floor(log10(n)) + 1 but does not fail for n == 0.
        digits = len(str(self.num_attributes))
        for idx, name in enumerate(self.data[0]):
            if not header_line:
                name = "Var{number:0{width}d}".format(width=digits,
                                                      number=idx + 1)
            self.schema.set_variable_name(idx, name)

    def _guess_if_header_line_present(self):
        """Guess whether the first record is a header, and record the guess.

        The first record is taken to be a header when its fields, treated as
        one column of samples, look CATEGORICAL.
        """
        header_row = self.data[0]
        has_header_line = self._guess_variable_type(
            header_row) == "CATEGORICAL"
        self.schema.set_header_line(has_header_line)
        return has_header_line

    def _load_csv_data(self, csvfile, num_lines_to_use):
        """Loads at most num_lines_to_use records from the open file with csv
        data into self.data, replacing anything loaded earlier, and sets
        self.num_attributes from the first record.

        Returns the records as a list of lists.

        Raises:
            SchemaException: if the file contains no records.
        """
        self.data = []
        for record in csv.reader(csvfile):
            self.data.append(record)
            if len(self.data) == num_lines_to_use:
                break
        if not self.data:
            raise SchemaException("no records found in CSV data")
        self.num_attributes = len(self.data[0])
        return self.data