From 3789317bc1063c3e76bcf6203cc0a666ebb89116 Mon Sep 17 00:00:00 2001 From: Ngin Yun Chuan Date: Sun, 2 Dec 2018 17:05:55 +0800 Subject: [PATCH 01/22] Improve model knob API with changes to model examples (lacking worker-side changes) --- examples/models/image_classification/SkDt.py | 30 ++-- examples/models/image_classification/SkSvm.py | 46 ++---- .../image_classification/TfFeedForward.py | 95 +++++------- .../models/image_classification/TfVgg16.py | 50 +++---- examples/models/pos_tagging/BigramHmm.py | 12 +- examples/models/pos_tagging/PyBiLstm.py | 72 +++------ rafiki/advisor/btb_gp_advisor.py | 58 ++++--- rafiki/model/__init__.py | 1 + rafiki/model/knob.py | 118 +++++++++++++++ rafiki/model/model.py | 141 +++++++++--------- 10 files changed, 324 insertions(+), 299 deletions(-) create mode 100644 rafiki/model/knob.py diff --git a/examples/models/image_classification/SkDt.py b/examples/models/image_classification/SkDt.py index eee91a4e..4f6d53ff 100644 --- a/examples/models/image_classification/SkDt.py +++ b/examples/models/image_classification/SkDt.py @@ -5,34 +5,28 @@ import base64 import numpy as np -from rafiki.model import BaseModel, InvalidModelParamsException, test_model_class +from rafiki.config import APP_MODE +from rafiki.model import BaseModel, InvalidModelParamsException, test_model_class, \ + IntegerKnob, CategoricalKnob from rafiki.constants import TaskType, ModelDependency class SkDt(BaseModel): ''' Implements a decision tree classifier on Scikit-Learn for simple image classification ''' - - def get_knob_config(self): + @staticmethod + def get_knob_config(): return { - 'knobs': { - 'max_depth': { - 'type': 'int', - 'range': [2, 8] - }, - 'criterion': { - 'type': 'string', - 'values': ['gini', 'entropy'] - }, - } + 'max_depth': IntegerKnob(2, 16 if APP_MODE != 'DEV' else 8), + 'criterion': CategoricalKnob(['gini', 'entropy']) } - def init(self, knobs): - self._max_depth = knobs.get('max_depth') - self._criterion = knobs.get('criterion') + def __init__(self, **knobs): + super().__init__(**knobs) + self._knobs = knobs self._clf = self._build_classifier( - self._max_depth, - self._criterion + self._knobs.get('max_depth'), + self._knobs.get('criterion') ) def train(self, dataset_uri): diff --git a/examples/models/image_classification/SkSvm.py b/examples/models/image_classification/SkSvm.py index 2a7162dc..39b86915 100644 --- a/examples/models/image_classification/SkSvm.py +++ b/examples/models/image_classification/SkSvm.py @@ -5,46 +5,32 @@ import base64 import numpy as np -from rafiki.model import BaseModel, InvalidModelParamsException, test_model_class +from rafiki.config import APP_MODE +from rafiki.model import BaseModel, InvalidModelParamsException, test_model_class, \ + IntegerKnob, CategoricalKnob, FloatKnob from rafiki.constants import TaskType, ModelDependency class SkSvm(BaseModel): ''' Implements a SVM on Scikit-Learn for simple image classification ''' - - def get_knob_config(self): + @staticmethod + def get_knob_config(): return { - 'knobs': { - 'max_iter': { - 'type': 'int', - 'range': [10, 10] - }, - 'kernel': { - 'type': 'string', - 'values': ['rbf', 'linear'] - }, - 'gamma': { - 'type': 'string', - 'values': ['scale', 'auto'] - }, - 'C': { - 'type': 'float_exp', - 'range': [1e-2, 1e2] - } - } + 'max_iter': IntegerKnob(10, 40 if APP_MODE != 'DEV' else 10), + 'kernel': CategoricalKnob(['rbf', 'linear']), + 'gamma': CategoricalKnob(['scale', 'auto']), + 'C': FloatKnob(1e-2, 1e2, is_exp=True) } - def init(self, knobs): - self._max_iter = knobs.get('max_iter') - 
self._kernel = knobs.get('kernel') - self._gamma = knobs.get('gamma') - self._C = knobs.get('C') + def __init__(self, **knobs): + super().__init__(**knobs) + self._knobs = knobs self._clf = self._build_classifier( - self._max_iter, - self._kernel, - self._gamma, - self._C + knobs.get('max_iter'), + knobs.get('kernel'), + knobs.get('gamma') , + knobs.get('C') ) def train(self, dataset_uri): diff --git a/examples/models/image_classification/TfFeedForward.py b/examples/models/image_classification/TfFeedForward.py index a94bae34..b907c66b 100644 --- a/examples/models/image_classification/TfFeedForward.py +++ b/examples/models/image_classification/TfFeedForward.py @@ -8,7 +8,8 @@ import base64 from rafiki.config import APP_MODE -from rafiki.model import BaseModel, InvalidModelParamsException, test_model_class +from rafiki.model import BaseModel, InvalidModelParamsException, test_model_class, \ + IntegerKnob, CategoricalKnob, FloatKnob from rafiki.constants import TaskType, ModelDependency class TfFeedForward(BaseModel): @@ -16,53 +17,20 @@ class TfFeedForward(BaseModel): Implements a fully-connected feed-forward neural network with variable hidden layers on Tensorflow for simple image classification ''' - - def get_knob_config(self): - epochs_range = [3, 100] - hidden_layer_count_range = [1, 8] - - if APP_MODE == 'DEV': - print('WARNING: In DEV mode, `epochs` is set to 3 and `hidden_layer_count` is set to 2.') - epochs_range = [3, 3] - hidden_layer_count_range = [2, 2] - + @staticmethod + def get_knob_config(): return { - 'knobs': { - 'epochs': { - 'type': 'int', - 'range': epochs_range - }, - 'hidden_layer_count': { - 'type': 'int', - 'range': hidden_layer_count_range - }, - 'hidden_layer_units': { - 'type': 'int', - 'range': [2, 128] - }, - 'learning_rate': { - 'type': 'float_exp', - 'range': [1e-5, 1e-1] - }, - 'batch_size': { - 'type': 'int_cat', - 'values': [16, 32, 64, 128] - }, - 'image_size': { - 'type': 'int_cat', - 'values': [8, 16, 32] - } - } + 'epochs': IntegerKnob(3, 10 if APP_MODE != 'DEV' else 3), + 'hidden_layer_count': IntegerKnob(1, 8 if APP_MODE != 'DEV' else 2), + 'hidden_layer_units': IntegerKnob(2, 128), + 'learning_rate': FloatKnob(1e-5, 1e-1, is_exp=True), + 'batch_size': CategoricalKnob([16, 32, 64, 128]), + 'image_size': CategoricalKnob([8, 16, 32]), } - def init(self, knobs): - self._batch_size = knobs.get('batch_size') - self._hidden_layer_units = knobs.get('hidden_layer_units') - self._hidden_layer_count = knobs.get('hidden_layer_count') - self._learning_rate = knobs.get('learning_rate') - self._epochs = knobs.get('epochs') - self._image_size = knobs.get('image_size') - + def __init__(self, **knobs): + super().__init__(**knobs) + self._knobs = knobs self._graph = tf.Graph() config = tf.ConfigProto() config.gpu_options.allow_growth = True @@ -70,7 +38,11 @@ def init(self, knobs): self._define_plots() def train(self, dataset_uri): - dataset = self.utils.load_dataset_of_image_files(dataset_uri, image_size=[self._image_size, self._image_size]) + im_sz = self._knobs.get('image_size') + bs = self._knobs.get('batch_size') + ep = self._knobs.get('epochs') + + dataset = self.utils.load_dataset_of_image_files(dataset_uri, image_size=[im_sz, im_sz]) num_classes = dataset.classes (images, classes) = zip(*[(image, image_class) for (image, image_class) in dataset]) images = np.asarray(images) @@ -85,8 +57,8 @@ def train(self, dataset_uri): images, classes, verbose=0, - epochs=self._epochs, - batch_size=self._batch_size, + epochs=ep, + batch_size=bs, callbacks=[ 
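                # Invoked by Keras at the end of every epoch; `_on_train_epoch_end` below
                # logs each epoch's training loss against the plots defined in `_define_plots`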
tf.keras.callbacks.LambdaCallback(on_epoch_end=self._on_train_epoch_end) ] @@ -98,7 +70,9 @@ def train(self, dataset_uri): self.utils.log('Train accuracy: {}'.format(accuracy)) def evaluate(self, dataset_uri): - dataset = self.utils.load_dataset_of_image_files(dataset_uri, image_size=[self._image_size, self._image_size]) + im_sz = self._knobs.get('image_size') + + dataset = self.utils.load_dataset_of_image_files(dataset_uri, image_size=[im_sz, im_sz]) (images, classes) = zip(*[(image, image_class) for (image, image_class) in dataset]) images = np.asarray(images) classes = np.asarray(classes) @@ -111,7 +85,9 @@ def evaluate(self, dataset_uri): return accuracy def predict(self, queries): - X = self.utils.resize_as_images(queries, image_size=[self._image_size, self._image_size]) + im_sz = self._knobs.get('image_size') + + X = self.utils.resize_as_images(queries, image_size=[im_sz, im_sz]) with self._graph.as_default(): with self._sess.as_default(): probs = self._model.predict(X) @@ -166,20 +142,17 @@ def _define_plots(self): self.utils.define_plot('Loss Over Time', ['loss']) def _build_model(self, num_classes): - hidden_layer_units = self._hidden_layer_units - hidden_layer_count = self._hidden_layer_count - learning_rate = self._learning_rate - image_size = self._image_size + units = self._knobs.get('hidden_layer_units') + layers = self._knobs.get('hidden_layer_count') + lr = self._knobs.get('learning_rate') + im_sz = self._knobs.get('image_size') model = keras.Sequential() - model.add(keras.layers.Flatten(input_shape=(image_size, image_size,))) + model.add(keras.layers.Flatten(input_shape=(im_sz, im_sz,))) model.add(keras.layers.BatchNormalization()) - for _ in range(hidden_layer_count): - model.add(keras.layers.Dense( - hidden_layer_units, - activation=tf.nn.relu - )) + for _ in range(layers): + model.add(keras.layers.Dense(units, activation=tf.nn.relu)) model.add(keras.layers.Dense( num_classes, @@ -187,7 +160,7 @@ def _build_model(self, num_classes): )) model.compile( - optimizer=keras.optimizers.Adam(lr=learning_rate), + optimizer=keras.optimizers.Adam(lr=lr), loss='sparse_categorical_crossentropy', metrics=['accuracy'] ) diff --git a/examples/models/image_classification/TfVgg16.py b/examples/models/image_classification/TfVgg16.py index b783e35f..9a88bad1 100644 --- a/examples/models/image_classification/TfVgg16.py +++ b/examples/models/image_classification/TfVgg16.py @@ -8,7 +8,8 @@ import abc from urllib.parse import urlparse, parse_qs -from rafiki.model import BaseModel, InvalidModelParamsException, test_model_class +from rafiki.model import BaseModel, InvalidModelParamsException, test_model_class, \ + IntegerKnob, FloatKnob, CategoricalKnob from rafiki.constants import TaskType, ModelDependency from rafiki.config import APP_MODE @@ -16,42 +17,26 @@ class TfVgg16(BaseModel): ''' Implements VGG16 on Tensorflow for simple image classification ''' - - def get_knob_config(self): - epochs_range = [1, 20] - - if APP_MODE == 'DEV': - print('WARNING: In DEV mode, `epochs` is set to 1.') - epochs_range = [1, 1] - + @staticmethod + def get_knob_config(): return { - 'knobs': { - 'epochs': { - 'type': 'int', - 'range': epochs_range - }, - 'learning_rate': { - 'type': 'float_exp', - 'range': [1e-5, 1e-1] - }, - 'batch_size': { - 'type': 'int_cat', - 'values': [16, 32, 64, 128] - } - } + 'epochs': IntegerKnob(1, 1 if APP_MODE != 'DEV' else 10), + 'learning_rate': FloatKnob(1e-5, 1e-1, is_exp=True), + 'batch_size': CategoricalKnob([16, 32, 64, 128]), } - def init(self, knobs): - self._batch_size = 
knobs.get('batch_size') - self._epochs = knobs.get('epochs') - self._learning_rate = knobs.get('learning_rate') - + def __init__(self, **knobs): + super().__init__(**knobs) + self._knobs = knobs self._graph = tf.Graph() config = tf.ConfigProto() config.gpu_options.allow_growth = True self._sess = tf.Session(graph=self._graph, config=config) def train(self, dataset_uri): + ep = self._knobs.get('epochs') + bs = self._knobs.get('batch_size') + dataset = self.utils.load_dataset_of_image_files(dataset_uri, image_size=[48, 48]) num_classes = dataset.classes (images, classes) = zip(*[(image, image_class) for (image, image_class) in dataset]) @@ -65,8 +50,8 @@ def train(self, dataset_uri): self._model.fit( images, classes, - epochs=self._epochs, - batch_size=self._batch_size + epochs=ep, + batch_size=bs ) def evaluate(self, dataset_uri): @@ -129,7 +114,8 @@ def load_parameters(self, params): self._model = keras.models.load_model(tmp.name) def _build_model(self, num_classes): - learning_rate = self._learning_rate + lr = self._knobs.get('learning_rate') + model = keras.applications.VGG16( include_top=True, input_shape=(48, 48, 3), @@ -138,7 +124,7 @@ def _build_model(self, num_classes): ) model.compile( - optimizer=keras.optimizers.Adam(lr=learning_rate), + optimizer=keras.optimizers.Adam(lr=lr), loss='sparse_categorical_crossentropy', metrics=['accuracy'] ) diff --git a/examples/models/pos_tagging/BigramHmm.py b/examples/models/pos_tagging/BigramHmm.py index 219f3a07..b27efc12 100644 --- a/examples/models/pos_tagging/BigramHmm.py +++ b/examples/models/pos_tagging/BigramHmm.py @@ -18,14 +18,12 @@ class BigramHmm(BaseModel): ''' Implements Bigram Hidden Markov Model (HMM) for POS tagging ''' + @staticmethod + def get_knob_config(): + return {} - def get_knob_config(self): - return { - 'knobs': {} - } - - def init(self, knobs): - pass + def __init__(self, **knobs): + super().__init__(**knobs) def train(self, dataset_uri): dataset = self.utils.load_dataset_of_corpus(dataset_uri) diff --git a/examples/models/pos_tagging/PyBiLstm.py b/examples/models/pos_tagging/PyBiLstm.py index c906021e..603fe9eb 100644 --- a/examples/models/pos_tagging/PyBiLstm.py +++ b/examples/models/pos_tagging/PyBiLstm.py @@ -12,7 +12,8 @@ import torch.optim as optim from torch.utils.data.dataset import Dataset -from rafiki.model import BaseModel, InvalidModelParamsException, test_model_class +from rafiki.model import BaseModel, InvalidModelParamsException, test_model_class, \ + IntegerKnob, FloatKnob, CategoricalKnob from rafiki.constants import TaskType, ModelDependency from rafiki.config import APP_MODE @@ -20,50 +21,20 @@ class PyBiLstm(BaseModel): ''' Implements a Bidrectional LSTM model in Pytorch for POS tagging ''' - - def get_knob_config(self): - epochs_range = [10, 50] - - if APP_MODE == 'DEV': - print('WARNING: In DEV mode, `epochs` is set to 10.') - epochs_range = [10, 10] - + @staticmethod + def get_knob_config(): return { - 'knobs': { - 'epochs': { - 'type': 'int', - 'range': epochs_range - }, - 'batch_size': { - 'type': 'int_cat', - 'values': [16, 32, 64, 128] - }, - 'word_embed_dims': { - 'type': 'int', - 'range': [16, 128] - }, - 'learning_rate': { - 'type': 'float_exp', - 'range': [1e-2, 1e-1] - }, - 'word_rnn_hidden_size': { - 'type': 'int', - 'range': [16, 128] - }, - 'word_dropout': { - 'type': 'float_exp', - 'range': [1e-3, 2e-1] - } - } + 'epochs': IntegerKnob(10, 50 if APP_MODE != 'DEV' else 10), + 'word_embed_dims': IntegerKnob(16, 128), + 'word_rnn_hidden_size': IntegerKnob(16, 128), + 'word_dropout': 
FloatKnob(1e-3, 2e-1, is_exp=True), + 'learning_rate': FloatKnob(1e-2, 1e-1, is_exp=True), + 'batch_size': CategoricalKnob([16, 32, 64, 128]), } - def init(self, knobs): - self._epochs = knobs.get('epochs') - self._word_embed_dims = knobs.get('word_embed_dims') - self._word_rnn_hidden_size = knobs.get('word_rnn_hidden_size') - self._word_dropout = knobs.get('word_dropout') - self._batch_size = knobs.get('batch_size') - self._learning_rate = knobs.get('learning_rate') + def __init__(self, **knobs): + super().__init__(**knobs) + self._knobs = knobs self._define_plots() def train(self, dataset_uri): @@ -160,7 +131,7 @@ def _prepare_batch(self, dataset, lo, hi, Tensor, has_tags=True): return (words_tsr, tags_tsr) def _predict(self, dataset): - N = self._batch_size + N = self._knobs.get('batch_size') net = self._net B = math.ceil(len(dataset) / N) # No. of batches word_count = len(self._word_dict) @@ -196,8 +167,8 @@ def _predict(self, dataset): return sents_pred_tags def _train(self, dataset): - N = self._batch_size - epochs = self._epochs + N = self._knobs.get('batch_size') + ep = self._knobs.get('epochs') null_tag = self._tag_count # Tag to ignore (from padding of sentences during batching) B = math.ceil(len(dataset) / N) # No. of batches @@ -211,7 +182,7 @@ def _train(self, dataset): loss_func = nn.CrossEntropyLoss(ignore_index=null_tag) - for epoch in range(epochs): + for epoch in range(ep): total_loss = 0 for i in range(B): # Extract batch from dataset @@ -256,10 +227,15 @@ def _compute_accuracy(self, dataset, sents_tags): return correct / total def _create_model(self): + word_embed_dims = self._knobs.get('word_embed_dims') + word_rnn_hidden_size = self._knobs.get('word_rnn_hidden_size') + word_dropout = self._knobs.get('word_dropout') + lr = self._knobs.get('learning_rate') + word_count = len(self._word_dict) net = PyNet(word_count + 1, self._tag_count + 1, \ - self._word_embed_dims, self._word_rnn_hidden_size, self._word_dropout) - optimizer = optim.Adam(net.parameters(), lr=self._learning_rate) + word_embed_dims, word_rnn_hidden_size, word_dropout) + optimizer = optim.Adam(net.parameters(), lr=lr) return (net, optimizer) class PyNet(nn.Module): diff --git a/rafiki/advisor/btb_gp_advisor.py b/rafiki/advisor/btb_gp_advisor.py index 95acc53b..ee02305a 100644 --- a/rafiki/advisor/btb_gp_advisor.py +++ b/rafiki/advisor/btb_gp_advisor.py @@ -1,6 +1,7 @@ from btb.tuning import GP from btb import HyperParameter, ParamTypes +from rafiki.model import BaseKnob, FloatKnob, IntegerKnob, CategoricalKnob from .advisor import BaseAdvisor class BtbGpAdvisor(BaseAdvisor): @@ -9,8 +10,7 @@ class BtbGpAdvisor(BaseAdvisor): ''' def __init__(self, knob_config): # TODO: Support conditional knobs - knobs = knob_config['knobs'] - tunables = self._get_tunables(knobs) + tunables = self._get_tunables(knob_config) # TODO: Allow configuration of tuner self._tuner = GP(tunables=tunables) @@ -22,37 +22,31 @@ def propose(self): def feedback(self, knobs, score): self._tuner.add(knobs, score) - def _get_tunables(self, knobs): + def _get_tunables(self, knob_config): tunables = [ - _knob_to_tunable(name, knob_config) - for (name, knob_config) - in knobs.items() + (name, _knob_to_tunable(x)) + for (name, x) + in knob_config.items() ] return tunables -_KNOB_TYPE_TO_TUNABLE_TYPE = { - 'int': ParamTypes.INT, - 'int_exp': ParamTypes.INT_EXP, - 'int_cat': ParamTypes.INT_CAT, - 'float': ParamTypes.FLOAT, - 'float_exp': ParamTypes.FLOAT_EXP, - 'float_cat': ParamTypes.FLOAT_CAT, - 'string': ParamTypes.STRING, - 'bool': 
ParamTypes.BOOL -} - -_KNOB_CONFIG_TO_TUNABLE_RANGE = { - ParamTypes.INT: (lambda x: x['range']), - ParamTypes.INT_EXP: (lambda x: x['range']), - ParamTypes.INT_CAT: (lambda x: x['values']), - ParamTypes.FLOAT: (lambda x: x['range']), - ParamTypes.FLOAT_EXP: (lambda x: x['range']), - ParamTypes.FLOAT_CAT: (lambda x: x['values']), - ParamTypes.STRING: (lambda x: x['values']), - ParamTypes.BOOL: (lambda x: x['values']) -} - -def _knob_to_tunable(name, knob_config): - tunable_type = _KNOB_TYPE_TO_TUNABLE_TYPE[knob_config['type']] - tunable_range = _KNOB_CONFIG_TO_TUNABLE_RANGE[tunable_type](knob_config) - return (name, HyperParameter(tunable_type, tunable_range)) \ No newline at end of file +def _knob_to_tunable(knob): + if isinstance(knob, CategoricalKnob): + if knob.value_type is int: + return HyperParameter(ParamTypes.INT_CAT, knob.values) + elif knob.value_type is float: + return HyperParameter(ParamTypes.FLOAT_CAT, knob.values) + elif knob.value_type is str: + return HyperParameter(ParamTypes.STRING, knob.values) + elif knob.value_type is bool: + return HyperParameter(ParamTypes.BOOL, knob.values) + elif isinstance(knob, IntegerKnob): + if knob.is_exp: + return HyperParameter(ParamTypes.INT_EXP, [knob.value_min, knob.value_max]) + else: + return HyperParameter(ParamTypes.INT, [knob.value_min, knob.value_max]) + elif isinstance(knob, FloatKnob): + if knob.is_exp: + return HyperParameter(ParamTypes.FLOAT_EXP, [knob.value_min, knob.value_max]) + else: + return HyperParameter(ParamTypes.FLOAT, [knob.value_min, knob.value_max]) \ No newline at end of file diff --git a/rafiki/model/__init__.py b/rafiki/model/__init__.py index 8dc86e14..d7af85de 100644 --- a/rafiki/model/__init__.py +++ b/rafiki/model/__init__.py @@ -2,3 +2,4 @@ parse_model_install_command, InvalidModelClassException, InvalidModelParamsException, \ ModelUtils from .log import ModelLogUtilsLogger +from .knob import BaseKnob, CategoricalKnob, IntegerKnob, FloatKnob \ No newline at end of file diff --git a/rafiki/model/knob.py b/rafiki/model/knob.py new file mode 100644 index 00000000..91b76920 --- /dev/null +++ b/rafiki/model/knob.py @@ -0,0 +1,118 @@ +import abc + +# TODO: Add documentation for each knob + +class BaseKnob(abc.ABC): + # TODO: Support conditional and validation logic + pass + +class CategoricalKnob(BaseKnob): + ''' + Knob representing a categorical value of type `int`, `float`, `bool` or `str`. + A generated value of this knob must be an element of `values`. + ''' + + def __init__(self, values): + self._values = values + (self._value_type) = self._validate_values(values) + + @property + def value_type(self): + return self._value_type + + @property + def values(self): + return self._values + + @staticmethod + def _validate_values(values): + if len(values) == 0: + raise ValueError('Length of `values` should at least 1') + + if isinstance(values[0], int): + value_type = int + elif isinstance(values[0], float): + value_type = float + elif isinstance(values[0], bool): + value_type = bool + elif isinstance(values[0], str): + value_type = str + else: + raise TypeError('Only the following types for `values` are supported: `int`, `float`, `bool`, `str`') + + if any([not isinstance(x, value_type) for x in values]): + raise TypeError('`values` should have elements of the same type') + + return (value_type) + +class IntegerKnob(BaseKnob): + ''' + Knob representing any `int` value within a specific interval (`value_min`, `value_max`). + `is_exp` specifies whether the knob value should be scaled exponentially. 
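+    E.g. `IntegerKnob(2, 128)` generates `int` values from 2 to 128.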
+ ''' + + def __init__(self, value_min, value_max, is_exp=False): + self._validate_values(value_min, value_max) + self._value_min = value_min + self._value_max = value_max + self._is_exp = is_exp + + @property + def value_min(self): + return self._value_min + + @property + def value_max(self): + return self._value_max + + @property + def is_exp(self): + return self._is_exp + + @staticmethod + def _validate_values(value_min, value_max): + if not isinstance(value_min, int): + raise ValueError('`value_min` should be an `int`') + + if not isinstance(value_max, int): + raise ValueError('`value_max` should be an `int`') + + if value_min > value_max: + raise ValueError('`value_max` should be at least `value_min`') + + +class FloatKnob(BaseKnob): + ''' + Knob representing any `float` value within a specific interval (`value_min`, `value_max`). + `is_exp` specifies whether the knob value should be scaled exponentially. + ''' + + def __init__(self, value_min, value_max, is_exp=False): + self._validate_values(value_min, value_max) + self._value_min = value_min + self._value_max = value_max + self._is_exp = is_exp + + @property + def value_min(self): + return self._value_min + + @property + def value_max(self): + return self._value_max + + @property + def is_exp(self): + return self._is_exp + + @staticmethod + def _validate_values(value_min, value_max): + if not isinstance(value_min, float) and not isinstance(value_min, int): + raise ValueError('`value_min` should be a `float` or `int`') + + if not isinstance(value_max, float) and not isinstance(value_max, int): + raise ValueError('`value_max` should be a `float` or `int`') + + if value_min > value_max: + raise ValueError('`value_max` should be at least `value_min`') + \ No newline at end of file diff --git a/rafiki/model/model.py b/rafiki/model/model.py index b30b6286..9e12620c 100644 --- a/rafiki/model/model.py +++ b/rafiki/model/model.py @@ -5,6 +5,7 @@ import pickle import uuid from importlib import import_module +import inspect from rafiki.advisor import Advisor, AdvisorType from rafiki.predictor import ensemble_predictions @@ -12,6 +13,7 @@ from .dataset import ModelDatasetUtils from .log import ModelLogUtils +from .knob import BaseKnob class InvalidModelClassException(Exception): pass class InvalidModelParamsException(Exception): pass @@ -24,57 +26,33 @@ def __init__(self): class BaseModel(abc.ABC): ''' Rafiki's base model class that Rafiki models should extend. - Rafiki models should implement all abstract methods according to their associated tasks' specifications. + Rafiki models should implement all abstract methods according to their associated tasks' specifications, + including the static method `get_knob_config()`. ''' - def __init__(self): + def __init__(self, **knobs): + ''' + Initialize a model instance with generated knob values. + These knob values will be chosen by Rafiki based on the model's knob config. + Call `super().__init__(**knobs)` as the first line of the model's `__init__` method, + followed by the model's initialization logic. + + :param knobs: Dictionary of knob values for this model instance + :type knobs: dict[str, any] + ''' self.utils = ModelUtils() - super().__init__() - @abc.abstractmethod - def get_knob_config(self): + @staticmethod + def get_knob_config(): ''' - Return a dictionary defining this model's knob configuration + Return a dictionary defining this model class' knob configuration (i.e. list of knob names, their data types and their ranges). 
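+
+        For example, it could return (mirroring the knobs of the `TfFeedForward` example model):
+
+        ::
+
+            {
+                'hidden_layer_units': IntegerKnob(2, 128),
+                'learning_rate': FloatKnob(1e-5, 1e-1, is_exp=True),
+                'batch_size': CategoricalKnob([16, 32, 64, 128])
+            }
+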
:returns: Dictionary defining this model's knob configuration - :rtype: - :: - - { - 'knobs': { - 'hidden_layer_units': { - 'type': 'int', - 'range': [2, 128] - }, - 'epochs': { - 'type': 'int', - 'range': [1, 100] - }, - 'learning_rate': { - 'type': 'float_exp', - 'range': [1e-5, 1e-1] - }, - 'batch_size': { - 'type': 'int_cat', - 'values': [1, 2, 4, 8, 16, 32, 64, 128] - } - } - } - + :rtype: dict[str, rafiki.model.BaseKnob] ''' raise NotImplementedError() - def init(self, knobs): - ''' - Initialize the model with a dictionary of knob values. - These knob values will be chosen by Rafiki based on the model's knob config. - - :param knobs: Dictionary of knob values for this model instance - :type knobs: dict[str, any] - ''' - pass - @abc.abstractmethod def train(self, dataset_uri): ''' @@ -164,7 +142,9 @@ def test_model_class(model_file_path, model_class, task, dependencies, \ :returns: The trained model ''' try: - print('Testing model installation...') + _print_header('Installing & checking model dependencies...') + _check_dependencies(dependencies) + # Test installation if not isinstance(dependencies, dict): raise Exception('`dependencies` should be a dict[str, str]') @@ -173,30 +153,24 @@ def test_model_class(model_file_path, model_class, task, dependencies, \ exit_code = os.system(install_command) if exit_code != 0: raise Exception('Error in installing model dependencies') - print('Testing loading of model...') + _print_header('Checking loading of model & model definition...') f = open(model_file_path, 'rb') model_file_bytes = f.read() - py_model_class = load_model_class(model_file_bytes, model_class) - model_inst = py_model_class() - if not isinstance(model_inst, BaseModel): - raise Exception('Model should extend `rafiki.model.BaseModel`') - - knob_config = model_inst.get_knob_config() - if not isinstance(knob_config, dict): - raise Exception('`get_knob_config()` should return a dict[str, any]') + py_model_class = load_model_class(model_file_bytes, model_class, temp_mod_name='your-model-file-temp') + _check_model_class(py_model_class) - if 'knobs' not in knob_config: - raise Exception('`knob_config` should have a \'knobs\' key') + _print_header('Checking model knob configuration...') + knob_config = py_model_class.get_knob_config() + _check_knob_config(knob_config) - print('Checking model dependencies & methods...') - _check_dependencies(py_model_class, dependencies) - _check_methods(py_model_class) - - print('Testing training & evaluation of model...') + _print_header('Checking model initialization...') advisor = Advisor(knob_config, advisor_type=AdvisorType.BTB_GP) if knobs is None: knobs = advisor.propose() print('Using knobs: {}'.format(knobs)) - model_inst.init(knobs) + model_inst = py_model_class(**knobs) + _check_model_inst(model_inst) + + _print_header('Checking training & evaluation of model...') model_inst.train(train_dataset_uri) score = model_inst.evaluate(test_dataset_uri) @@ -205,7 +179,7 @@ def test_model_class(model_file_path, model_class, task, dependencies, \ print('Score: {}'.format(score)) - print('Testing dumping of parameters of model...') + _print_header('Checking dumping of parameters of model...') parameters = model_inst.dump_parameters() if not isinstance(parameters, dict): @@ -218,13 +192,12 @@ def test_model_class(model_file_path, model_class, task, dependencies, \ traceback.print_stack() raise Exception('`parameters` should be serializable by `pickle`') - print('Testing loading of parameters of model...') + _print_header('Checking loading of parameters of 
model...') model_inst.destroy() - model_inst = py_model_class() - model_inst.init(knobs) + model_inst = py_model_class(**knobs) model_inst.load_parameters(parameters) - print('Testing predictions with model...') + _print_header('Checking predictions with model...') print('Using queries: {}'.format(queries)) predictions = model_inst.predict(queries) @@ -239,15 +212,18 @@ def test_model_class(model_file_path, model_class, task, dependencies, \ predictions = ensemble_predictions([predictions], task) print('Predictions: {}'.format(predictions)) - print('The model definition is valid!') + + _info('The model definition is valid!') return model_inst except Exception as e: raise InvalidModelClassException(e) -def load_model_class(model_file_bytes, model_class): - temp_mod_name = str(uuid.uuid4()) +def load_model_class(model_file_bytes, model_class, temp_mod_name=None): + if temp_mod_name is None: + temp_mod_name = str(uuid.uuid4()) + temp_model_file_name ='{}.py'.format(temp_mod_name) # Temporarily save the model file to disk @@ -290,7 +266,7 @@ def parse_model_install_command(dependencies, enable_gpu=False): return ' '.join(commands) -def _check_dependencies(py_model_class, dependencies): +def _check_dependencies(dependencies): for (dep, ver) in dependencies.items(): # Warn that TF models need to cater for GPU sharing if dep == ModelDependency.TENSORFLOW: @@ -302,15 +278,38 @@ def _check_dependencies(py_model_class, dependencies): elif dep == ModelDependency.KERAS: _warn('Keras models can enable GPU usage with by adding a `tensorflow` dependency.') -def _check_methods(py_model_class): - model_inst = py_model_class() - if getattr(model_inst, 'get_predict_label_mapping', None) is not None: +def _check_model_class(py_model_class): + if not issubclass(py_model_class, BaseModel): + raise Exception('Model should extend `rafiki.model.BaseModel`') + + if inspect.isfunction(getattr(py_model_class, 'get_predict_label_mapping', None)): _warn('`get_predict_label_mapping` has been deprecated') + + if inspect.isfunction(getattr(py_model_class, 'init', None)): + _warn('`init` has been deprecated - use `__init__` for your model\'s initialization logic instead') + + if inspect.isfunction(getattr(py_model_class, 'get_knob_config', None)) and \ + not isinstance(py_model_class.__dict__.get('get_knob_config', None), staticmethod): + _warn('`get_knob_config` has been changed to a `@staticmethod`') + +def _check_model_inst(model_inst): + if getattr(model_inst, 'utils', None) is None: + raise Exception('`super().__init__(**knobs)` should be called as the first line of the model\'s `__init__` method.') + +def _check_knob_config(knob_config): + if not isinstance(knob_config, dict) or \ + any([(not isinstance(name, str) or not isinstance(knob, BaseKnob)) for (name, knob) in knob_config.items()]): + raise Exception('Static method `get_knob_config()` should return a dict[str, BaseKnob]') def _info(msg): msg_color = '\033[94m' end_color = '\033[0m' - print('{}INFO: {}{}'.format(msg_color, msg, end_color)) + print('{}{}{}'.format(msg_color, msg, end_color)) + +def _print_header(msg): + print('-' * (len(msg) + 4)) + print('| {} |'.format(msg)) + print('-' * (len(msg) + 4)) def _warn(msg): msg_color = '\033[93m' From 7c4094297a865ce588033d9e4d4d4e824e3f0fd0 Mon Sep 17 00:00:00 2001 From: Ngin Yun Chuan Date: Mon, 3 Dec 2018 12:38:15 +0800 Subject: [PATCH 02/22] Make worker-side changes for changes to knobs API --- dockerfiles/advisor.Dockerfile | 2 ++ rafiki/model/knob.py | 4 ++-- rafiki/worker/inference.py | 3 +-- 
rafiki/worker/train.py | 9 +++------ 4 files changed, 8 insertions(+), 10 deletions(-) diff --git a/dockerfiles/advisor.Dockerfile b/dockerfiles/advisor.Dockerfile index 5b17bb9a..e9ceda37 100644 --- a/dockerfiles/advisor.Dockerfile +++ b/dockerfiles/advisor.Dockerfile @@ -21,6 +21,8 @@ ENV PYTHONPATH $DOCKER_WORKDIR_PATH # Install python dependencies COPY rafiki/utils/requirements.txt utils/requirements.txt RUN pip install -r utils/requirements.txt +RUN pip install -r model/requirements.txt +COPY rafiki/container/requirements.txt container/requirements.txt COPY rafiki/advisor/requirements.txt advisor/requirements.txt RUN pip install -r advisor/requirements.txt diff --git a/rafiki/model/knob.py b/rafiki/model/knob.py index 91b76920..c7325bff 100644 --- a/rafiki/model/knob.py +++ b/rafiki/model/knob.py @@ -47,7 +47,7 @@ def _validate_values(values): class IntegerKnob(BaseKnob): ''' - Knob representing any `int` value within a specific interval (`value_min`, `value_max`). + Knob representing any `int` value within a specific interval [`value_min`, `value_max`]. `is_exp` specifies whether the knob value should be scaled exponentially. ''' @@ -83,7 +83,7 @@ def _validate_values(value_min, value_max): class FloatKnob(BaseKnob): ''' - Knob representing any `float` value within a specific interval (`value_min`, `value_max`). + Knob representing any `float` value within a specific interval [`value_min`, `value_max`]. `is_exp` specifies whether the knob value should be scaled exponentially. ''' diff --git a/rafiki/worker/inference.py b/rafiki/worker/inference.py index ffa884e7..332e37ac 100644 --- a/rafiki/worker/inference.py +++ b/rafiki/worker/inference.py @@ -80,8 +80,7 @@ def _load_model(self, trial_id): # Load model based on trial clazz = load_model_class(model.model_file_bytes, model.model_class) - model_inst = clazz() - model_inst.init(trial.knobs) + model_inst = clazz(**trial.knobs) # Unpickle model parameters and load it parameters = pickle.loads(trial.parameters) diff --git a/rafiki/worker/train.py b/rafiki/worker/train.py index b22c7851..8ef2a4c5 100644 --- a/rafiki/worker/train.py +++ b/rafiki/worker/train.py @@ -133,15 +133,13 @@ def stop(self): def _train_and_evaluate_model(self, clazz, knobs, train_dataset_uri, test_dataset_uri): - model_inst = clazz() + # Initialize model + model_inst = clazz(**knobs) # Insert model training logger model_logger = TrainModelLogUtilsLogger() model_inst.utils.set_logger(model_logger) - # Initialize model - model_inst.init(knobs) - # Train model model_inst.train(train_dataset_uri) @@ -189,8 +187,7 @@ def _stop_worker(self): def _create_advisor(self, clazz): # Retrieve knob config for model of worker - model_inst = clazz() - knob_config = model_inst.get_knob_config() + knob_config = clazz.get_knob_config() # Create advisor associated with worker res = self._client.create_advisor(knob_config, advisor_id=self._service_id) From ad4b25cd83d799ea91f0f650146f2117d6d01012 Mon Sep 17 00:00:00 2001 From: Ngin Yun Chuan Date: Mon, 3 Dec 2018 12:38:37 +0800 Subject: [PATCH 03/22] Increment version no --- .env.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.env.sh b/.env.sh index 4a14a676..15497c8f 100644 --- a/.env.sh +++ b/.env.sh @@ -1,6 +1,6 @@ # Core configuration for Rafiki export DOCKER_NETWORK=rafiki -export RAFIKI_VERSION=0.0.7 +export RAFIKI_VERSION=0.0.8 export RAFIKI_IP_ADDRESS=127.0.0.1 export ADMIN_EXT_PORT=3000 export ADMIN_WEB_EXT_PORT=3001 From f819e88327ea4d1f7bd44432ef34c1130c37047d Mon Sep 17 00:00:00 2001 From: Ngin Yun 
Chuan Date: Mon, 3 Dec 2018 14:13:36 +0800 Subject: [PATCH 04/22] Improve docs on new `__init__` of BaseModel --- .../src/user/client-create-models.include.rst | 5 +-- docs/src/user/creating-models.rst | 7 +++-- examples/models/image_classification/SkDt.py | 4 +-- examples/models/image_classification/SkSvm.py | 2 +- rafiki/model/model.py | 31 ++++++++++++------- 5 files changed, 30 insertions(+), 19 deletions(-) diff --git a/docs/src/user/client-create-models.include.rst b/docs/src/user/client-create-models.include.rst index cda9fd06..53b4472c 100644 --- a/docs/src/user/client-create-models.include.rst +++ b/docs/src/user/client-create-models.include.rst @@ -1,6 +1,7 @@ -To create a model, you will need to submit a model class that extends :class:`rafiki.model.BaseModel` in a single Python file, -where the model's implementation conforms to a specific task (see :ref:`tasks`). +To create a model, you will need to submit a model class that conforms to the specification +by :class:`rafiki.model.BaseModel`, written in a `single` Python file. +The model's implementation should conform to a specific task (see :ref:`tasks`). Refer to the parameters of :meth:`rafiki.client.Client.create_model` for configuring how your model runs on Rafiki, and refer to :ref:`creating-models` to understand more about how to write & test models for Rafiki. diff --git a/docs/src/user/creating-models.rst b/docs/src/user/creating-models.rst index 894e5203..2aca6cb1 100644 --- a/docs/src/user/creating-models.rst +++ b/docs/src/user/creating-models.rst @@ -6,9 +6,10 @@ Creating Models .. contents:: Table of Contents - -To create a model on Rafiki, use the :meth:`rafiki.client.Client.create_model` method. - +To create a model, you will need to submit a model class that conforms to the specification +by :class:`rafiki.model.BaseModel`, written in a `single` Python file. +The model's implementation should conform to a specific task (see :ref:`tasks`). +To submit the model to Rafiki, use the :meth:`rafiki.client.Client.create_model` method. Model Environment -------------------------------------------------------------------- diff --git a/examples/models/image_classification/SkDt.py b/examples/models/image_classification/SkDt.py index 4f6d53ff..984922d0 100644 --- a/examples/models/image_classification/SkDt.py +++ b/examples/models/image_classification/SkDt.py @@ -25,8 +25,8 @@ def __init__(self, **knobs): super().__init__(**knobs) self._knobs = knobs self._clf = self._build_classifier( - self._knobs.get('max_depth'), - self._knobs.get('criterion') + knobs.get('max_depth'), + knobs.get('criterion') ) def train(self, dataset_uri): diff --git a/examples/models/image_classification/SkSvm.py b/examples/models/image_classification/SkSvm.py index 39b86915..35b4b761 100644 --- a/examples/models/image_classification/SkSvm.py +++ b/examples/models/image_classification/SkSvm.py @@ -29,7 +29,7 @@ def __init__(self, **knobs): self._clf = self._build_classifier( knobs.get('max_iter'), knobs.get('kernel'), - knobs.get('gamma') , + knobs.get('gamma'), knobs.get('C') ) diff --git a/rafiki/model/model.py b/rafiki/model/model.py index 9e12620c..7610bf31 100644 --- a/rafiki/model/model.py +++ b/rafiki/model/model.py @@ -27,19 +27,28 @@ class BaseModel(abc.ABC): ''' Rafiki's base model class that Rafiki models should extend. Rafiki models should implement all abstract methods according to their associated tasks' specifications, - including the static method `get_knob_config()`. - ''' + together with the static method ``get_knob_config()``. 
-
+    In the model's ``__init__`` method, call ``super().__init__(**knobs)`` as the first line,
+    followed by the model's initialization logic. The model should initialize itself with ``knobs``,
+    a set of generated knob values for the instance, and possibly save the knobs' values as
+    attribute(s) of the model instance. These knob values will be chosen by Rafiki based on the model's knob config.
+
+    For example:
+
+    ::
+
+        def __init__(self, **knobs):
+            super().__init__(**knobs)
+            self.__dict__.update(knobs)
+            ...
+            self._build_model(self.knob1, self.knob2)
+
+
+    :param knobs: Dictionary of knob values for this model instance
+    :type knobs: dict[str, any]
+    '''
+    def __init__(self, **knobs):
         self.utils = ModelUtils()

From f8a16ddf31a7180f83c35bad8fee5cd174c427c0 Mon Sep 17 00:00:00 2001
From: Ngin Yun Chuan
Date: Mon, 3 Dec 2018 16:23:41 +0800
Subject: [PATCH 05/22] Finish worker-side changes for model knob API update,
 with serialization & deserialization of knob classes

---
 dockerfiles/advisor.Dockerfile |  2 +-
 rafiki/advisor/app.py          |  8 ++++
 rafiki/client/client.py        |  6 +--
 rafiki/model/__init__.py       |  3 +-
 rafiki/model/knob.py           | 69 ++++++++++++++++++++++++++++------
 rafiki/model/model.py          |  6 ++-
 rafiki/worker/train.py         |  5 ++-
 scripts/start_worker.py        |  3 +-
 8 files changed, 82 insertions(+), 20 deletions(-)

diff --git a/dockerfiles/advisor.Dockerfile b/dockerfiles/advisor.Dockerfile
index e9ceda37..dea42998 100644
--- a/dockerfiles/advisor.Dockerfile
+++ b/dockerfiles/advisor.Dockerfile
@@ -21,8 +21,8 @@ ENV PYTHONPATH $DOCKER_WORKDIR_PATH
 # Install python dependencies
 COPY rafiki/utils/requirements.txt utils/requirements.txt
 RUN pip install -r utils/requirements.txt
+COPY rafiki/model/requirements.txt model/requirements.txt
 RUN pip install -r model/requirements.txt
-COPY rafiki/container/requirements.txt container/requirements.txt
 COPY rafiki/advisor/requirements.txt advisor/requirements.txt
 RUN pip install -r advisor/requirements.txt

diff --git a/rafiki/advisor/app.py b/rafiki/advisor/app.py
index 915e08f8..72253281 100644
--- a/rafiki/advisor/app.py
+++ b/rafiki/advisor/app.py
@@ -1,7 +1,9 @@
 from flask import Flask, request, jsonify
 import os
 import traceback
+import json

+from rafiki.model import deserialize_knob_config
 from rafiki.constants import UserType
 from rafiki.config import SUPERADMIN_EMAIL, SUPERADMIN_PASSWORD
 from rafiki.utils.auth import generate_token, decode_token, UnauthorizedException, auth
@@ -40,6 +42,12 @@ def generate_user_token():
 @auth([UserType.ADMIN, UserType.APP_DEVELOPER])
 def create_advisor(auth):
     params = get_request_params()
+
+    # Deserialize knob config
+    if 'knob_config_str' in params:
+        params['knob_config'] = deserialize_knob_config(params['knob_config_str'])
+        del params['knob_config_str']
+
     return jsonify(service.create_advisor(**params))

 @app.route('/advisors/<advisor_id>/propose', methods=['POST'])
diff --git a/rafiki/client/client.py b/rafiki/client/client.py
index 8a609562..0d210f5f 100644
--- a/rafiki/client/client.py
+++ b/rafiki/client/client.py
@@ -365,19 +365,19 @@ def stop_inference_job(self, app, app_version=-1):
     # Advisors
     ####################################

-    def create_advisor(self, knob_config, advisor_id=None):
+    def create_advisor(self, knob_config_str, advisor_id=None):
         '''
         Creates a Rafiki advisor. If `advisor_id` is passed, it will create an advisor
         of that ID, or do nothing if an advisor of that ID has already been created.

-        :param knob_config: Knob configuration for advisor session
-        :type knob_config: dict[str, any]
+        :param knob_config_str: Serialized knob configuration for advisor session
+        :type knob_config_str: str
         :param str advisor_id: ID of advisor to create
         '''
         data = self._post('/advisors', target='advisor', json={
             'advisor_id': advisor_id,
-            'knob_config': knob_config
+            'knob_config_str': knob_config_str
         })

         return data
diff --git a/rafiki/model/__init__.py b/rafiki/model/__init__.py
index d7af85de..68f59c24 100644
--- a/rafiki/model/__init__.py
+++ b/rafiki/model/__init__.py
@@ -2,4 +2,5 @@ parse_model_install_command, InvalidModelClassException, InvalidModelParamsException, \
     ModelUtils
 from .log import ModelLogUtilsLogger
-from .knob import BaseKnob, CategoricalKnob, IntegerKnob, FloatKnob
\ No newline at end of file
+from .knob import BaseKnob, CategoricalKnob, IntegerKnob, FloatKnob, \
+    serialize_knob_config, deserialize_knob_config
\ No newline at end of file
diff --git a/rafiki/model/knob.py b/rafiki/model/knob.py
index c7325bff..7ab10d07 100644
--- a/rafiki/model/knob.py
+++ b/rafiki/model/knob.py
@@ -1,18 +1,46 @@
 import abc
-
-# TODO: Add documentation for each knob
+import json

 class BaseKnob(abc.ABC):
+    '''
+    The base class for a knob type.
+    '''
+
     # TODO: Support conditional and validation logic
-    pass
+
+    def __init__(self, knob_args={}):
+        self._knob_args = knob_args
+
+    def to_json(self):
+        return json.dumps({
+            'type': self.__class__.__name__,
+            'args': self._knob_args
+        })
+
+    @classmethod
+    def from_json(cls, json_str):
+        json_dict = json.loads(json_str)
+
+        if 'type' not in json_dict or 'args' not in json_dict:
+            raise ValueError('Invalid JSON representation of knob: {}.'.format(json_str))
+
+        knob_type = json_dict['type']
+        knob_args = json_dict['args']
+        knob_classes = [CategoricalKnob, IntegerKnob, FloatKnob]
+        for clazz in knob_classes:
+            if clazz.__name__ == knob_type:
+                return clazz(**knob_args)
+
+        raise ValueError('Invalid knob type: {}'.format(knob_type))

 class CategoricalKnob(BaseKnob):
     '''
-    Knob representing a categorical value of type `int`, `float`, `bool` or `str`.
-    A generated value of this knob must be an element of `values`.
+    Knob type representing a categorical value of type ``int``, ``float``, ``bool`` or ``str``.
+    A generated value of this knob would be an element of ``values``.
     '''
-
     def __init__(self, values):
+        knob_args = { 'values': values }
+        super().__init__(knob_args)
         self._values = values
         (self._value_type) = self._validate_values(values)

 class IntegerKnob(BaseKnob):
     '''
-    Knob representing any `int` value within a specific interval [`value_min`, `value_max`].
-    `is_exp` specifies whether the knob value should be scaled exponentially.
+    Knob type representing `any` ``int`` value within a specific interval [``value_min``, ``value_max``].
+    ``is_exp`` specifies whether the knob value should be scaled exponentially.
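+    E.g. ``IntegerKnob(1, 32, is_exp=True)`` would have its value tuned on an exponential scale over [1, 32].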
''' def __init__(self, value_min, value_max, is_exp=False): + knob_args = { 'value_min': value_min, 'value_max': value_max, 'is_exp': is_exp } + super().__init__(knob_args) self._validate_values(value_min, value_max) self._value_min = value_min self._value_max = value_max @@ -83,11 +113,13 @@ def _validate_values(value_min, value_max): class FloatKnob(BaseKnob): ''' - Knob representing any `float` value within a specific interval [`value_min`, `value_max`]. - `is_exp` specifies whether the knob value should be scaled exponentially. + Knob type representing `any` ``float`` value within a specific interval [``value_min``, ``value_max``]. + ``is_exp`` specifies whether the knob value should be scaled exponentially. ''' def __init__(self, value_min, value_max, is_exp=False): + knob_args = { 'value_min': value_min, 'value_max': value_max, 'is_exp': is_exp } + super().__init__(knob_args) self._validate_values(value_min, value_max) self._value_min = value_min self._value_max = value_max @@ -115,4 +147,19 @@ def _validate_values(value_min, value_max): if value_min > value_max: raise ValueError('`value_max` should be at least `value_min`') - \ No newline at end of file + + +def deserialize_knob_config(knob_config_str): + knob_config = { + name: BaseKnob.from_json(knob_str) + for (name, knob_str) in json.loads(knob_config_str).items() + } + return knob_config + +def serialize_knob_config(knob_config): + knob_config_str = json.dumps({ + name: knob.to_json() + for (name, knob) in knob_config.items() + }) + return knob_config_str + \ No newline at end of file diff --git a/rafiki/model/model.py b/rafiki/model/model.py index 7610bf31..e46cb818 100644 --- a/rafiki/model/model.py +++ b/rafiki/model/model.py @@ -13,7 +13,7 @@ from .dataset import ModelDatasetUtils from .log import ModelLogUtils -from .knob import BaseKnob +from .knob import BaseKnob, serialize_knob_config, deserialize_knob_config class InvalidModelClassException(Exception): pass class InvalidModelParamsException(Exception): pass @@ -310,6 +310,10 @@ def _check_knob_config(knob_config): any([(not isinstance(name, str) or not isinstance(knob, BaseKnob)) for (name, knob) in knob_config.items()]): raise Exception('Static method `get_knob_config()` should return a dict[str, BaseKnob]') + # Try serializing and deserialize knob config + knob_config_str = serialize_knob_config(knob_config) + knob_config = deserialize_knob_config(knob_config_str) + def _info(msg): msg_color = '\033[94m' end_color = '\033[0m' diff --git a/rafiki/worker/train.py b/rafiki/worker/train.py index 8ef2a4c5..b287af6b 100644 --- a/rafiki/worker/train.py +++ b/rafiki/worker/train.py @@ -7,7 +7,7 @@ from rafiki.config import SUPERADMIN_EMAIL, SUPERADMIN_PASSWORD from rafiki.constants import TrainJobStatus, TrialStatus, BudgetType -from rafiki.model import load_model_class +from rafiki.model import load_model_class, serialize_knob_config from rafiki.utils.log import JobLogger from rafiki.model import ModelLogUtilsLogger from rafiki.db import Database @@ -188,9 +188,10 @@ def _stop_worker(self): def _create_advisor(self, clazz): # Retrieve knob config for model of worker knob_config = clazz.get_knob_config() + knob_config_str = serialize_knob_config(knob_config) # Create advisor associated with worker - res = self._client.create_advisor(knob_config, advisor_id=self._service_id) + res = self._client.create_advisor(knob_config_str, advisor_id=self._service_id) advisor_id = res['id'] return advisor_id diff --git a/scripts/start_worker.py b/scripts/start_worker.py index 
25863251..c6560ef6 100644 --- a/scripts/start_worker.py +++ b/scripts/start_worker.py @@ -12,7 +12,8 @@ def start_service(service_id, service_type): install_command = os.environ.get('WORKER_INSTALL_COMMAND', '') exit_code = os.system(install_command) if exit_code != 0: - raise Exception('Install command gave non-zero exit code: {}'.format(install_command)) + # TODO: Fix failing install command for `pip install torch==0.4.1;`` + raise Exception('Install command gave non-zero exit code: "{}"'.format(install_command)) if service_type == ServiceType.TRAIN: from rafiki.worker import TrainWorker From 92f46e19ebbbacd95c36cf472cdd43f15a210c17 Mon Sep 17 00:00:00 2001 From: Ngin Yun Chuan Date: Mon, 3 Dec 2018 16:24:07 +0800 Subject: [PATCH 06/22] Improve docs for model developers, describing new knob classes --- docs/src/python/rafiki.model.rst | 27 ++++++++++++++++++++++++ docs/src/user/creating-models.rst | 35 ++++++++++++++++++++----------- 2 files changed, 50 insertions(+), 12 deletions(-) diff --git a/docs/src/python/rafiki.model.rst b/docs/src/python/rafiki.model.rst index 30cc9da2..cd5ce407 100644 --- a/docs/src/python/rafiki.model.rst +++ b/docs/src/python/rafiki.model.rst @@ -1,9 +1,36 @@ rafiki.model ==================================================================== +.. contents:: Table of Contents + +Core Classes +-------------------------------------------------------------------- + .. autoclass:: rafiki.model.BaseModel :members: +.. autoclass:: rafiki.model.BaseKnob + :members: + + +.. _`knob-types`: + +Knob Classes +-------------------------------------------------------------------- + +.. autoclass:: rafiki.model.CategoricalKnob + :members: + +.. autoclass:: rafiki.model.IntegerKnob + :members: + +.. autoclass:: rafiki.model.FloatKnob + :members: + + +Utility Classes & Methods +-------------------------------------------------------------------- + .. automethod:: rafiki.model.test_model_class .. autoclass:: rafiki.model.log.ModelLogUtils diff --git a/docs/src/user/creating-models.rst b/docs/src/user/creating-models.rst index 2aca6cb1..6594b760 100644 --- a/docs/src/user/creating-models.rst +++ b/docs/src/user/creating-models.rst @@ -11,6 +11,28 @@ by :class:`rafiki.model.BaseModel`, written in a `single` Python file. The model's implementation should conform to a specific task (see :ref:`tasks`). To submit the model to Rafiki, use the :meth:`rafiki.client.Client.create_model` method. +Implementing Models +-------------------------------------------------------------------- + +Details on how to implement a model are located in the documentation of :class:`rafiki.model.BaseModel`. + +In defining the hyperparameters (knobs) of a model, refer to the documentation at :ref:`knob-types` for the full list of knob types. + +After implementing your model, it is highly recommended to use :meth:`rafiki.model.test_model_class` +to test your model. This method simulates a full train-inference flow on your model, ensuring that +it is likely to work on Rafiki. + + +Logging & Dataset Loading in Models +-------------------------------------------------------------------- + +:class:`rafiki.model.BaseModel` has a property ``utils`` that subclasses the model utility classes +:class:`rafiki.model.log.ModelLogUtils` and :class:`rafiki.model.dataset.ModelDatasetUtils`. They +help with model logging & dataset loading respectively. + +Refer to the sample usage in the implementation of `./examples/models/image_classification/TfSingleHiddenLayer.py `_. 
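+
+For instance, logging and dataset loading in a model's ``train`` method might look like this (a condensed sketch adapted from the `SkDt` example, not a full model):
+
+::
+
+    def train(self, dataset_uri):
+        dataset = self.utils.load_dataset_of_image_files(dataset_uri)
+        ...
+        self.utils.log('Train accuracy: {}'.format(accuracy))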
+ + Model Environment -------------------------------------------------------------------- @@ -33,14 +55,13 @@ Models should run at least run on CPU-only machines and optionally leverage on a Refer to the parameters of :meth:`rafiki.client.Client.create_model` for configuring how your model runs on Rafiki. -Testing Models +Sample Models -------------------------------------------------------------------- To illustrate how to write models on Rafiki, we have written the following: - Sample pre-processing logic to convert common dataset formats to Rafiki's own dataset formats in `./examples/datasets/ `_ - Sample models in `./examples/models/ `_ - - A method :meth:`rafiki.model.test_model_class` that simulates a full train-inference flow on any Rafiki model To start testing your model, first install the Python dependencies at ``rafiki/model/requirements.txt``: @@ -94,13 +115,3 @@ Example: Testing Models for ``POS_TAGGING`` python examples/models/pos_tagging/BigramHmm.py python examples/models/pos_tagging/PyBiLstm.py - - -Model Logging & Dataset Loading --------------------------------------------------------------------- - -:class:`rafiki.model.BaseModel` has a property ``utils`` that subclasses the model utility classes -:class:`rafiki.model.log.ModelLogUtils` and :class:`rafiki.model.dataset.ModelDatasetUtils`. They -help with model logging & dataset loading respectively. - -Refer to the sample usage in the implementation of `./examples/models/image_classification/TfSingleHiddenLayer.py `_. \ No newline at end of file From 42713b69ca2ace1a999721523f546926c195e8e1 Mon Sep 17 00:00:00 2001 From: Ngin Yun Chuan Date: Thu, 6 Dec 2018 20:03:39 +0800 Subject: [PATCH 07/22] Rework model logging implementation (stream logs into DB) & slightly tweak model logging API --- examples/models/image_classification/SkDt.py | 2 +- .../image_classification/TfFeedForward.py | 22 ++- examples/models/pos_tagging/BigramHmm.py | 2 +- examples/models/pos_tagging/PyBiLstm.py | 22 ++- rafiki/admin/admin.py | 9 +- rafiki/db/database.py | 29 +-- rafiki/db/schema.py | 14 +- rafiki/model/__init__.py | 2 +- rafiki/model/log.py | 170 +++++++++++++----- rafiki/model/model.py | 6 +- rafiki/worker/train.py | 76 ++++---- 11 files changed, 225 insertions(+), 129 deletions(-) diff --git a/examples/models/image_classification/SkDt.py b/examples/models/image_classification/SkDt.py index 984922d0..5263e921 100644 --- a/examples/models/image_classification/SkDt.py +++ b/examples/models/image_classification/SkDt.py @@ -39,7 +39,7 @@ def train(self, dataset_uri): # Compute train accuracy preds = self._clf.predict(X) accuracy = sum(y == preds) / len(y) - self.utils.log('Train accuracy: {}'.format(accuracy)) + self.logger.log('Train accuracy: {}'.format(accuracy)) def evaluate(self, dataset_uri): dataset = self.utils.load_dataset_of_image_files(dataset_uri) diff --git a/examples/models/image_classification/TfFeedForward.py b/examples/models/image_classification/TfFeedForward.py index b907c66b..238fdbd3 100644 --- a/examples/models/image_classification/TfFeedForward.py +++ b/examples/models/image_classification/TfFeedForward.py @@ -35,21 +35,24 @@ def __init__(self, **knobs): config = tf.ConfigProto() config.gpu_options.allow_growth = True self._sess = tf.Session(graph=self._graph, config=config) - self._define_plots() def train(self, dataset_uri): im_sz = self._knobs.get('image_size') bs = self._knobs.get('batch_size') ep = self._knobs.get('epochs') + self.logger.log('Available devices: {}'.format(str(device_lib.list_local_devices()))) + 
+ # Define 2 plots: Loss against time, loss against epochs + self.logger.define_loss_plot() + self.logger.define_plot('Loss Over Time', ['loss']) + dataset = self.utils.load_dataset_of_image_files(dataset_uri, image_size=[im_sz, im_sz]) num_classes = dataset.classes (images, classes) = zip(*[(image, image_class) for (image, image_class) in dataset]) images = np.asarray(images) classes = np.asarray(classes) - self.utils.log('Available devices: {}'.format(str(device_lib.list_local_devices()))) - with self._graph.as_default(): self._model = self._build_model(num_classes) with self._sess.as_default(): @@ -66,8 +69,8 @@ def train(self, dataset_uri): # Compute train accuracy (loss, accuracy) = self._model.evaluate(images, classes) - self.utils.log('Train loss: {}'.format(loss)) - self.utils.log('Train accuracy: {}'.format(accuracy)) + self.logger.log('Train loss: {}'.format(loss)) + self.logger.log('Train accuracy: {}'.format(accuracy)) def evaluate(self, dataset_uri): im_sz = self._knobs.get('image_size') @@ -80,7 +83,7 @@ def evaluate(self, dataset_uri): with self._graph.as_default(): with self._sess.as_default(): (loss, accuracy) = self._model.evaluate(images, classes) - self.utils.log('Test loss: {}'.format(loss)) + self.logger.log('Test loss: {}'.format(loss)) return accuracy @@ -134,12 +137,7 @@ def load_parameters(self, params): def _on_train_epoch_end(self, epoch, logs): loss = logs['loss'] - self.utils.log_loss_metric(loss, epoch) - - def _define_plots(self): - # Define 2 plots: Loss against time, loss against epochs - self.utils.define_loss_plot() - self.utils.define_plot('Loss Over Time', ['loss']) + self.logger.log_loss(loss, epoch) def _build_model(self, num_classes): units = self._knobs.get('hidden_layer_units') diff --git a/examples/models/pos_tagging/BigramHmm.py b/examples/models/pos_tagging/BigramHmm.py index b27efc12..15b2cfc0 100644 --- a/examples/models/pos_tagging/BigramHmm.py +++ b/examples/models/pos_tagging/BigramHmm.py @@ -30,7 +30,7 @@ def train(self, dataset_uri): (sents_tokens, sents_tags) = zip(*[zip(*sent) for sent in dataset]) self._num_tags = dataset.tag_num_classes[0] (self._trans_probs, self._emiss_probs) = self._compute_probs(self._num_tags, sents_tokens, sents_tags) - self.utils.log('No. of tags: {}'.format(self._num_tags)) + self.logger.log('No. of tags: {}'.format(self._num_tags)) def evaluate(self, dataset_uri): dataset = self.utils.load_dataset_of_corpus(dataset_uri) diff --git a/examples/models/pos_tagging/PyBiLstm.py b/examples/models/pos_tagging/PyBiLstm.py index 603fe9eb..cedc6997 100644 --- a/examples/models/pos_tagging/PyBiLstm.py +++ b/examples/models/pos_tagging/PyBiLstm.py @@ -35,21 +35,20 @@ def get_knob_config(): def __init__(self, **knobs): super().__init__(**knobs) self._knobs = knobs - self._define_plots() def train(self, dataset_uri): dataset = self.utils.load_dataset_of_corpus(dataset_uri) self._word_dict = self._extract_word_dict(dataset) self._tag_count = dataset.tag_num_classes[0] - self.utils.log('No. of unique words: {}'.format(len(self._word_dict))) - self.utils.log('No. of tags: {}'.format(self._tag_count)) + self.logger.log('No. of unique words: {}'.format(len(self._word_dict))) + self.logger.log('No. 
of tags: {}'.format(self._tag_count)) (self._net, self._optimizer) = self._train(dataset) sents_tags = self._predict(dataset) acc = self._compute_accuracy(dataset, sents_tags) - self.utils.log('Train accuracy: {}'.format(acc)) + self.logger.log('Train accuracy: {}'.format(acc)) def evaluate(self, dataset_uri): dataset = self.utils.load_dataset_of_corpus(dataset_uri) @@ -139,7 +138,7 @@ def _predict(self, dataset): Tensor = torch.LongTensor if torch.cuda.is_available(): - self.utils.log('Using CUDA...') + self.logger.log('Using CUDA...') net = net.cuda() Tensor = torch.cuda.LongTensor @@ -172,11 +171,15 @@ def _train(self, dataset): null_tag = self._tag_count # Tag to ignore (from padding of sentences during batching) B = math.ceil(len(dataset) / N) # No. of batches + # Define 2 plots: Loss against time, loss against epochs + self.logger.define_loss_plot() + self.logger.define_plot('Loss Over Time', ['loss']) + (net, optimizer) = self._create_model() Tensor = torch.LongTensor if torch.cuda.is_available(): - self.utils.log('Using CUDA...') + self.logger.log('Using CUDA...') net = net.cuda() Tensor = torch.cuda.LongTensor @@ -206,15 +209,10 @@ def _train(self, dataset): total_loss += loss.item() - self.utils.log_loss_metric(loss=(total_loss / B), epoch=epoch) + self.logger.log_loss(loss=(total_loss / B), epoch=epoch) return (net, optimizer) - def _define_plots(self): - # Define 2 plots: Loss against time, loss against epochs - self.utils.define_loss_plot() - self.utils.define_plot('Loss Over Time', ['loss']) - def _compute_accuracy(self, dataset, sents_tags): total = 0 correct = 0 diff --git a/rafiki/admin/admin.py b/rafiki/admin/admin.py index 0ee523cd..a9da05b4 100644 --- a/rafiki/admin/admin.py +++ b/rafiki/admin/admin.py @@ -7,8 +7,8 @@ from rafiki.db import Database from rafiki.constants import ServiceStatus, UserType, ServiceType, TrainJobStatus from rafiki.config import MIN_SERVICE_PORT, MAX_SERVICE_PORT, SUPERADMIN_EMAIL, SUPERADMIN_PASSWORD +from rafiki.model import ModelLogger from rafiki.container import DockerSwarmContainerManager -from rafiki.utils.log import JobLogger from .services_manager import ServicesManager @@ -256,10 +256,9 @@ def get_trial_logs(self, trial_id): if trial is None: raise InvalidTrialException() - job_logger = JobLogger() - job_logger.import_logs(trial.logs) - (plots, metrics, messages) = job_logger.read_logs() - job_logger.destroy() + trial_logs = self._db.get_trial_logs(trial_id) + log_lines = [x.line for x in trial_logs] + (messages, metrics, plots) = ModelLogger.parse_logs(log_lines) return { 'plots': plots, diff --git a/rafiki/db/database.py b/rafiki/db/database.py index 4d3b8103..5ec9c58a 100644 --- a/rafiki/db/database.py +++ b/rafiki/db/database.py @@ -7,7 +7,8 @@ TrialStatus, ServiceStatus, InferenceJobStatus from .schema import Base, TrainJob, TrainJobWorker, \ - InferenceJob, Trial, Model, User, Service, InferenceJobWorker + InferenceJob, Trial, Model, User, Service, InferenceJobWorker, \ + TrialLog class Database(object): def __init__(self, @@ -199,11 +200,6 @@ def get_inference_jobs_of_app(self, app): return inference_jobs - def get_workers_of_inference_job(self, inference_job_id): - workers = self._session.query(InferenceJobWorker) \ - .filter(InferenceJobWorker.inference_job_id == inference_job_id).all() - return workers - #################################### # Inference Job Workers #################################### @@ -335,10 +331,17 @@ def get_trial(self, id): return trial + def get_trial_logs(self, id): + trial_logs = 
self._session.query(TrialLog) \ + .filter(TrialLog.trial_id == id) \ + .all() + + return trial_logs + def get_best_trials_of_train_job(self, train_job_id, max_count=3): trials = self._session.query(Trial) \ .filter(Trial.train_job_id == train_job_id) \ - .filter(Trial.status == TrainJobStatus.COMPLETED) \ + .filter(Trial.status == TrialStatus.COMPLETED) \ .order_by(Trial.score.desc()) \ .limit(max_count).all() @@ -354,8 +357,7 @@ def get_trials_of_app(self, app): def get_trials_of_train_job(self, train_job_id): trials = self._session.query(Trial) \ - .join(TrainJob, Trial.train_job_id == TrainJob.id) \ - .filter(TrainJob.id == train_job_id) \ + .filter(Trial.train_job_id == train_job_id) \ .order_by(Trial.datetime_started.desc()).all() return trials @@ -366,15 +368,19 @@ def mark_trial_as_errored(self, trial): self._session.add(trial) return trial - def mark_trial_as_complete(self, trial, score, parameters, logs): + def mark_trial_as_complete(self, trial, score, parameters): trial.status = TrialStatus.COMPLETED trial.score = score trial.datetime_stopped = datetime.datetime.utcnow() trial.parameters = parameters - trial.logs = logs self._session.add(trial) return trial + def add_trial_log(self, trial, line, level): + trial_log = TrialLog(trial_id=trial.id, line=line, level=level) + self._session.add(trial_log) + return trial_log + def mark_trial_as_terminated(self, trial): trial.status = TrialStatus.TERMINATED trial.datetime_stopped = datetime.datetime.utcnow() @@ -418,5 +424,4 @@ def _make_connection_url(self, host, port, db, user, password): def _define_tables(self): Base.metadata.create_all(bind=self._engine) - diff --git a/rafiki/db/schema.py b/rafiki/db/schema.py index 2fe4fe3e..a8150222 100644 --- a/rafiki/db/schema.py +++ b/rafiki/db/schema.py @@ -64,7 +64,6 @@ class Service(Base): container_service_name = Column(String) container_service_id = Column(String) - class TrainJob(Base): __tablename__ = 'train_job' @@ -87,21 +86,28 @@ class TrainJobWorker(Base): train_job_id = Column(String, ForeignKey('train_job.id')) model_id = Column(String, ForeignKey('model.id'), nullable=False) - class Trial(Base): __tablename__ = 'trial' id = Column(String, primary_key=True, default=generate_uuid) knobs = Column(JSON, nullable=False) datetime_started = Column(DateTime, nullable=False, default=generate_datetime) - train_job_id = Column(String, ForeignKey('train_job.id'), nullable=False) + train_job_id = Column(String, ForeignKey('train_job.id'), nullable=False, index=True) model_id = Column(String, ForeignKey('model.id'), nullable=False) status = Column(String, nullable=False, default=TrialStatus.RUNNING) score = Column(Float, default=0) parameters = Column(Binary, default=None) - logs = Column(Binary, default=None) datetime_stopped = Column(DateTime, default=None) +class TrialLog(Base): + __tablename__ = 'trial_log' + + id = Column(String, primary_key=True, default=generate_uuid) + datetime = Column(DateTime, default=generate_datetime) + trial_id = Column(String, ForeignKey('trial.id'), nullable=False, index=True) + line = Column(String, nullable=False) + level = Column(String) + class User(Base): __tablename__ = 'user' diff --git a/rafiki/model/__init__.py b/rafiki/model/__init__.py index 68f59c24..79a2df30 100644 --- a/rafiki/model/__init__.py +++ b/rafiki/model/__init__.py @@ -1,6 +1,6 @@ from .model import BaseModel, test_model_class, load_model_class, \ parse_model_install_command, InvalidModelClassException, InvalidModelParamsException, \ ModelUtils -from .log import ModelLogUtilsLogger +from 
.log import LogType, ModelLogger from .knob import BaseKnob, CategoricalKnob, IntegerKnob, FloatKnob, \ serialize_knob_config, deserialize_knob_config \ No newline at end of file diff --git a/rafiki/model/log.py b/rafiki/model/log.py index 7912d0a7..6adcd66d 100644 --- a/rafiki/model/log.py +++ b/rafiki/model/log.py @@ -1,73 +1,161 @@ import os import traceback import datetime +import json +import logging -class DuplicatePlotException(Exception): pass +MODEL_LOG_DATETIME_FORMAT = '%Y-%m-%dT%H:%M:%S' -class ModelLogUtils(): - ''' - Collection of utility methods for logging and plotting of messages & metrics during training. - ''' - def __init__(self): - # Add logging to stdout for local debugging - self._logger = ModelLogUtilsLogger() +class LogType(): + PLOT = 'PLOT' + METRICS = 'METRICS' + MESSAGE = 'MESSAGE' - def set_logger(self, logger): - if not isinstance(logger, ModelLogUtilsLogger): - raise Exception('`logger` should subclass `ModelLogUtilsLogger`') - +class ModelLogger(): + def __init__(self): + # By default, set a logging handler to print to stdout (for debugging) + logger = logging.getLogger(__name__) + logger.setLevel(level=logging.INFO) + logger.addHandler(ModelLoggerDebugHandler()) self._logger = logger - def log(self, message): - ''' - Logs a message for analysis of model training. - ''' - self._logger.log(message) - def define_loss_plot(self): ''' Convenience method for defining a plot of ``loss`` against ``epoch``. - To be used with ``log_loss_metric()``. + To be used with :meth:`rafiki.model.ModelLogger.log_loss`. ''' self.define_plot('Loss Over Epochs', ['loss'], x_axis='epoch') - def log_loss_metric(self, loss, epoch): + def log_loss(self, loss, epoch): ''' Convenience method for logging `loss` against `epoch`. - To be used with ``define_loss_plot()``. + To be used with :meth:`rafiki.model.ModelLogger.define_loss_plot`. ''' - self.log_metrics(loss=loss, epoch=epoch) + self.log(loss=loss, epoch=epoch) def define_plot(self, title, metrics, x_axis=None): ''' Defines a plot for a set of metrics for analysis of model training. By default, metrics will be plotted against time. + + For example, a model's precision & recall logged with e.g. ``log(precision=0.1, recall=0.6, epoch=1)`` + can be visualized in the plots generated by + ``define_plot('Precision & Recall', metrics=['precision', 'recall'])`` (against time) or + ``define_plot('Precision & Recall', metrics=['precision', 'recall'], x_axis='epoch')`` (against epochs). + + Only call this method in :meth:`rafiki.model.BaseModel.train`. + + :param str title: Title of the plot + :param metrics: List of metrics that should be plotted on the y-axis + :type metrics: str[] + :param str x_axis: Metric that should be plotted on the x-axis, against all other metrics. Defaults to ``'time'``, which is automatically logged ''' - self._logger.define_plot(title, metrics, x_axis) + self._log(LogType.PLOT, { 'title': title, 'metrics': metrics, 'x_axis': x_axis }) - def log_metrics(self, **kwargs): + def log(self, msg='', **metrics): ''' - Logs metrics for a single point in time { <metric>: <value> }. - <value> should be a number. + Logs a message and/or a set of metrics at a single point in time. + + Logged messages will be viewable on Rafiki's administrative UI. + To visualize logged metrics on plots, a plot must be defined via :meth:`rafiki.model.ModelLogger.define_plot`. + + Only call this method in :meth:`rafiki.model.BaseModel.train` and :meth:`rafiki.model.BaseModel.evaluate`. 
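+
+        For example:
+
+        ::
+
+            self.logger.log('Train accuracy: {}'.format(accuracy))
+            self.logger.log(loss=0.1, epoch=1)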
+ + :param str msg: Message to be logged + :param metrics: Set of metrics & their values to be logged as { <metric>: <value> }, where <value> should be a number. + :type metrics: dict[str, int|float] ''' + if msg: + self._log(LogType.MESSAGE, { 'message': msg }) + + if metrics: + self._log(LogType.METRICS, metrics) + + # Sets the Python logger used internally. + # During model training, this method will be called by Rafiki to inject a Python logger + # to generate logs for an instance of model training. + def set_logger(self, logger): + self._logger = logger -class ModelLogUtilsLogger(): + def _log(self, log_type, log_dict={}): + log_dict['type'] = log_type + log_dict['time'] = datetime.datetime.now().strftime(MODEL_LOG_DATETIME_FORMAT) + log_line = json.dumps(log_dict) + self._logger.info(log_line) + + # Parses a logged line into a dictionary. + @staticmethod + def parse_log_line(log_line): + try: + return json.loads(log_line) + except ValueError: + return {} + + # Parses logs into (messages, metrics, plots) for visualization. + @staticmethod + def parse_logs(log_lines): + plots = [] + metrics = [] + messages = [] + + for log_line in log_lines: + log_dict = ModelLogger.parse_log_line(log_line) + + if 'time' not in log_dict or 'type' not in log_dict: + continue + + log_datetime = log_dict['time'] + log_type = log_dict['type'] + del log_dict['time'] + del log_dict['type'] + + if log_type == LogType.MESSAGE: + messages.append({ + 'time': log_datetime, + 'message': log_dict.get('message') + }) + + elif log_type == LogType.METRICS: + metrics.append({ + 'time': log_datetime, + **log_dict + }) + + elif log_type == LogType.PLOT: + plots.append({ + **log_dict + }) + + return (messages, metrics, plots) +class ModelLoggerDebugHandler(logging.Handler): def __init__(self): - self._plots = set() - - def log(self, message): - self._print(message) + logging.Handler.__init__(self) + + def emit(self, record): + log_line = record.msg + log_dict = ModelLogger.parse_log_line(log_line) + log_type = log_dict.get('type') - def define_plot(self, title, metrics, x_axis): - if title in self._plots: - raise DuplicatePlotException('Plot {} already defined'.format(title)) - self._plots.add(title) - self._print('Plot with title `{}` of {} against {} will be registered when this model is being trained on Rafiki' \ - .format(title, ', '.join(metrics), x_axis or 'time')) + if log_type == LogType.PLOT: - def log_metrics(self, **kwargs): - self._print(', '.join(['{}={}'.format(metric, value) for (metric, value) in kwargs.items()])) + title = log_dict.get('title') + metrics = log_dict.get('metrics') + x_axis = log_dict.get('x_axis') + self._print('Plot `{}` of {} against {} will be registered when this model is being trained on Rafiki' \ + .format(title, ', '.join(metrics), x_axis or 'time')) + + elif log_type == LogType.METRICS: + metrics_log = ', '.join(['{}={}'.format(metric, value) for (metric, value) in log_dict.items()]) + self._print('Metric(s) logged: {}'.format(metrics_log)) + + elif log_type == LogType.MESSAGE: + msg = log_dict.get('message') + self._print(msg) + + else: + self._print(log_line) + def _print(self, message): - print(message) \ No newline at end of file + print('[{}]'.format(__name__), message) diff --git a/rafiki/model/model.py b/rafiki/model/model.py index e46cb818..2b9fb3d6 100644 --- a/rafiki/model/model.py +++ b/rafiki/model/model.py @@ -12,16 +12,15 @@ from rafiki.constants import TaskType, ModelDependency from .dataset import ModelDatasetUtils -from .log import ModelLogUtils +from 
.log import ModelLogger from .knob import BaseKnob, serialize_knob_config, deserialize_knob_config class InvalidModelClassException(Exception): pass class InvalidModelParamsException(Exception): pass -class ModelUtils(ModelDatasetUtils, ModelLogUtils): +class ModelUtils(ModelDatasetUtils): def __init__(self): ModelDatasetUtils.__init__(self) - ModelLogUtils.__init__(self) class BaseModel(abc.ABC): ''' @@ -49,6 +48,7 @@ def __init__(self, **knobs): :type knobs: dict[str, any] ''' def __init__(self, **knobs): + self.logger = ModelLogger() self.utils = ModelUtils() @staticmethod diff --git a/rafiki/worker/train.py b/rafiki/worker/train.py index b287af6b..3894e90c 100644 --- a/rafiki/worker/train.py +++ b/rafiki/worker/train.py @@ -7,9 +7,7 @@ from rafiki.config import SUPERADMIN_EMAIL, SUPERADMIN_PASSWORD from rafiki.constants import TrainJobStatus, TrialStatus, BudgetType -from rafiki.model import load_model_class, serialize_knob_config -from rafiki.utils.log import JobLogger -from rafiki.model import ModelLogUtilsLogger +from rafiki.model import load_model_class, serialize_knob_config, LogType from rafiki.db import Database from rafiki.client import Client @@ -77,9 +75,8 @@ def start(self): logger.info('Received proposal of knobs from advisor:') logger.info(pprint.pformat(knobs)) logger.info('Creating new trial in DB...') - trial = self._create_new_trial(model_id, train_job_id, knobs) - self._trial_id = trial.id - logger.info('Created trial of ID "{}" in DB'.format(trial.id)) + self._trial_id = self._create_new_trial(model_id, train_job_id, knobs) + logger.info('Created trial of ID "{}" in DB'.format(self._trial_id)) # Don't keep DB connection while training model self._db.disconnect() @@ -89,14 +86,20 @@ def start(self): try: logger.info('Starting trial...') logger.info('Training & evaluating model...') - (score, parameters, logs) = self._train_and_evaluate_model(clazz, knobs, train_dataset_uri, - test_dataset_uri) + + def handle_log(log_line, log_lvl): + with self._db: + trial = self._db.get_trial(self._trial_id) + self._db.add_trial_log(trial, log_line, log_lvl) + + (score, parameters) = self._train_and_evaluate_model(clazz, knobs, train_dataset_uri, + test_dataset_uri, handle_log) logger.info('Trial score: {}'.format(score)) with self._db: logger.info('Marking trial as complete in DB...') trial = self._db.get_trial(self._trial_id) - self._db.mark_trial_as_complete(trial, score, parameters, logs) + self._db.mark_trial_as_complete(trial, score, parameters) self._trial_id = None except Exception: @@ -131,14 +134,22 @@ def stop(self): logger.error('Error marking trial as terminated:') logger.error(traceback.format_exc()) - def _train_and_evaluate_model(self, clazz, knobs, train_dataset_uri, - test_dataset_uri): + def _train_and_evaluate_model(self, clazz, knobs, train_dataset_uri, \ + test_dataset_uri, handle_log): + # Initialize model model_inst = clazz(**knobs) - # Insert model training logger - model_logger = TrainModelLogUtilsLogger() - model_inst.utils.set_logger(model_logger) + # Add logs handlers for trial, including adding handler to root logger + # to handle logs emitted during model training with level above INFO + log_handler = ModelLoggerHandler(handle_log) + root_logger = logging.getLogger() + root_logger.addHandler(log_handler) + logger = logging.getLogger('{}.trial'.format(__name__)) + logger.setLevel(logging.INFO) + logger.propagate = False # Avoid duplicate logs in root logger + logger.addHandler(log_handler) + model_inst.logger.set_logger(logger) # Train model 
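        # Each line the model logs via `self.logger` in `train()` and
        # `evaluate()` is emitted through `log_handler`, which passes it to
        # `handle_log` to be saved as a `TrialLog` row in the DB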
model_inst.train(train_dataset_uri) @@ -146,16 +157,15 @@ def _train_and_evaluate_model(self, clazz, knobs, train_dataset_uri, # Evaluate model score = model_inst.evaluate(test_dataset_uri) + # Remove log handler for trial + root_logger.removeHandler(log_handler) + # Dump and pickle model parameters parameters = model_inst.dump_parameters() parameters = pickle.dumps(parameters) model_inst.destroy() - # Export model logs - logs = model_logger.export_logs() - model_logger.destroy() - - return (score, parameters, logs) + return (score, parameters) # Creates a new trial in the DB def _create_new_trial(self, model_id, train_job_id, knobs): @@ -165,7 +175,7 @@ def _create_new_trial(self, model_id, train_job_id, knobs): knobs=knobs ) self._db.commit() - return trial + return trial.id # Gets proposal of a set of knob values from advisor def _get_proposal_from_advisor(self, advisor_id): @@ -252,21 +262,13 @@ def _make_client(self): client.login(email=superadmin_email, password=superadmin_password) return client -class TrainModelLogUtilsLogger(ModelLogUtilsLogger): - def __init__(self): - self._job_logger = JobLogger() - - def log(self, message): - return self._job_logger.log(message) - - def define_plot(self, title, metrics, x_axis): - return self._job_logger.define_plot(title, metrics, x_axis) - - def log_metrics(self, **kwargs): - return self._job_logger.log_metrics(**kwargs) - - def export_logs(self): - return self._job_logger.export_logs() +class ModelLoggerHandler(logging.Handler): + def __init__(self, handle_log): + logging.Handler.__init__(self) + self._handle_log = handle_log - def destroy(self): - return self._job_logger.destroy() \ No newline at end of file + def emit(self, record): + log_line = record.msg + log_lvl = record.levelname + self._handle_log(log_line, log_lvl) + \ No newline at end of file From 7bdccf1980058354257bc83bec41188e28735e1a Mon Sep 17 00:00:00 2001 From: Ngin Yun Chuan Date: Thu, 6 Dec 2018 20:03:52 +0800 Subject: [PATCH 08/22] Remove unused `JobLogger` --- rafiki/utils/log.py | 226 -------------------------------------------- 1 file changed, 226 deletions(-) diff --git a/rafiki/utils/log.py b/rafiki/utils/log.py index d5bc45bb..bb2aeb6c 100644 --- a/rafiki/utils/log.py +++ b/rafiki/utils/log.py @@ -10,235 +10,9 @@ logger = logging.getLogger(__name__) -JOB_LOGGER_DATETIME_FORMAT = '%Y-%m-%dT%H:%M:%S' - def configure_logging(process_name): # Configure all logging to a log file logs_folder_path = LOGS_FOLDER_PATH logging.basicConfig(level=logging.INFO, format='%(asctime)s %(levelname)s %(message)s', filename='{}/{}.log'.format(logs_folder_path, process_name)) - -class JobLogger(): - def __init__(self): - self._log_file = tempfile.NamedTemporaryFile(delete=False, mode='w+', encoding='utf-8') - - def define_plot(self, title, metrics, x_axis): - self._log_line(has_time=False, type='PLOT', title=title, metrics=metrics, x_axis=x_axis) - - def log(self, message): - self._log_line(type='MESSAGE', message=message) - - def log_metrics(self, **kwargs): - self._log_line(type='METRICS', **kwargs) - - # Clears all logs (excluding plot definitions) before a specific time - def clear_logs(self, datetime_before=None): - if datetime_before is None: - datetime_before = datetime.now() - - self._log_file.seek(0) - new_log_file = tempfile.NamedTemporaryFile(delete=False, mode='w+', encoding='utf-8') - - # Only copy over lines in new file that are not before `datetime_before` - for line in self._log_file: - (log_datetime, _) = self._parse_line(line) - log_datetime = 
datetime.strptime(log_datetime, JOB_LOGGER_DATETIME_FORMAT) \ - if log_datetime is not None else None - if log_datetime is None or log_datetime >= datetime_before: - new_log_file.write(line) - - # Switch to new log file - self._log_file.close() - os.remove(self._log_file.name) - self._log_file = new_log_file - - # Read all logs as bytes - def export_logs(self): - self._log_file.seek(0) - logs_bytes = self._log_file.read().encode('utf-8') - return logs_bytes - - def destroy(self): - # Remove temporary internal log file - self._log_file.close() - os.remove(self._log_file.name) - - # Import and completely replace all logs - def import_logs(self, logs_bytes): - if logs_bytes is None: return - self._log_file.seek(0) - self._log_file.write(logs_bytes.decode('utf-8')) - self._log_file.truncate() - - ''' - Read logs as (plots, metrics, messages) - - plots: Plot[] - Plot: { title, metrics, x_axis } - metrics: Metric[] - Metric: { time: Datetime, [name]: [value]} - messages: { time: Datetime, message: string }[] - Datetime: string (%Y-%m-%dT%H:%M:%S) - ''' - def read_logs(self): - self._log_file.seek(0) - - plots = [] - metrics = [] - messages = [] - for line in self._log_file: - (log_datetime, log_dict) = self._parse_line(line) - - if 'type' not in log_dict: - continue - - log_type = log_dict['type'] - del log_dict['type'] - - if log_type == 'MESSAGE': - messages.append({ - 'time': log_datetime, - 'message': log_dict.get('message') - }) - - elif log_type == 'METRICS': - metrics.append({ - 'time': log_datetime, - **log_dict - }) - - elif log_type == 'PLOT': - plots.append({ - **log_dict - }) - - return (plots, metrics, messages) - - # Logs dictionary to temporary internal log file in JSON as line, appending current time - def _log_line(self, has_time=True, **kwargs): - if has_time: - kwargs['time'] = datetime.now().strftime(JOB_LOGGER_DATETIME_FORMAT) - self._log_file.write('{}\n'.format(json.dumps(kwargs))) - - # Parses a log line as (log_datetime, log_dict) - def _parse_line(self, line): - log = None - try: - log = json.loads(line) - except: - logger.warn('Error while reading line in log: "{}"'.format(line)) - logger.warn(traceback.format_exc()) - return (None, {}) - - log_datetime = None - if 'time' in log: - log_datetime = log['time'] - del log['time'] - - return (log_datetime, log) - -def _test_job_logger_for_train_worker(): - l = JobLogger() - l.define_plot('Model Loss', ['loss'], None) - - # Model is being trained - l.log('START') - time.sleep(1) - l.log_metrics(loss=3.42, learning_rate=0.01) - time.sleep(1) - l.log_metrics(loss=3.21, learning_rate=0.01) - time.sleep(1) - l.log_metrics(loss=3.11) - l.log('END') - - # At the end of training, logs are exported and saved - logs_bytes = l.export_logs() - assert isinstance(logs_bytes, bytes) - l.destroy() - - # App developer checks on logs - l2 = JobLogger() - l2.import_logs(logs_bytes) - (plots, metrics, messages) = l2.read_logs() - l2.destroy() - - assert len(plots) == 1 - assert plots[0] == { 'title': 'Model Loss', 'metrics': ['loss'], 'x_axis': None } - assert len(metrics) == 3 - assert [x.get('loss') for x in metrics] == [3.42, 3.21, 3.11] - assert [x.get('learning_rate') for x in metrics] == [0.01, 0.01, None] - assert [isinstance(x.get('time'), str) for x in metrics] == [True, True, True] - assert [x.get('message') for x in messages] == ['START', 'END'] - -def _test_job_logger_for_predictor(): - l = JobLogger() - l.define_plot('Queries', ['queries'], None) - - l.log('UP') - - # Predictor receives queries - time.sleep(1) - queries = 0 - 
while queries < 3: - l.log_metrics(query=True) - queries += 1 - time.sleep(2) - while queries < 23: - l.log_metrics(query=True) - queries += 1 - time.sleep(1) - - # Predictor's logs are exported and cleared periodically - logs_bytes = l.export_logs() - l.clear_logs() - - # App developer checks on this period's logs - l2 = JobLogger() - l2.import_logs(logs_bytes) - (plots, metrics, messages) = l2.read_logs() - l2.destroy() - - assert len(plots) == 1 - assert len(metrics) == 23 - assert metrics[4].get('query') == True - assert [x.get('message') for x in messages] == ['UP'] - - # Predictor receives more queries - time.sleep(1) - queries = 0 - while queries < 40: - l.log_metrics(query=True) - queries += 1 - time.sleep(2) - while queries < 43: - l.log_metrics(query=True) - queries += 1 - - l.log('KILLED') - l.log('DOWN') - - # Predictor's logs are exported and cleared periodically - logs_bytes = l.export_logs() - l.clear_logs() - l.destroy() - - # App developer checks on this period's logs - l2 = JobLogger() - l2.import_logs(logs_bytes) - (plots, metrics, messages) = l2.read_logs() - l2.destroy() - - assert len(plots) == 1 - assert len(metrics) == 43 - assert metrics[8].get('query') == True - assert isinstance(metrics[9].get('time'), str) is True - assert [x.get('message') for x in messages] == ['KILLED', 'DOWN'] - -if __name__ == '__main__': - print('Testing `JobLogger` for train worker...') - _test_job_logger_for_train_worker() - print('Testing `JobLogger` for predictor...') - _test_job_logger_for_predictor() - print('All tests pass!') - \ No newline at end of file From 3d658335b42255c6e730c798161852d6c00fa3ce Mon Sep 17 00:00:00 2001 From: Ngin Yun Chuan Date: Thu, 6 Dec 2018 20:04:07 +0800 Subject: [PATCH 09/22] Update docs on model logging --- docs/src/python/rafiki.model.rst | 2 +- docs/src/user/creating-models.rst | 16 +++++++++++----- 2 files changed, 12 insertions(+), 6 deletions(-) diff --git a/docs/src/python/rafiki.model.rst b/docs/src/python/rafiki.model.rst index cd5ce407..87f3d85f 100644 --- a/docs/src/python/rafiki.model.rst +++ b/docs/src/python/rafiki.model.rst @@ -33,7 +33,7 @@ Utility Classes & Methods .. automethod:: rafiki.model.test_model_class -.. autoclass:: rafiki.model.log.ModelLogUtils +.. autoclass:: rafiki.model.ModelLogger :members: .. autoclass:: rafiki.model.dataset.ModelDatasetUtils diff --git a/docs/src/user/creating-models.rst b/docs/src/user/creating-models.rst index 6594b760..0e2bcd7b 100644 --- a/docs/src/user/creating-models.rst +++ b/docs/src/user/creating-models.rst @@ -22,17 +22,23 @@ After implementing your model, it is highly recommended to use :meth:`rafiki.mod to test your model. This method simulates a full train-inference flow on your model, ensuring that it is likely to work on Rafiki. +Logging in Models +-------------------------------------------------------------------- + +:class:`rafiki.model.BaseModel` has an attribute ``logger`` that is of the class :class:`rafiki.model.ModelLogger`. +It allows you to log messages and metrics while your model is being trained, and you can +define plots to visualize your model's training on Rafiki's Admin Web interface. -Logging & Dataset Loading in Models +.. seealso:: :ref:`using-admin-web` + +Dataset Loading in Models -------------------------------------------------------------------- -:class:`rafiki.model.BaseModel` has a property ``utils`` that subclasses the model utility classes -:class:`rafiki.model.log.ModelLogUtils` and :class:`rafiki.model.dataset.ModelDatasetUtils`. 
They -help with model logging & dataset loading respectively. +:class:`rafiki.model.BaseModel` has an attribute ``utils`` that subclasses the model utility class +:class:`rafiki.model.dataset.ModelDatasetUtils`. It helps with dataset loading. Refer to the sample usage in the implementation of `./examples/models/image_classification/TfSingleHiddenLayer.py `_. - Model Environment -------------------------------------------------------------------- From 8807408d343b2b587a40a669d54d372e7df2cbe6 Mon Sep 17 00:00:00 2001 From: Ngin Yun Chuan Date: Thu, 6 Dec 2018 20:06:45 +0800 Subject: [PATCH 10/22] Remove "failing install command" TODO that seems to be fixed --- scripts/start_worker.py | 1 - 1 file changed, 1 deletion(-) diff --git a/scripts/start_worker.py b/scripts/start_worker.py index c6560ef6..2b400139 100644 --- a/scripts/start_worker.py +++ b/scripts/start_worker.py @@ -12,7 +12,6 @@ def start_service(service_id, service_type): install_command = os.environ.get('WORKER_INSTALL_COMMAND', '') exit_code = os.system(install_command) if exit_code != 0: - # TODO: Fix failing install command for `pip install torch==0.4.1;`` raise Exception('Install command gave non-zero exit code: "{}"'.format(install_command)) if service_type == ServiceType.TRAIN: From 08bbdfd33a2e393ae2db9dab0e63ac2b8386aa12 Mon Sep 17 00:00:00 2001 From: Ngin Yun Chuan Date: Fri, 7 Dec 2018 13:05:55 +0800 Subject: [PATCH 11/22] Add error throwing when deprecated methods are called in model --- rafiki/model/model.py | 13 +++++++++++++ 1 file changed, 13 insertions(+) diff --git a/rafiki/model/model.py b/rafiki/model/model.py index 2b9fb3d6..f2be1aec 100644 --- a/rafiki/model/model.py +++ b/rafiki/model/model.py @@ -305,6 +305,19 @@ def _check_model_inst(model_inst): if getattr(model_inst, 'utils', None) is None: raise Exception('`super().__init__(**knobs)` should be called as the first line of the model\'s `__init__` method.') + # Throw error when deprecated methods are called + def deprecated_func(desc): + def throw_error(*args, **kwargs): + raise AttributeError(desc) + + return throw_error + + model_inst.utils.log = deprecated_func('`self.utils.log(...)` has been changed to `self.logger.log(...)`') + model_inst.utils.log_metrics = deprecated_func('`self.utils.log_metrics(...)` has been changed to `self.logger.log(...)`') + model_inst.utils.define_plot = deprecated_func('`self.utils.define_plot(...)` has been renamed to `self.logger.define_plot(...)`') + model_inst.utils.define_loss_plot = deprecated_func('`self.utils.define_loss_plot(...)` has been renamed to `self.logger.define_loss_plot(...)`') + model_inst.utils.log_loss_metric = deprecated_func('`self.utils.log_loss_metric(...)` has been renamed to `self.logger.log_loss(...)`') + def _check_knob_config(knob_config): if not isinstance(knob_config, dict) or \ any([(not isinstance(name, str) or not isinstance(knob, BaseKnob)) for (name, knob) in knob_config.items()]): From 89165ac64cab77ff00e60a4f6fd23bcb0ae5206c Mon Sep 17 00:00:00 2001 From: Ngin Yun Chuan Date: Fri, 7 Dec 2018 23:39:20 +0800 Subject: [PATCH 12/22] Move logging & dataset loading functionality from instance attributes to separate Python imports --- examples/models/image_classification/SkDt.py | 8 ++--- examples/models/image_classification/SkSvm.py | 6 ++-- .../image_classification/TfFeedForward.py | 22 ++++++------- .../models/image_classification/TfVgg16.py | 8 ++--- examples/models/pos_tagging/BigramHmm.py | 8 ++--- examples/models/pos_tagging/PyBiLstm.py | 22 ++++++------- rafiki/model/__init__.py | 8 
++--- rafiki/model/dataset.py | 1 + rafiki/model/log.py | 2 ++ rafiki/model/model.py | 33 ++++++++----------- rafiki/worker/train.py | 12 +++---- 11 files changed, 64 insertions(+), 66 deletions(-) diff --git a/examples/models/image_classification/SkDt.py b/examples/models/image_classification/SkDt.py index 5263e921..cd1ffe4d 100644 --- a/examples/models/image_classification/SkDt.py +++ b/examples/models/image_classification/SkDt.py @@ -7,7 +7,7 @@ from rafiki.config import APP_MODE from rafiki.model import BaseModel, InvalidModelParamsException, test_model_class, \ - IntegerKnob, CategoricalKnob + IntegerKnob, CategoricalKnob, dataset_utils, logger from rafiki.constants import TaskType, ModelDependency class SkDt(BaseModel): @@ -30,7 +30,7 @@ def __init__(self, **knobs): ) def train(self, dataset_uri): - dataset = self.utils.load_dataset_of_image_files(dataset_uri) + dataset = dataset_utils.load_dataset_of_image_files(dataset_uri) (images, classes) = zip(*[(image, image_class) for (image, image_class) in dataset]) X = self._prepare_X(images) y = classes @@ -39,10 +39,10 @@ def train(self, dataset_uri): # Compute train accuracy preds = self._clf.predict(X) accuracy = sum(y == preds) / len(y) - self.logger.log('Train accuracy: {}'.format(accuracy)) + logger.log('Train accuracy: {}'.format(accuracy)) def evaluate(self, dataset_uri): - dataset = self.utils.load_dataset_of_image_files(dataset_uri) + dataset = dataset_utils.load_dataset_of_image_files(dataset_uri) (images, classes) = zip(*[(image, image_class) for (image, image_class) in dataset]) X = self._prepare_X(images) y = classes diff --git a/examples/models/image_classification/SkSvm.py b/examples/models/image_classification/SkSvm.py index 35b4b761..894e2565 100644 --- a/examples/models/image_classification/SkSvm.py +++ b/examples/models/image_classification/SkSvm.py @@ -7,7 +7,7 @@ from rafiki.config import APP_MODE from rafiki.model import BaseModel, InvalidModelParamsException, test_model_class, \ - IntegerKnob, CategoricalKnob, FloatKnob + IntegerKnob, CategoricalKnob, FloatKnob, dataset_utils from rafiki.constants import TaskType, ModelDependency class SkSvm(BaseModel): @@ -34,14 +34,14 @@ def __init__(self, **knobs): ) def train(self, dataset_uri): - dataset = self.utils.load_dataset_of_image_files(dataset_uri) + dataset = dataset_utils.load_dataset_of_image_files(dataset_uri) (images, classes) = zip(*[(image, image_class) for (image, image_class) in dataset]) X = self._prepare_X(images) y = classes self._clf.fit(X, y) def evaluate(self, dataset_uri): - dataset = self.utils.load_dataset_of_image_files(dataset_uri) + dataset = dataset_utils.load_dataset_of_image_files(dataset_uri) (images, classes) = zip(*[(image, image_class) for (image, image_class) in dataset]) X = self._prepare_X(images) y = classes diff --git a/examples/models/image_classification/TfFeedForward.py b/examples/models/image_classification/TfFeedForward.py index 238fdbd3..fea27cb3 100644 --- a/examples/models/image_classification/TfFeedForward.py +++ b/examples/models/image_classification/TfFeedForward.py @@ -9,7 +9,7 @@ from rafiki.config import APP_MODE from rafiki.model import BaseModel, InvalidModelParamsException, test_model_class, \ - IntegerKnob, CategoricalKnob, FloatKnob + IntegerKnob, CategoricalKnob, FloatKnob, dataset_utils, logger from rafiki.constants import TaskType, ModelDependency class TfFeedForward(BaseModel): @@ -41,13 +41,13 @@ def train(self, dataset_uri): bs = self._knobs.get('batch_size') ep = self._knobs.get('epochs') - 
self.logger.log('Available devices: {}'.format(str(device_lib.list_local_devices()))) + logger.log('Available devices: {}'.format(str(device_lib.list_local_devices()))) # Define 2 plots: Loss against time, loss against epochs - self.logger.define_loss_plot() - self.logger.define_plot('Loss Over Time', ['loss']) + logger.define_loss_plot() + logger.define_plot('Loss Over Time', ['loss']) - dataset = self.utils.load_dataset_of_image_files(dataset_uri, image_size=[im_sz, im_sz]) + dataset = dataset_utils.load_dataset_of_image_files(dataset_uri, image_size=[im_sz, im_sz]) num_classes = dataset.classes (images, classes) = zip(*[(image, image_class) for (image, image_class) in dataset]) images = np.asarray(images) @@ -69,13 +69,13 @@ def train(self, dataset_uri): # Compute train accuracy (loss, accuracy) = self._model.evaluate(images, classes) - self.logger.log('Train loss: {}'.format(loss)) - self.logger.log('Train accuracy: {}'.format(accuracy)) + logger.log('Train loss: {}'.format(loss)) + logger.log('Train accuracy: {}'.format(accuracy)) def evaluate(self, dataset_uri): im_sz = self._knobs.get('image_size') - dataset = self.utils.load_dataset_of_image_files(dataset_uri, image_size=[im_sz, im_sz]) + dataset = dataset_utils.load_dataset_of_image_files(dataset_uri, image_size=[im_sz, im_sz]) (images, classes) = zip(*[(image, image_class) for (image, image_class) in dataset]) images = np.asarray(images) classes = np.asarray(classes) @@ -83,14 +83,14 @@ def evaluate(self, dataset_uri): with self._graph.as_default(): with self._sess.as_default(): (loss, accuracy) = self._model.evaluate(images, classes) - self.logger.log('Test loss: {}'.format(loss)) + logger.log('Test loss: {}'.format(loss)) return accuracy def predict(self, queries): im_sz = self._knobs.get('image_size') - X = self.utils.resize_as_images(queries, image_size=[im_sz, im_sz]) + X = dataset_utils.resize_as_images(queries, image_size=[im_sz, im_sz]) with self._graph.as_default(): with self._sess.as_default(): probs = self._model.predict(X) @@ -137,7 +137,7 @@ def load_parameters(self, params): def _on_train_epoch_end(self, epoch, logs): loss = logs['loss'] - self.logger.log_loss(loss, epoch) + logger.log_loss(loss, epoch) def _build_model(self, num_classes): units = self._knobs.get('hidden_layer_units') diff --git a/examples/models/image_classification/TfVgg16.py b/examples/models/image_classification/TfVgg16.py index 9a88bad1..50d36239 100644 --- a/examples/models/image_classification/TfVgg16.py +++ b/examples/models/image_classification/TfVgg16.py @@ -9,7 +9,7 @@ from urllib.parse import urlparse, parse_qs from rafiki.model import BaseModel, InvalidModelParamsException, test_model_class, \ - IntegerKnob, FloatKnob, CategoricalKnob + IntegerKnob, FloatKnob, CategoricalKnob, dataset_utils from rafiki.constants import TaskType, ModelDependency from rafiki.config import APP_MODE @@ -37,7 +37,7 @@ def train(self, dataset_uri): ep = self._knobs.get('epochs') bs = self._knobs.get('batch_size') - dataset = self.utils.load_dataset_of_image_files(dataset_uri, image_size=[48, 48]) + dataset = dataset_utils.load_dataset_of_image_files(dataset_uri, image_size=[48, 48]) num_classes = dataset.classes (images, classes) = zip(*[(image, image_class) for (image, image_class) in dataset]) images = np.asarray(images) @@ -55,7 +55,7 @@ def train(self, dataset_uri): ) def evaluate(self, dataset_uri): - dataset = self.utils.load_dataset_of_image_files(dataset_uri, image_size=[48, 48]) + dataset = dataset_utils.load_dataset_of_image_files(dataset_uri, 
image_size=[48, 48]) (images, classes) = zip(*[(image, image_class) for (image, image_class) in dataset]) images = np.asarray(images) images = np.stack([images] * 3, axis=-1) @@ -67,7 +67,7 @@ def evaluate(self, dataset_uri): return accuracy def predict(self, queries): - images = self.utils.resize_as_images(queries, image_size=[48, 48]) + images = dataset_utils.resize_as_images(queries, image_size=[48, 48]) images = np.stack([images] * 3, axis=-1) with self._graph.as_default(): with self._sess.as_default(): diff --git a/examples/models/pos_tagging/BigramHmm.py b/examples/models/pos_tagging/BigramHmm.py index 15b2cfc0..c1a31fcf 100644 --- a/examples/models/pos_tagging/BigramHmm.py +++ b/examples/models/pos_tagging/BigramHmm.py @@ -8,7 +8,7 @@ import pprint import json -from rafiki.model import BaseModel, InvalidModelParamsException, test_model_class +from rafiki.model import BaseModel, InvalidModelParamsException, test_model_class, logger, dataset_utils from rafiki.constants import TaskType # Min numeric value @@ -26,14 +26,14 @@ def __init__(self, **knobs): super().__init__(**knobs) def train(self, dataset_uri): - dataset = self.utils.load_dataset_of_corpus(dataset_uri) + dataset = dataset_utils.load_dataset_of_corpus(dataset_uri) (sents_tokens, sents_tags) = zip(*[zip(*sent) for sent in dataset]) self._num_tags = dataset.tag_num_classes[0] (self._trans_probs, self._emiss_probs) = self._compute_probs(self._num_tags, sents_tokens, sents_tags) - self.logger.log('No. of tags: {}'.format(self._num_tags)) + logger.log('No. of tags: {}'.format(self._num_tags)) def evaluate(self, dataset_uri): - dataset = self.utils.load_dataset_of_corpus(dataset_uri) + dataset = dataset_utils.load_dataset_of_corpus(dataset_uri) (sents_tokens, sents_tags) = zip(*[zip(*sent) for sent in dataset]) (sents_pred_tags) = self._tag_sents(self._num_tags, sents_tokens, self._trans_probs, self._emiss_probs) acc = self._compute_accuracy(sents_tags, sents_pred_tags) diff --git a/examples/models/pos_tagging/PyBiLstm.py b/examples/models/pos_tagging/PyBiLstm.py index cedc6997..a998aa65 100644 --- a/examples/models/pos_tagging/PyBiLstm.py +++ b/examples/models/pos_tagging/PyBiLstm.py @@ -13,7 +13,7 @@ from torch.utils.data.dataset import Dataset from rafiki.model import BaseModel, InvalidModelParamsException, test_model_class, \ - IntegerKnob, FloatKnob, CategoricalKnob + IntegerKnob, FloatKnob, CategoricalKnob, logger, dataset_utils from rafiki.constants import TaskType, ModelDependency from rafiki.config import APP_MODE @@ -37,21 +37,21 @@ def __init__(self, **knobs): self._knobs = knobs def train(self, dataset_uri): - dataset = self.utils.load_dataset_of_corpus(dataset_uri) + dataset = dataset_utils.load_dataset_of_corpus(dataset_uri) self._word_dict = self._extract_word_dict(dataset) self._tag_count = dataset.tag_num_classes[0] - self.logger.log('No. of unique words: {}'.format(len(self._word_dict))) - self.logger.log('No. of tags: {}'.format(self._tag_count)) + logger.log('No. of unique words: {}'.format(len(self._word_dict))) + logger.log('No. 
of tags: {}'.format(self._tag_count)) (self._net, self._optimizer) = self._train(dataset) sents_tags = self._predict(dataset) acc = self._compute_accuracy(dataset, sents_tags) - self.logger.log('Train accuracy: {}'.format(acc)) + logger.log('Train accuracy: {}'.format(acc)) def evaluate(self, dataset_uri): - dataset = self.utils.load_dataset_of_corpus(dataset_uri) + dataset = dataset_utils.load_dataset_of_corpus(dataset_uri) sents_tags = self._predict(dataset) acc = self._compute_accuracy(dataset, sents_tags) return acc @@ -138,7 +138,7 @@ def _predict(self, dataset): Tensor = torch.LongTensor if torch.cuda.is_available(): - self.logger.log('Using CUDA...') + logger.log('Using CUDA...') net = net.cuda() Tensor = torch.cuda.LongTensor @@ -172,14 +172,14 @@ def _train(self, dataset): B = math.ceil(len(dataset) / N) # No. of batches # Define 2 plots: Loss against time, loss against epochs - self.logger.define_loss_plot() - self.logger.define_plot('Loss Over Time', ['loss']) + logger.define_loss_plot() + logger.define_plot('Loss Over Time', ['loss']) (net, optimizer) = self._create_model() Tensor = torch.LongTensor if torch.cuda.is_available(): - self.logger.log('Using CUDA...') + logger.log('Using CUDA...') net = net.cuda() Tensor = torch.cuda.LongTensor @@ -209,7 +209,7 @@ def _train(self, dataset): total_loss += loss.item() - self.logger.log_loss(loss=(total_loss / B), epoch=epoch) + logger.log_loss(loss=(total_loss / B), epoch=epoch) return (net, optimizer) diff --git a/rafiki/model/__init__.py b/rafiki/model/__init__.py index 79a2df30..fe124769 100644 --- a/rafiki/model/__init__.py +++ b/rafiki/model/__init__.py @@ -1,6 +1,6 @@ from .model import BaseModel, test_model_class, load_model_class, \ - parse_model_install_command, InvalidModelClassException, InvalidModelParamsException, \ - ModelUtils -from .log import LogType, ModelLogger + parse_model_install_command, InvalidModelClassException, InvalidModelParamsException from .knob import BaseKnob, CategoricalKnob, IntegerKnob, FloatKnob, \ - serialize_knob_config, deserialize_knob_config \ No newline at end of file + serialize_knob_config, deserialize_knob_config +from .dataset import dataset_utils, ModelDatasetUtils, CorpusDataset, ImageFilesDataset +from .log import logger, ModelLogger \ No newline at end of file diff --git a/rafiki/model/dataset.py b/rafiki/model/dataset.py index 6ddce71a..957a29cc 100644 --- a/rafiki/model/dataset.py +++ b/rafiki/model/dataset.py @@ -254,3 +254,4 @@ def download_dataset_from_uri(self, dataset_uri): return dataset_path +dataset_utils = ModelDatasetUtils() \ No newline at end of file diff --git a/rafiki/model/log.py b/rafiki/model/log.py index 6adcd66d..4575c8fa 100644 --- a/rafiki/model/log.py +++ b/rafiki/model/log.py @@ -159,3 +159,5 @@ def emit(self, record): def _print(self, message): print('[{}]'.format(__name__), message) + +logger = ModelLogger() \ No newline at end of file diff --git a/rafiki/model/model.py b/rafiki/model/model.py index f2be1aec..13c2fc81 100644 --- a/rafiki/model/model.py +++ b/rafiki/model/model.py @@ -12,16 +12,11 @@ from rafiki.constants import TaskType, ModelDependency from .dataset import ModelDatasetUtils -from .log import ModelLogger from .knob import BaseKnob, serialize_knob_config, deserialize_knob_config class InvalidModelClassException(Exception): pass class InvalidModelParamsException(Exception): pass -class ModelUtils(ModelDatasetUtils): - def __init__(self): - ModelDatasetUtils.__init__(self) - class BaseModel(abc.ABC): ''' Rafiki's base model class that Rafiki 
models should extend. @@ -48,8 +43,7 @@ def __init__(self, **knobs): :type knobs: dict[str, any] ''' def __init__(self, **knobs): - self.logger = ModelLogger() - self.utils = ModelUtils() + pass @staticmethod def get_knob_config(): @@ -291,9 +285,6 @@ def _check_model_class(py_model_class): if not issubclass(py_model_class, BaseModel): raise Exception('Model should extend `rafiki.model.BaseModel`') - if inspect.isfunction(getattr(py_model_class, 'get_predict_label_mapping', None)): - _warn('`get_predict_label_mapping` has been deprecated') - if inspect.isfunction(getattr(py_model_class, 'init', None)): _warn('`init` has been deprecated - use `__init__` for your model\'s initialization logic instead') @@ -302,21 +293,25 @@ def _check_model_class(py_model_class): _warn('`get_knob_config` has been changed to a `@staticmethod`') def _check_model_inst(model_inst): - if getattr(model_inst, 'utils', None) is None: - raise Exception('`super().__init__(**knobs)` should be called as the first line of the model\'s `__init__` method.') - # Throw error when deprecated methods are called def deprecated_func(desc): def throw_error(*args, **kwargs): raise AttributeError(desc) return throw_error - - model_inst.utils.log = deprecated_func('`self.utils.log(...)` has been changed to `self.logger.log(...)`') - model_inst.utils.log_metrics = deprecated_func('`self.utils.log_metrics(...)` has been changed to `self.logger.log(...)`') - model_inst.utils.define_plot = deprecated_func('`self.utils.define_plot(...)` has been renamed to `self.logger.define_plot(...)`') - model_inst.utils.define_loss_plot = deprecated_func('`self.utils.define_loss_plot(...)` has been renamed to `self.logger.define_loss_plot(...)`') - model_inst.utils.log_loss_metric = deprecated_func('`self.utils.log_loss_metric(...)` has been renamed to `self.logger.log_loss(...)`') + + class DeprecatedModelUtils(): + log = deprecated_func('`self.utils.log(...)` has been moved to `logger.log(...)`') + log_metrics = deprecated_func('`self.utils.log_metrics(...)` has been moved to `logger.log(...)`') + define_plot = deprecated_func('`self.utils.define_plot(...)` has been moved to `logger.define_plot(...)`') + define_loss_plot = deprecated_func('`self.utils.define_loss_plot(...)` has been moved to `logger.define_loss_plot(...)`') + log_loss_metric = deprecated_func('`self.utils.log_loss_metric(...)` has been moved to `logger.log_loss(...)`') + load_dataset_of_image_files = deprecated_func('`self.utils.load_dataset_of_image_files(...)` has been moved to `dataset_utils.load_dataset_of_image_files(...)`') + load_dataset_of_corpus = deprecated_func('`self.utils.load_dataset_of_corpus(...)` has been moved to `dataset_utils.load_dataset_of_corpus(...)`') + resize_as_images = deprecated_func('`self.utils.resize_as_images(...)` has been moved to `dataset_utils.resize_as_images(...)`') + download_dataset_from_uri = deprecated_func('`self.utils.download_dataset_from_uri(...)` has been moved to `dataset_utils.download_dataset_from_uri(...)`') + + model_inst.utils = DeprecatedModelUtils() def _check_knob_config(knob_config): if not isinstance(knob_config, dict) or \ diff --git a/rafiki/worker/train.py b/rafiki/worker/train.py index 3894e90c..18ac4459 100644 --- a/rafiki/worker/train.py +++ b/rafiki/worker/train.py @@ -7,7 +7,7 @@ from rafiki.config import SUPERADMIN_EMAIL, SUPERADMIN_PASSWORD from rafiki.constants import TrainJobStatus, TrialStatus, BudgetType -from rafiki.model import load_model_class, serialize_knob_config, LogType +from rafiki.model import 
load_model_class, serialize_knob_config, logger as model_logger from rafiki.db import Database from rafiki.client import Client @@ -145,11 +145,11 @@ def _train_and_evaluate_model(self, clazz, knobs, train_dataset_uri, \ log_handler = ModelLoggerHandler(handle_log) root_logger = logging.getLogger() root_logger.addHandler(log_handler) - logger = logging.getLogger('{}.trial'.format(__name__)) - logger.setLevel(logging.INFO) - logger.propagate = False # Avoid duplicate logs in root logger - logger.addHandler(log_handler) - model_inst.logger.set_logger(logger) + py_model_logger = logging.getLogger('{}.trial'.format(__name__)) + py_model_logger.setLevel(logging.INFO) + py_model_logger.propagate = False # Avoid duplicate logs in root logger + py_model_logger.addHandler(log_handler) + model_logger.set_logger(py_model_logger) # Train model model_inst.train(train_dataset_uri) From 743928b83ba0ea956cc8cafc7a169d2279e070ad Mon Sep 17 00:00:00 2001 From: Ngin Yun Chuan Date: Fri, 7 Dec 2018 23:39:46 +0800 Subject: [PATCH 13/22] Update docs on changes to logging & dataset loading --- docs/src/python/rafiki.model.rst | 6 +- docs/src/user/creating-models.rst | 15 ++- rafiki/model/dataset.py | 181 ++++++++++++++++-------------- rafiki/model/log.py | 19 ++++ 4 files changed, 128 insertions(+), 93 deletions(-) diff --git a/docs/src/python/rafiki.model.rst b/docs/src/python/rafiki.model.rst index 87f3d85f..0c8eaed4 100644 --- a/docs/src/python/rafiki.model.rst +++ b/docs/src/python/rafiki.model.rst @@ -36,9 +36,9 @@ Utility Classes & Methods .. autoclass:: rafiki.model.ModelLogger :members: -.. autoclass:: rafiki.model.dataset.ModelDatasetUtils +.. autoclass:: rafiki.model.ModelDatasetUtils :members: -.. autoclass:: rafiki.model.dataset.ImageFilesDataset +.. autoclass:: rafiki.model.ImageFilesDataset -.. autoclass:: rafiki.model.dataset.CorpusDataset \ No newline at end of file +.. autoclass:: rafiki.model.CorpusDataset \ No newline at end of file diff --git a/docs/src/user/creating-models.rst b/docs/src/user/creating-models.rst index 0e2bcd7b..68684e55 100644 --- a/docs/src/user/creating-models.rst +++ b/docs/src/user/creating-models.rst @@ -14,7 +14,8 @@ To submit the model to Rafiki, use the :meth:`rafiki.client.Client.create_model` Implementing Models -------------------------------------------------------------------- -Details on how to implement a model are located in the documentation of :class:`rafiki.model.BaseModel`. +Full details on how to implement a model are located in the documentation of :class:`rafiki.model.BaseModel`, +and sample model implementations are located in `./examples/models/ `_. In defining the hyperparameters (knobs) of a model, refer to the documentation at :ref:`knob-types` for the full list of knob types. @@ -25,19 +26,21 @@ it is likely to work on Rafiki. Logging in Models -------------------------------------------------------------------- -:class:`rafiki.model.BaseModel` has an attribute ``logger`` that is of the class :class:`rafiki.model.ModelLogger`. -It allows you to log messages and metrics while your model is being trained, and you can +By importing the global ``logger`` instance in the ``rafiki.model`` module, +you can log messages and metrics while your model is being trained, and you can define plots to visualize your model's training on Rafiki's Admin Web interface. +Refer to :class:`rafiki.model.ModelLogger` for full usage instructions. + .. 
seealso:: :ref:`using-admin-web` Dataset Loading in Models -------------------------------------------------------------------- -:class:`rafiki.model.BaseModel` has an attribute ``utils`` that subclasses the model utility class -:class:`rafiki.model.dataset.ModelDatasetUtils`. It helps with dataset loading. +The global ``dataset_utils`` instance in the ``rafiki.model`` module provides +a set of built-in dataset loading methods for common dataset types on Rafiki. -Refer to the sample usage in the implementation of `./examples/models/image_classification/TfSingleHiddenLayer.py `_. +Refer to :class:`rafiki.model.ModelDatasetUtils` for full usage instructions. Model Environment -------------------------------------------------------------------- diff --git a/rafiki/model/dataset.py b/rafiki/model/dataset.py index 957a29cc..2a78f34c 100644 --- a/rafiki/model/dataset.py +++ b/rafiki/model/dataset.py @@ -22,6 +22,103 @@ class InvalidDatasetProtocolException(Exception): pass class InvalidDatasetTypeException(Exception): pass class InvalidDatasetFormatException(Exception): pass +class ModelDatasetUtils(): + ''' + Collection of utility methods to help with the loading of datasets. + + To use these utility methods, import the global ``dataset_utils`` instance from the module ``rafiki.model``. + + For example: + + :: + + from rafiki.model import dataset_utils + ... + def train(self, dataset_uri): + ... + dataset_utils.load_dataset_of_image_files(dataset_uri) + ... + ''' + + def __init__(self): + # Caches downloaded datasets + self._dataset_uri_to_path = {} + + def load_dataset_of_corpus(self, dataset_uri, tags=['tag'], split_by='\\n'): + ''' + Loads dataset with type `CORPUS`. + + :param str dataset_uri: URI of the dataset file + :returns: An instance of ``CorpusDataset``. + ''' + dataset_path = self.download_dataset_from_uri(dataset_uri) + return CorpusDataset(dataset_path, tags, split_by) + + def load_dataset_of_image_files(self, dataset_uri, image_size=None): + ''' + Loads dataset with type `IMAGE_FILES`. + + :param str dataset_uri: URI of the dataset file + :param int[] image_size: dimensions to resize all images to (None for no resizing) + :returns: An instance of ``ImageFilesDataset``. + ''' + dataset_path = self.download_dataset_from_uri(dataset_uri) + return ImageFilesDataset(dataset_path, image_size) + + def resize_as_images(self, images, image_size): + ''' + Resize a list of N grayscale images to another size. + + :param int[][][] images: images to resize as N x 2D lists (grayscale) + :param int[] image_size: dimensions to resize all images to + :returns: images as N x 2D numpy arrays + ''' + images = [Image.fromarray(np.asarray(x, dtype=np.uint8)) for x in images] + images = [np.asarray(x.resize(image_size)) for x in images] + return np.asarray(images) + + def download_dataset_from_uri(self, dataset_uri): + ''' + Downloads the dataset at URI if necessary, ensuring that the dataset ends up in the local filesystem. 
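+
+        Downloads over HTTP/HTTPS are cached by URI, so repeated calls with
+        the same URI return the same local file path.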
+ + :param str dataset_uri: URI of the dataset file + :returns: file path of the dataset file in the local filesystem + ''' + if dataset_uri in self._dataset_uri_to_path: + return self._dataset_uri_to_path[dataset_uri] + + dataset_path = None + + parsed_uri = urlparse(dataset_uri) + protocol = '{uri.scheme}'.format(uri=parsed_uri).lower().strip() + + # Download dataset over HTTP/HTTPS + if protocol == 'http' or protocol == 'https': + + r = requests.get(dataset_uri, stream=True) + temp_file = tempfile.NamedTemporaryFile(delete=False) + + # Show a progress bar while downloading + total_size = int(r.headers.get('content-length', 0)); + block_size = 1024 + iters = math.ceil(total_size / block_size) + for data in tqdm(r.iter_content(block_size), total=iters, unit='KB'): + temp_file.write(data) + + temp_file.close() + + dataset_path = temp_file.name + + # Assume it is on filesystem + elif protocol == '' or protocol == 'file': + dataset_path = dataset_uri + else: + raise InvalidDatasetProtocolException() + + # Cache dataset path to possibly prevent re-downloading + self._dataset_uri_to_path[dataset_uri] = dataset_path + return dataset_path + class ModelDataset(): ''' Abstract that helps loading of dataset of a specific type @@ -170,88 +267,4 @@ def _load(self, dataset_path): return (num_samples, num_classes, image_paths, image_classes, dataset_dir) -class ModelDatasetUtils(): - ''' - Collection of utility methods to help with the loading of datasets - ''' - def __init__(self): - # Caches downloaded datasets - self._dataset_uri_to_path = {} - - def load_dataset_of_corpus(self, dataset_uri, tags=['tag'], split_by='\\n'): - ''' - Loads dataset with type `CORPUS`. - - :param str dataset_uri: URI of the dataset file - :returns: An instance of ``CorpusDataset``. - ''' - dataset_path = self.download_dataset_from_uri(dataset_uri) - return CorpusDataset(dataset_path, tags, split_by) - - def load_dataset_of_image_files(self, dataset_uri, image_size=None): - ''' - Loads dataset with type `IMAGE_FILES`. - - :param str dataset_uri: URI of the dataset file - :param str image_size: dimensions to resize all images to (None for no resizing) - :returns: An instance of ``ImageFilesDataset``. - ''' - dataset_path = self.download_dataset_from_uri(dataset_uri) - return ImageFilesDataset(dataset_path, image_size) - - def resize_as_images(self, images, image_size): - ''' - Resize a list of N grayscale images to another size. - - :param int[][][] images: images to resize as a N x 2D lists (grayscale) - :param int image_size: dimensions to resize all images to (None for no resizing) - :returns: images as N x 2D numpy arrays - ''' - images = [Image.fromarray(np.asarray(x, dtype=np.uint8)) for x in images] - images = [np.asarray(x.resize(image_size)) for x in images] - return np.asarray(images) - - def download_dataset_from_uri(self, dataset_uri): - ''' - Maybe download the dataset at URI, ensuring that the dataset ends up in the local filesystem. 
- - :param str dataset_uri: URI of the dataset file - :returns: file path of the dataset file in the local filesystem - ''' - if dataset_uri in self._dataset_uri_to_path: - return self._dataset_uri_to_path[dataset_uri] - - dataset_path = None - - parsed_uri = urlparse(dataset_uri) - protocol = '{uri.scheme}'.format(uri=parsed_uri).lower().strip() - - # Download dataset over HTTP/HTTPS - if protocol == 'http' or protocol == 'https': - - r = requests.get(dataset_uri, stream=True) - temp_file = tempfile.NamedTemporaryFile(delete=False) - - # Show a progress bar while downloading - total_size = int(r.headers.get('content-length', 0)); - block_size = 1024 - iters = math.ceil(total_size / block_size) - for data in tqdm(r.iter_content(block_size), total=iters, unit='KB'): - temp_file.write(data) - - temp_file.close() - - dataset_path = temp_file.name - - # Assume it is on filesystem - elif protocol == '' or protocol == 'file': - dataset_path = dataset_uri - else: - raise InvalidDatasetProtocolException() - - # Cache dataset path to possibly prevent re-downloading - self._dataset_uri_to_path[dataset_uri] = dataset_path - return dataset_path - - dataset_utils = ModelDatasetUtils() \ No newline at end of file diff --git a/rafiki/model/log.py b/rafiki/model/log.py index 4575c8fa..6a6afbc5 100644 --- a/rafiki/model/log.py +++ b/rafiki/model/log.py @@ -12,6 +12,25 @@ class LogType(): MESSAGE = 'MESSAGE' class ModelLogger(): + ''' + Allows models to log messages and metrics during model training, and + define plots for visualization of model training. + + To use this logger, import the global ``logger`` instance from the module ``rafiki.model``. + + For example: + + :: + + from rafiki.model import logger + ... + def train(self, dataset_uri): + ... + logger.log('Starting model training...') + ... 
+ + ''' + def __init__(self): # By default, set a logging handler to print to stdout (for debugging) logger = logging.getLogger(__name__) From 65ca9b165d88a807d4d83aeaa85284d02a3e7e06 Mon Sep 17 00:00:00 2001 From: Ngin Yun Chuan Date: Sat, 8 Dec 2018 21:29:32 +0800 Subject: [PATCH 14/22] Add `FixedKnob` --- .../image_classification/TfFeedForward.py | 4 +-- rafiki/advisor/btb_gp_advisor.py | 14 ++++++-- rafiki/model/__init__.py | 2 +- rafiki/model/knob.py | 36 ++++++++++++++++++- 4 files changed, 50 insertions(+), 6 deletions(-) diff --git a/examples/models/image_classification/TfFeedForward.py b/examples/models/image_classification/TfFeedForward.py index fea27cb3..1ce4cb4f 100644 --- a/examples/models/image_classification/TfFeedForward.py +++ b/examples/models/image_classification/TfFeedForward.py @@ -9,7 +9,7 @@ from rafiki.config import APP_MODE from rafiki.model import BaseModel, InvalidModelParamsException, test_model_class, \ - IntegerKnob, CategoricalKnob, FloatKnob, dataset_utils, logger + IntegerKnob, CategoricalKnob, FloatKnob, FixedKnob, dataset_utils, logger from rafiki.constants import TaskType, ModelDependency class TfFeedForward(BaseModel): @@ -25,7 +25,7 @@ def get_knob_config(): 'hidden_layer_units': IntegerKnob(2, 128), 'learning_rate': FloatKnob(1e-5, 1e-1, is_exp=True), 'batch_size': CategoricalKnob([16, 32, 64, 128]), - 'image_size': CategoricalKnob([8, 16, 32]), + 'image_size': FixedKnob(32) } def __init__(self, **knobs): diff --git a/rafiki/advisor/btb_gp_advisor.py b/rafiki/advisor/btb_gp_advisor.py index ee02305a..3af2e013 100644 --- a/rafiki/advisor/btb_gp_advisor.py +++ b/rafiki/advisor/btb_gp_advisor.py @@ -1,7 +1,7 @@ from btb.tuning import GP from btb import HyperParameter, ParamTypes -from rafiki.model import BaseKnob, FloatKnob, IntegerKnob, CategoricalKnob +from rafiki.model import BaseKnob, FloatKnob, IntegerKnob, CategoricalKnob, FixedKnob from .advisor import BaseAdvisor class BtbGpAdvisor(BaseAdvisor): @@ -40,6 +40,15 @@ def _knob_to_tunable(knob): return HyperParameter(ParamTypes.STRING, knob.values) elif knob.value_type is bool: return HyperParameter(ParamTypes.BOOL, knob.values) + elif isinstance(knob, FixedKnob): + if knob.value_type is int: + return HyperParameter(ParamTypes.INT_CAT, [knob.value]) + elif knob.value_type is float: + return HyperParameter(ParamTypes.FLOAT_CAT, [knob.value]) + elif knob.value_type is str: + return HyperParameter(ParamTypes.STRING, [knob.value]) + elif knob.value_type is bool: + return HyperParameter(ParamTypes.BOOL, [knob.value]) elif isinstance(knob, IntegerKnob): if knob.is_exp: return HyperParameter(ParamTypes.INT_EXP, [knob.value_min, knob.value_max]) @@ -49,4 +58,5 @@ def _knob_to_tunable(knob): if knob.is_exp: return HyperParameter(ParamTypes.FLOAT_EXP, [knob.value_min, knob.value_max]) else: - return HyperParameter(ParamTypes.FLOAT, [knob.value_min, knob.value_max]) \ No newline at end of file + return HyperParameter(ParamTypes.FLOAT, [knob.value_min, knob.value_max]) + \ No newline at end of file diff --git a/rafiki/model/__init__.py b/rafiki/model/__init__.py index fe124769..fd8cebb5 100644 --- a/rafiki/model/__init__.py +++ b/rafiki/model/__init__.py @@ -1,6 +1,6 @@ from .model import BaseModel, test_model_class, load_model_class, \ parse_model_install_command, InvalidModelClassException, InvalidModelParamsException -from .knob import BaseKnob, CategoricalKnob, IntegerKnob, FloatKnob, \ +from .knob import BaseKnob, CategoricalKnob, IntegerKnob, FloatKnob, FixedKnob, \ serialize_knob_config, 
deserialize_knob_config
 from .dataset import dataset_utils, ModelDatasetUtils, CorpusDataset, ImageFilesDataset
 from .log import logger, ModelLogger
\ No newline at end of file
diff --git a/rafiki/model/knob.py b/rafiki/model/knob.py
index 7ab10d07..ee8b2173 100644
--- a/rafiki/model/knob.py
+++ b/rafiki/model/knob.py
@@ -26,7 +26,7 @@ def from_json(cls, json_str):
         knob_type = json_dict['type']
         knob_args = json_dict['args']
 
-        knob_classes = [CategoricalKnob, IntegerKnob, FloatKnob]
+        knob_classes = [CategoricalKnob, IntegerKnob, FloatKnob, FixedKnob]
 
         for clazz in knob_classes:
             if clazz.__name__ == knob_type:
                 return clazz(**knob_args)
@@ -73,6 +73,40 @@ def _validate_values(values):
 
         return (value_type)
 
+class FixedKnob(BaseKnob):
+    '''
+    Knob type representing a single fixed value of type ``int``, ``float``, ``bool`` or ``str``.
+    Essentially, this represents a knob that does not require tuning.
+    '''
+    def __init__(self, value):
+        knob_args = { 'value': value }
+        super().__init__(knob_args)
+        self._value = value
+        self._value_type = self._validate_value(value)
+
+    @property
+    def value_type(self):
+        return self._value_type
+
+    @property
+    def value(self):
+        return self._value
+
+    @staticmethod
+    def _validate_value(value):
+        if isinstance(value, bool):
+            value_type = bool
+        elif isinstance(value, int):
+            value_type = int
+        elif isinstance(value, float):
+            value_type = float
+        elif isinstance(value, str):
+            value_type = str
+        else:
+            raise TypeError('Only the following types for `value` are supported: `int`, `float`, `bool`, `str`')
+
+        return value_type
+
 class IntegerKnob(BaseKnob):
     '''
     Knob type representing `any` ``int`` value within a specific interval [``value_min``, ``value_max``].

From 973a84a1ba05d82e5c45b794dc045219261544bf Mon Sep 17 00:00:00 2001
From: Ngin Yun Chuan
Date: Sat, 8 Dec 2018 21:29:51 +0800
Subject: [PATCH 15/22] Gitignore IPython notebook files

---
 .gitignore | 6 +++++-
 1 file changed, 5 insertions(+), 1 deletion(-)

diff --git a/.gitignore b/.gitignore
index 733de8c6..71a1103f 100644
--- a/.gitignore
+++ b/.gitignore
@@ -25,4 +25,8 @@ data/*
 # Logs
 *.log
 logs/*
-!logs/.gitkeep
\ No newline at end of file
+!logs/.gitkeep
+
+# IPython notebooks
+.ipynb_checkpoints/*
+*.ipynb
\ No newline at end of file

From 5a7f261cc567d3c4e20d0389f4f43fa0a227d2f8 Mon Sep 17 00:00:00 2001
From: Ngin Yun Chuan
Date: Wed, 12 Dec 2018 20:12:40 +0800
Subject: [PATCH 16/22] Add GPU-aware train worker placement & node configuration

---
 rafiki/admin/services_manager.py      | 20 +++++++++++-----
 rafiki/container/__init__.py          |  2 +-
 rafiki/container/container_manager.py | 14 +++++++++---
 rafiki/container/docker_swarm.py      | 29 +++++++++++++++++++----
 rafiki/db/database.py                 |  5 ++--
 rafiki/db/schema.py                   |  3 ++-
 scripts/setup_node.sh                 | 33 +++++++++++++++++++++++++++
 scripts/start.sh                      |  5 ++++
 8 files changed, 94 insertions(+), 17 deletions(-)
 create mode 100644 scripts/setup_node.sh

diff --git a/rafiki/admin/services_manager.py b/rafiki/admin/services_manager.py
index b71058a8..333790fe 100644
--- a/rafiki/admin/services_manager.py
+++ b/rafiki/admin/services_manager.py
@@ -8,7 +8,7 @@
 from rafiki.config import MIN_SERVICE_PORT, MAX_SERVICE_PORT, \
     TRAIN_WORKER_REPLICAS_PER_MODEL, INFERENCE_WORKER_REPLICAS_PER_TRIAL, \
     INFERENCE_MAX_BEST_TRIALS, SERVICE_STATUS_WAIT
-from rafiki.container import DockerSwarmContainerManager
+from rafiki.container import DockerSwarmContainerManager, ServiceRequirement, InvalidServiceRequest
 from rafiki.model import parse_model_install_command
 
 logger = 
logging.getLogger(__name__) @@ -185,11 +185,16 @@ def _create_train_job_worker(self, train_job, model, replicas): **({'CUDA_VISIBLE_DEVICES': -1} if not enable_gpu else {}) # Hide GPU if not enabled } + requirements = [] + if enable_gpu: + requirements.append(ServiceRequirement.GPU) + service = self._create_service( service_type=service_type, docker_image=model.docker_image, replicas=replicas, - environment_vars=environment_vars + environment_vars=environment_vars, + requirements=requirements ) self._db.create_train_job_worker( @@ -241,14 +246,15 @@ def _wait_until_services_running(self, services): def _create_service(self, service_type, docker_image, replicas, environment_vars={}, args=[], - container_port=None): + container_port=None, requirements=[]): # Create service in DB container_manager_type = type(self._container_manager).__name__ service = self._db.create_service( container_manager_type=container_manager_type, service_type=service_type, - docker_image=docker_image + docker_image=docker_image, + requirements=requirements ) self._db.commit() @@ -284,7 +290,8 @@ def _create_service(self, service_type, docker_image, args=args, environment_vars=environment_vars, mounts=mounts, - publish_port=publish_port + publish_port=publish_port, + requirements=requirements ) container_service_id = container_service['id'] @@ -303,11 +310,12 @@ def _create_service(self, service_type, docker_image, ) self._db.commit() - except Exception: + except Exception as e: logger.error('Error while creating service with ID {}'.format(service.id)) logger.error(traceback.format_exc()) self._db.mark_service_as_errored(service) self._db.commit() + raise e return service diff --git a/rafiki/container/__init__.py b/rafiki/container/__init__.py index f76a4d2b..93904798 100644 --- a/rafiki/container/__init__.py +++ b/rafiki/container/__init__.py @@ -1,2 +1,2 @@ from .container_manager import ContainerManager -from .docker_swarm import DockerSwarmContainerManager \ No newline at end of file +from .docker_swarm import DockerSwarmContainerManager, ServiceRequirement, InvalidServiceRequest \ No newline at end of file diff --git a/rafiki/container/container_manager.py b/rafiki/container/container_manager.py index 4b7a939e..22152041 100644 --- a/rafiki/container/container_manager.py +++ b/rafiki/container/container_manager.py @@ -1,13 +1,20 @@ import abc import os +class InvalidServiceRequest(Exception): + pass + +class ServiceRequirement(): + GPU = 'gpu' + class ContainerManager(abc.ABC): def __init__(self, **kwargs): raise NotImplementedError() @abc.abstractmethod - def create_service(self, service_name, docker_image, - replicas, args, environment_vars, mounts={}, publish_port=None): + def create_service(self, service_name, docker_image, replicas, + args, environment_vars, mounts={}, publish_port=None, + requirements=[]): ''' Creates a service with a set number of replicas. 
@@ -23,7 +30,8 @@ def create_service(self, service_name, docker_image,
             mounts: {String: String} - Dict of host directory to container directory for mounting of volumes onto container
             publish_port: (<published_port>, <container_port>) - host port (port to be published) to container port
                 The service should then be reachable at the host port on the host
-            
+            requirements: [ServiceRequirement] - List of requirements for the service
+
         Returns {String: String} where
             id: String - ID for the service created
             hostname: String - Hostname for the service created (in the internal network)

diff --git a/rafiki/container/docker_swarm.py b/rafiki/container/docker_swarm.py
index 2d21f0f5..50d41f41 100644
--- a/rafiki/container/docker_swarm.py
+++ b/rafiki/container/docker_swarm.py
@@ -4,7 +4,7 @@
 import docker
 import logging
 
-from .container_manager import ContainerManager
+from .container_manager import ContainerManager, ServiceRequirement, InvalidServiceRequest
 
 logger = logging.getLogger(__name__)
 
@@ -15,7 +15,9 @@ def __init__(self,
         self._client = docker.from_env()
 
     def create_service(self, service_name, docker_image, replicas,
-                        args, environment_vars, mounts={}, publish_port=None):
+                        args, environment_vars, mounts={}, publish_port=None,
+                        requirements=[]):
+
         env = [
             '{}={}'.format(k, v)
             for (k, v) in environment_vars.items()
@@ -38,6 +40,16 @@ def create_service(self, service_name, docker_image, replicas,
                 'PublishedPort': published_port,
                 'TargetPort': container_port
             }]
+
+        # Gather list of constraints
+        constraints = []
+        if ServiceRequirement.GPU in requirements:
+            # Check if there are nodes with GPU, raise error otherwise
+            has_gpu = self._if_any_node_has_gpu()
+            if not has_gpu:
+                raise InvalidServiceRequest('There are no nodes with GPU to deploy the service on')
+
+            constraints.append('node.labels.gpu!=0')
 
         service = self._client.services.create(
             image=docker_image,
@@ -50,6 +62,7 @@ def create_service(self, service_name, docker_image, replicas,
             restart_policy={
                 'Condition': 'on-failure'
             },
+            constraints=constraints,
             endpoint_spec={
                 'Ports': ports_list
             },
@@ -84,5 +97,13 @@ def destroy_service(self, service_id):
         service.remove()
 
         logger.info('Deleted service of ID {}'.format(service_id))
-            
-        
\ No newline at end of file
+
+    def _if_any_node_has_gpu(self):
+        nodes = self._client.nodes.list()
+        has_gpu = False
+        for node in nodes:
+            gpu = int(node.attrs.get('Spec', {}).get('Labels', {}).get('gpu', 0))
+            if gpu > 0:
+                has_gpu = True
+
+        return has_gpu
\ No newline at end of file

diff --git a/rafiki/db/database.py b/rafiki/db/database.py
index 5ec9c58a..8bef9e8e 100644
--- a/rafiki/db/database.py
+++ b/rafiki/db/database.py
@@ -227,11 +227,12 @@ def get_workers_of_inference_job(self, inference_job_id):
     ####################################
 
     def create_service(self, service_type, container_manager_type,
-                    docker_image):
+                    docker_image, requirements):
         service = Service(
             service_type=service_type,
             docker_image=docker_image,
-            container_manager_type=container_manager_type
+            container_manager_type=container_manager_type,
+            requirements=requirements
         )
         self._session.add(service)
         return service

diff --git a/rafiki/db/schema.py b/rafiki/db/schema.py
index a8150222..a53e3d66 100644
--- a/rafiki/db/schema.py
+++ b/rafiki/db/schema.py
@@ -1,6 +1,6 @@
 from sqlalchemy.ext.declarative import declarative_base
 from sqlalchemy import Column, String, Float, ForeignKey, Integer, Binary, DateTime
-from sqlalchemy.dialects.postgresql import JSON
+from sqlalchemy.dialects.postgresql import JSON, ARRAY
 
 import uuid
 import datetime
 
@@ -63,6 +63,7 @@ class 
Service(Base):
     port = Column(Integer)
     container_service_name = Column(String)
     container_service_id = Column(String)
+    requirements = Column(ARRAY(String))
 
 class TrainJob(Base):
     __tablename__ = 'train_job'

diff --git a/scripts/setup_node.sh b/scripts/setup_node.sh
new file mode 100644
index 00000000..a0044fa7
--- /dev/null
+++ b/scripts/setup_node.sh
@@ -0,0 +1,33 @@
+# Determine whether node is Docker Swarm manager or worker
+swarm_role=$1
+if [ -z "$swarm_role" ]
+then
+    while true; do
+        read -p "Is this node a Docker Swarm manager running Rafiki? (y/n) " yn
+        case $yn in
+            [Yy]* ) swarm_role="manager" ; break;;
+            [Nn]* ) swarm_role="worker" ; break;;
+            * ) echo "Please answer yes or no.";;
+        esac
+    done
+fi
+
+# For workers, join Docker Swarm
+if [ "$swarm_role" = "worker" ]
+then
+    read -p "IP address of Docker Swarm manager? " ip_addr
+    read -p "Docker Swarm join token? " join_token
+    docker swarm leave
+    docker swarm join --token $join_token $ip_addr
+fi
+
+# Add node label that specifies no. of GPUs
+hostname=$(docker node inspect self | sed -n 's/"Hostname".*"\(.*\)".*/\1/p' | xargs)
+while true; do
+    read -p "No. of GPUs (0-9)? " gpus
+    case $gpus in
+        [0-9] ) break;;
+        * ) echo "Please answer an integer from 0-9.";;
+    esac
+done
+docker node update --label-add gpu=$gpus $hostname
\ No newline at end of file

diff --git a/scripts/start.sh b/scripts/start.sh
index 49af49e1..d739a8fb 100644
--- a/scripts/start.sh
+++ b/scripts/start.sh
@@ -33,6 +33,11 @@ ensure_stable()
 title "Creating Docker swarm for Rafiki..."
 bash $FILE_DIR/create_docker_swarm.sh
 
+# Setup node for Rafiki
+
+title "Setting up node for Rafiki..."
+bash $FILE_DIR/setup_node.sh manager
+
 # Pull images from Docker Hub
 
 title "Pulling images for Rafiki from Docker Hub..."

From 3cd65a7582cfcc13954c76aa26d83cec6f635ac0 Mon Sep 17 00:00:00 2001
From: Ngin Yun Chuan
Date: Wed, 12 Dec 2018 20:18:58 +0800
Subject: [PATCH 17/22] Update docs for new node configuration step

---
 docs/src/dev/setup.rst | 14 +++++++++++++-
 scripts/setup_node.sh  |  2 +-
 2 files changed, 14 insertions(+), 2 deletions(-)

diff --git a/docs/src/dev/setup.rst b/docs/src/dev/setup.rst
index 0abcbc65..c1e6b988 100644
--- a/docs/src/dev/setup.rst
+++ b/docs/src/dev/setup.rst
@@ -36,7 +36,19 @@ Adding Nodes to Rafiki
 
 Rafiki has its dynamic stack (e.g. train workers, inference workers, predictors) running as `Docker Swarm Services `_.
-Horizontal scaling can be done by `adding more nodes to the swarm `_.
+
+Horizontal scaling can be done by adding more nodes to the swarm.
+
+Perform the following for *each* worker node to be added:
+
+1. Connect the node to the same network as the master, so that the node can `join the master's Docker Swarm `_.
+
+2. Configure the node with the script:
+
+    .. code-block:: shell
+
+        bash scripts/setup_node.sh
+
 
 Exposing Rafiki Publicly
 --------------------------------------------------------------------

diff --git a/scripts/setup_node.sh b/scripts/setup_node.sh
index a0044fa7..21ae8c1b 100644
--- a/scripts/setup_node.sh
+++ b/scripts/setup_node.sh
@@ -24,7 +24,7 @@ fi
 # Add node label that specifies no. of GPUs
 hostname=$(docker node inspect self | sed -n 's/"Hostname".*"\(.*\)".*/\1/p' | xargs)
 while true; do
-    read -p "No. of GPUs (0-9)? " gpus
+    read -p "No. of GPUs? 
(0-9) " gpus
     case $gpus in
         [0-9] ) break;;
         * ) echo "Please answer an integer from 0-9.";;
     esac
 done

From ff985db760b7eebd971dc2695d8fcd467b26c836 Mon Sep 17 00:00:00 2001
From: Ngin Yun Chuan
Date: Wed, 12 Dec 2018 20:49:02 +0800
Subject: [PATCH 18/22] Clean and correct docs

---
 docs/src/python/rafiki.model.rst  |  3 +++
 docs/src/user/creating-models.rst |  2 +-
 docs/src/user/quickstart.rst      | 20 ++++++++------------
 rafiki/client/client.py           |  3 +--
 rafiki/model/log.py               | 22 +++++++++++++++-------
 5 files changed, 28 insertions(+), 22 deletions(-)

diff --git a/docs/src/python/rafiki.model.rst b/docs/src/python/rafiki.model.rst
index 0c8eaed4..2de46e76 100644
--- a/docs/src/python/rafiki.model.rst
+++ b/docs/src/python/rafiki.model.rst
@@ -27,6 +27,9 @@ Knob Classes
 .. autoclass:: rafiki.model.FloatKnob
     :members:
 
+.. autoclass:: rafiki.model.FixedKnob
+    :members:
+
 Utility Classes & Methods
 --------------------------------------------------------------------
 
diff --git a/docs/src/user/creating-models.rst b/docs/src/user/creating-models.rst
index 68684e55..782a1e8f 100644
--- a/docs/src/user/creating-models.rst
+++ b/docs/src/user/creating-models.rst
@@ -57,7 +57,7 @@ prior to model training and inference. This is configurable with the ``dependenc
 during model creation.
 
 Alternatively, you can build a custom Docker image that extends ``rafikiai/rafiki_worker``,
-installing the required dependencies for your model. This is configurable with ``docker_image``) option
+installing the required dependencies for your model. This is configurable with the ``docker_image`` option
 during model creation.
 
 Models should at least run on CPU-only machines and optionally leverage a shared GPU, if it is available.

diff --git a/docs/src/user/quickstart.rst b/docs/src/user/quickstart.rst
index 866f0259..cc1593dc 100644
--- a/docs/src/user/quickstart.rst
+++ b/docs/src/user/quickstart.rst
@@ -7,21 +7,17 @@ Quick Start
 
 .. note::
 
-    If you're a *Model Developer* just looking to contribute models to a running instance of Rafiki, refer to :ref:`quickstart-model-developers`.
+    - If you're a *Model Developer* just looking to contribute models to a running instance of Rafiki, refer to :ref:`quickstart-model-developers`.
+    - If you're an *Application Developer* just looking to train and deploy models on a running instance of Rafiki, refer to :ref:`quickstart-app-developers`.
+    - If you're an *Application User* just looking to make predictions to deployed models on a running instance of Rafiki, refer to :ref:`quickstart-app-users`.
 
-.. note::
-
-    If you're an *Application Developer* just looking to train and deploy models on a running instance of Rafiki, refer to :ref:`quickstart-app-developers`.
-
-.. note::
-
-    If you're an *Application User* just looking to make predictions to deployed models on a running instance of Rafiki, refer to :ref:`quickstart-app-users`.
+This guide assumes you have deployed an empty instance of Rafiki and you want to try a *full* train-inference flow,
+including adding of models, submitting a train job and submitting an inference job to Rafiki.
 
-This guide assumes you have deployed your an empty instance of Rafiki and you want to do a *full* train-inference flow,
-including preparation of dataset and adding of models to Rafiki. Below, the sequence of examples submit the
-`Fashion MNIST dataset `_ for training and inference.
-Alternatively, after installing Rafiki Client's dependencies, you can run `./examples/scripts/client_quickstart.py `_. 
+The sequence of examples below submits the `Fashion MNIST dataset `_ for training and inference.
+Alternatively, after installing the Rafiki Client's dependencies, you can refer to and run the scripted version of this quickstart
+`./examples/scripts/client_quickstart.py `_.
 
 .. note::
 
diff --git a/rafiki/client/client.py b/rafiki/client/client.py
index 0d210f5f..6fbf12c8 100644
--- a/rafiki/client/client.py
+++ b/rafiki/client/client.py
@@ -370,8 +370,7 @@ def create_advisor(self, knob_config_str, advisor_id=None):
         Creates a Rafiki advisor. If `advisor_id` is passed, it will create an advisor
         of that ID, or do nothing if an advisor of that ID has already been created.
 
-        :param knob_config: Knob configuration for advisor session
-        :type knob_config_str: Knob config, serialized
+        :param str knob_config_str: Serialized knob configuration for advisor session
         :param str advisor_id: ID of advisor to create
         '''
         data = self._post('/advisors', target='advisor',
diff --git a/rafiki/model/log.py b/rafiki/model/log.py
index 6a6afbc5..1e8661b2 100644
--- a/rafiki/model/log.py
+++ b/rafiki/model/log.py
@@ -22,12 +22,19 @@ class ModelLogger():
 
         ::
 
-            from rafiki.model import logger
+            from rafiki.model import logger, BaseModel
             ...
-            def train(self, dataset_uri):
-                ...
-                logger.log('Starting model training...')
+            class MyModel(BaseModel):
                 ...
+                def train(self, dataset_uri):
+                    ...
+                    logger.log('Starting model training...')
+                    logger.define_plot('Precision & Recall', y_axis=['precision', 'recall'])
+                    ...
+                    logger.log(precision=0.1, recall=0.6, epoch=1)
+                    ...
+                    logger.log('Ending model training...')
+                    ...
 
     '''
 
@@ -41,14 +48,14 @@ def __init__(self):
     def define_loss_plot(self):
         '''
         Convenience method of defining a plot of ``loss`` against ``epoch``.
-        To be used with :meth:`rafiki.model.ModeLogger.log_loss`.
+        To be used with :meth:`rafiki.model.ModelLogger.log_loss`.
         '''
         self.define_plot('Loss Over Epochs', ['loss'], x_axis='epoch')
 
     def log_loss(self, loss, epoch):
         '''
         Convenience method for logging `loss` against `epoch`.
-        To be used with :meth:`rafiki.model.ModeLogger.define_loss_plot`..
+        To be used with :meth:`rafiki.model.ModelLogger.define_loss_plot`.
         '''
         self.log(loss=loss, epoch=epoch)
 
@@ -76,7 +83,8 @@ def log(self, msg='', **metrics):
         Logs a message and/or a set of metrics at a single point in time.
 
         Logged messages will be viewable on Rafiki's administrative UI.
-        To visualize logged metrics on plots, a plot must be defined via :meth:`rafiki.model.ModeLogger.define_plot`.
+
+        To visualize logged metrics on plots, a plot must be defined via :meth:`rafiki.model.ModelLogger.define_plot`.
 
         Only call this method in :meth:`rafiki.model.BaseModel.train` and :meth:`rafiki.model.BaseModel.evaluate`.
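A note on usage: with the logging API documented in the patch above, a model logs messages, defines plots, and logs the plotted metrics from within its ``train()`` method. The sketch below shows how these pieces fit together — ``MyModel`` is a hypothetical model (not one of the bundled examples), and the epoch count and metric values are illustrative only:

.. code-block:: python

    from rafiki.model import BaseModel, logger

    class MyModel(BaseModel):
        ...
        def train(self, dataset_uri):
            logger.log('Starting model training...')
            # Declare a plot of `precision` & `recall` against `epoch` upfront...
            logger.define_plot('Precision & Recall', ['precision', 'recall'], x_axis='epoch')
            for epoch in range(3):
                # (the model would be trained for one epoch here)
                # ...then log those metrics once per epoch
                logger.log(precision=0.1 * (epoch + 1), recall=0.6, epoch=epoch)
            logger.log('Ending model training...')

On Rafiki's administrative UI, the logged messages appear as-is, while the metrics logged above populate the single plot declared by ``define_plot``.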
From 462aa7e105fbc92749f4f3e9ea33acd5ffd8d04f Mon Sep 17 00:00:00 2001 From: Ngin Yun Chuan Date: Wed, 12 Dec 2018 22:36:32 +0800 Subject: [PATCH 19/22] Mark workers as `RUNNING` only after installation of dependencies complete; fix bug of null predictions in client quickstart --- examples/scripts/client_quickstart.py | 11 ++++------- examples/scripts/tasks/test_pos_tagging.py | 6 ++---- rafiki/admin/services_manager.py | 11 ++++++++--- rafiki/worker/inference.py | 7 ++++--- scripts/start_worker.py | 12 ++++++------ 5 files changed, 24 insertions(+), 23 deletions(-) diff --git a/examples/scripts/client_quickstart.py b/examples/scripts/client_quickstart.py index ea275514..4d526086 100644 --- a/examples/scripts/client_quickstart.py +++ b/examples/scripts/client_quickstart.py @@ -61,20 +61,19 @@ def wait_until_train_job_has_completed(client, app): pass # Returns `predictor_host` of inference job -def wait_until_inference_job_is_running(client, app): +def get_predictor_host(client, app): while True: - time.sleep(10) try: inference_job = client.get_running_inference_job(app) status = inference_job.get('status') - if status == InferenceJobStatus.RUNNING: + if status == InferenceJobStatus.RUNNING: return inference_job.get('predictor_host') elif status in [InferenceJobStatus.ERRORED, InferenceJobStatus.STOPPED]: # Inference job has either errored or been stopped return False else: + time.sleep(10) continue - except: pass @@ -169,9 +168,7 @@ def make_predictions(client, predictor_host, queries): print('Creating inference job for app "{}" on Rafiki...'.format(app)) pprint.pprint(client.create_inference_job(app)) - - print('Waiting for inference job to be running...') - predictor_host = wait_until_inference_job_is_running(client, app) + predictor_host = get_predictor_host(client, app) if not predictor_host: raise Exception('Inference job has errored or stopped') print('Inference job is running!') diff --git a/examples/scripts/tasks/test_pos_tagging.py b/examples/scripts/tasks/test_pos_tagging.py index 9c759e4e..bc9f2a96 100644 --- a/examples/scripts/tasks/test_pos_tagging.py +++ b/examples/scripts/tasks/test_pos_tagging.py @@ -5,7 +5,7 @@ from rafiki.client import Client from rafiki.constants import TaskType, BudgetType, UserType, ModelDependency from examples.scripts.client_quickstart import create_user, create_model, \ - create_train_job, wait_until_inference_job_is_running, wait_until_train_job_has_completed, \ + create_train_job, get_predictor_host, wait_until_train_job_has_completed, \ make_predictions, RAFIKI_HOST, ADMIN_PORT, ADMIN_WEB_PORT, SUPERADMIN_EMAIL, MODEL_DEVELOPER_EMAIL, \ APP_DEVELOPER_EMAIL, USER_PASSWORD, ENABLE_GPU @@ -55,9 +55,7 @@ print('Creating inference job for app "{}" on Rafiki...'.format(app)) pprint.pprint(client.create_inference_job(app)) - - print('Waiting for inference job to be running...') - predictor_host = wait_until_inference_job_is_running(client, app) + predictor_host = get_predictor_host(client, app) if not predictor_host: raise Exception('Inference job has errored or stopped') print('Inference job is running!') diff --git a/rafiki/admin/services_manager.py b/rafiki/admin/services_manager.py index 333790fe..5e5eda11 100644 --- a/rafiki/admin/services_manager.py +++ b/rafiki/admin/services_manager.py @@ -46,8 +46,8 @@ def create_inference_services(self, inference_job_id): service = self._create_inference_job_worker(inference_job, trial, replicas) worker_services.append(service) - # Ensure that predictor service is running - 
self._wait_until_services_running([predictor_service]) + # Ensure that all services are running + self._wait_until_services_running([predictor_service, *worker_services]) # Mark inference job as running self._db.mark_inference_job_as_running(inference_job) @@ -85,8 +85,13 @@ def create_train_services(self, train_job_id): # Create a worker service for each model models = self._db.get_models_of_task(train_job.task) model_to_replicas = self._compute_train_worker_replicas_for_models(models) + worker_services = [] for (model, replicas) in model_to_replicas.items(): - self._create_train_job_worker(train_job, model, replicas) + service = self._create_train_job_worker(train_job, model, replicas) + worker_services.append(service) + + # Ensure that all services are running + self._wait_until_services_running(worker_services) # Mark train job as running self._db.mark_train_job_as_running(train_job) diff --git a/rafiki/worker/inference.py b/rafiki/worker/inference.py index 332e37ac..773449d2 100644 --- a/rafiki/worker/inference.py +++ b/rafiki/worker/inference.py @@ -34,11 +34,12 @@ def start(self): with self._db: (inference_job_id, trial_id) = self._read_worker_info() + + # Add to inference job's set of running workers + self._cache.add_worker_of_inference_job(self._service_id, inference_job_id) + self._model = self._load_model(trial_id) - # Add to inference job's set of running workers - self._cache.add_worker_of_inference_job(self._service_id, inference_job_id) - while True: (query_ids, queries) = \ self._cache.pop_queries_of_worker(self._service_id, INFERENCE_WORKER_PREDICT_BATCH_SIZE) diff --git a/scripts/start_worker.py b/scripts/start_worker.py index 2b400139..b44f9c74 100644 --- a/scripts/start_worker.py +++ b/scripts/start_worker.py @@ -3,17 +3,17 @@ from rafiki.db import Database from rafiki.constants import ServiceType +# Run install command +install_command = os.environ.get('WORKER_INSTALL_COMMAND', '') +exit_code = os.system(install_command) +if exit_code != 0: + raise Exception('Install command gave non-zero exit code: "{}"'.format(install_command)) + worker = None def start_service(service_id, service_type): global worker - # Run install command - install_command = os.environ.get('WORKER_INSTALL_COMMAND', '') - exit_code = os.system(install_command) - if exit_code != 0: - raise Exception('Install command gave non-zero exit code: "{}"'.format(install_command)) - if service_type == ServiceType.TRAIN: from rafiki.worker import TrainWorker worker = TrainWorker(service_id) From 1a52945fa13cc07e304e5518f90c7f7de68f9b56 Mon Sep 17 00:00:00 2001 From: Ngin Yun Chuan Date: Thu, 13 Dec 2018 20:00:13 +0800 Subject: [PATCH 20/22] Use `self.__dict__` way to initialize knobs as attributes in some sample models --- examples/models/image_classification/SkDt.py | 7 ++----- examples/models/image_classification/SkSvm.py | 9 ++------- 2 files changed, 4 insertions(+), 12 deletions(-) diff --git a/examples/models/image_classification/SkDt.py b/examples/models/image_classification/SkDt.py index cd1ffe4d..0b390096 100644 --- a/examples/models/image_classification/SkDt.py +++ b/examples/models/image_classification/SkDt.py @@ -23,11 +23,8 @@ def get_knob_config(): def __init__(self, **knobs): super().__init__(**knobs) - self._knobs = knobs - self._clf = self._build_classifier( - knobs.get('max_depth'), - knobs.get('criterion') - ) + self.__dict__.update(knobs) + self._clf = self._build_classifier(self.max_depth, self.criterion) def train(self, dataset_uri): dataset = 
dataset_utils.load_dataset_of_image_files(dataset_uri)

diff --git a/examples/models/image_classification/SkSvm.py b/examples/models/image_classification/SkSvm.py
index 894e2565..6c0cf6c3 100644
--- a/examples/models/image_classification/SkSvm.py
+++ b/examples/models/image_classification/SkSvm.py
@@ -25,13 +25,8 @@ def get_knob_config():
 
     def __init__(self, **knobs):
         super().__init__(**knobs)
-        self._knobs = knobs
-        self._clf = self._build_classifier(
-            knobs.get('max_iter'),
-            knobs.get('kernel'),
-            knobs.get('gamma'),
-            knobs.get('C')
-        )
+        self.__dict__.update(knobs)
+        self._clf = self._build_classifier(self.max_iter, self.kernel, self.gamma, self.C)
 
     def train(self, dataset_uri):
         dataset = dataset_utils.load_dataset_of_image_files(dataset_uri)

From 3a3fbe8fa546f3e1a6763cca83ad4c2f6d0dfd40 Mon Sep 17 00:00:00 2001
From: Ngin Yun Chuan
Date: Thu, 13 Dec 2018 20:01:53 +0800
Subject: [PATCH 21/22] Correct docs about logging

---
 rafiki/model/log.py | 10 +++++-----
 1 file changed, 5 insertions(+), 5 deletions(-)

diff --git a/rafiki/model/log.py b/rafiki/model/log.py
index 1e8661b2..c7dbea4e 100644
--- a/rafiki/model/log.py
+++ b/rafiki/model/log.py
@@ -29,7 +29,7 @@ class MyModel(BaseModel):
         def train(self, dataset_uri):
             ...
             logger.log('Starting model training...')
-            logger.define_plot('Precision & Recall', y_axis=['precision', 'recall'])
+            logger.define_plot('Precision & Recall', ['precision', 'recall'], x_axis=['epoch'])
             ...
             logger.log(precision=0.1, recall=0.6, epoch=1)
             ...
@@ -66,13 +66,13 @@ def define_plot(self, title, metrics, x_axis=None):
 
         For example, a model's precision & recall logged with e.g. ``log(precision=0.1, recall=0.6, epoch=1)``
         can be visualized in the plots generated by
-        ``define_plot('Precision & Recall', y_axis=['precision', 'recall'])`` (against time) or
-        ``define_plot('Precision & Recall', y_axis=['precision', 'recall'], x_axis=['epoch'])`` (against epochs).
+        ``define_plot('Precision & Recall', ['precision', 'recall'])`` (against time) or
+        ``define_plot('Precision & Recall', ['precision', 'recall'], x_axis='epoch')`` (against epochs).
 
         Only call this method in :meth:`rafiki.model.BaseModel.train`.
 
         :param str title: Title of the plot
-        :param str metrics: List of metrics that should be plotted on the y-axis
+        :param metrics: List of metrics that should be plotted on the y-axis
         :type metrics: str[]
         :param str x_axis: Metric that should be plotted on the x-axis, against all other metrics. Defaults to ``'time'``, which is automatically logged
         '''
@@ -89,7 +89,7 @@ def log(self, msg='', **metrics):
 
         :param str msg: Message to be logged
-        :param metrics: Set of metrics & their values to be logged as { <metric>: <value> }, where <value> should be a number.
+        :param metrics: Set of metrics & their values to be logged as ``{ <metric>: <value> }``, where ``<value>`` should be a number. 
:type metrics: dict[str, int|float] ''' if msg: From 700b8dcb292d959054c8305ebfa6c6b049498bb0 Mon Sep 17 00:00:00 2001 From: Ngin Yun Chuan Date: Thu, 13 Dec 2018 20:08:07 +0800 Subject: [PATCH 22/22] Make small correction in model logging docs --- examples/models/image_classification/SkDt.py | 2 +- rafiki/model/log.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/examples/models/image_classification/SkDt.py b/examples/models/image_classification/SkDt.py index 0b390096..c47f06c9 100644 --- a/examples/models/image_classification/SkDt.py +++ b/examples/models/image_classification/SkDt.py @@ -25,7 +25,7 @@ def __init__(self, **knobs): super().__init__(**knobs) self.__dict__.update(knobs) self._clf = self._build_classifier(self.max_depth, self.criterion) - + def train(self, dataset_uri): dataset = dataset_utils.load_dataset_of_image_files(dataset_uri) (images, classes) = zip(*[(image, image_class) for (image, image_class) in dataset]) diff --git a/rafiki/model/log.py b/rafiki/model/log.py index c7dbea4e..fecff004 100644 --- a/rafiki/model/log.py +++ b/rafiki/model/log.py @@ -29,7 +29,7 @@ class MyModel(BaseModel): def train(self, dataset_uri): ... logger.log('Starting model training...') - logger.define_plot('Precision & Recall', ['precision', 'recall'], x_axis=['epoch']) + logger.define_plot('Precision & Recall', ['precision', 'recall'], x_axis='epoch') ... logger.log(precision=0.1, recall=0.6, epoch=1) ...
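
Putting the series together: after these patches, a model declares its tunable knobs with knob classes in a static ``get_knob_config()``, and receives concrete knob values as keyword arguments to ``__init__``. A minimal sketch of the end state, modelled on the ``SkDt``/``SkSvm`` examples — ``MyModel`` is hypothetical, its knob names and ranges are illustrative, and the remaining ``BaseModel`` methods (``train``, ``evaluate``, ``predict``, etc.) are omitted for brevity:

.. code-block:: python

    from rafiki.model import BaseModel, IntegerKnob, FloatKnob, CategoricalKnob, FixedKnob

    class MyModel(BaseModel):
        @staticmethod
        def get_knob_config():
            return {
                'epochs': IntegerKnob(3, 10),
                'learning_rate': FloatKnob(1e-5, 1e-1, is_exp=True),
                'batch_size': CategoricalKnob([16, 32, 64, 128]),
                'image_size': FixedKnob(32)  # Fixed value, excluded from tuning
            }

        def __init__(self, **knobs):
            super().__init__(**knobs)
            # Expose each knob value as an instance attribute, e.g. `self.epochs`,
            # following the `self.__dict__.update(knobs)` pattern from PATCH 20
            self.__dict__.update(knobs)

The advisor treats a ``FixedKnob`` as a single-value categorical (as in ``btb_gp_advisor.py`` above), so it flows through tuning unchanged while still reaching the model as an ordinary knob value.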