This repository has been archived by the owner on Feb 20, 2024. It is now read-only.

Merge pull request #79 from nginyc/add_gpu_aware_placement
[V0.0.8] Add GPU-aware placement (+ model knobs and logging API changes)
nginyc authored Dec 14, 2018
2 parents f313623 + 700b8dc commit 61606c8
Showing 37 changed files with 1,041 additions and 849 deletions.
2 changes: 1 addition & 1 deletion .env.sh
@@ -1,6 +1,6 @@
 # Core configuration for Rafiki
 export DOCKER_NETWORK=rafiki
-export RAFIKI_VERSION=0.0.7
+export RAFIKI_VERSION=0.0.8
 export RAFIKI_IP_ADDRESS=127.0.0.1
 export ADMIN_EXT_PORT=3000
 export ADMIN_WEB_EXT_PORT=3001
6 changes: 5 additions & 1 deletion .gitignore
@@ -25,4 +25,8 @@ data/*
 # Logs
 *.log
 logs/*
-!logs/.gitkeep
+!logs/.gitkeep
+
+# IPython notebooks
+.ipynb_checkpoints/*
+*.ipynb
2 changes: 2 additions & 0 deletions dockerfiles/advisor.Dockerfile
@@ -22,6 +22,8 @@ ENV PYTHONPATH $DOCKER_WORKDIR_PATH
 # Install python dependencies
 COPY rafiki/utils/requirements.txt utils/requirements.txt
 RUN pip install -r utils/requirements.txt
+COPY rafiki/model/requirements.txt model/requirements.txt
+RUN pip install -r model/requirements.txt
 COPY rafiki/advisor/requirements.txt advisor/requirements.txt
 RUN pip install -r advisor/requirements.txt
14 changes: 13 additions & 1 deletion docs/src/dev/setup.rst
@@ -36,7 +36,19 @@ Adding Nodes to Rafiki
 
 Rafiki runs its dynamic stack (e.g. train workers, inference workers, predictors)
 as `Docker Swarm Services <https://docs.docker.com/engine/swarm/services/>`_.
-Horizontal scaling can be done by `adding more nodes to the swarm <https://docs.docker.com/engine/swarm/join-nodes/>`_.
+
+Horizontal scaling can be done by adding more nodes to the swarm.
+
+Perform the following for *each* worker node to be added:
+
+1. Connect the node to the same network as the master, so that the node can `join the master's Docker Swarm <https://docs.docker.com/engine/swarm/join-nodes/>`_.
+
+2. Configure the node with the script:
+
+   .. code-block:: shell
+
+      bash scripts/setup_node.sh
 
 Exposing Rafiki Publicly
 --------------------------------------------------------------------
38 changes: 34 additions & 4 deletions docs/src/python/rafiki.model.rst
@@ -1,17 +1,47 @@
 rafiki.model
 ====================================================================
 
+.. contents:: Table of Contents
+
+Core Classes
+--------------------------------------------------------------------
+
 .. autoclass:: rafiki.model.BaseModel
     :members:
 
+.. autoclass:: rafiki.model.BaseKnob
+    :members:
+
+
+.. _`knob-types`:
+
+Knob Classes
+--------------------------------------------------------------------
+
+.. autoclass:: rafiki.model.CategoricalKnob
+    :members:
+
+.. autoclass:: rafiki.model.IntegerKnob
+    :members:
+
+.. autoclass:: rafiki.model.FloatKnob
+    :members:
+
+.. autoclass:: rafiki.model.FixedKnob
+    :members:
+
+
+Utility Classes & Methods
+--------------------------------------------------------------------
+
 .. automethod:: rafiki.model.test_model_class
 
-.. autoclass:: rafiki.model.log.ModelLogUtils
+.. autoclass:: rafiki.model.ModelLogger
     :members:
 
-.. autoclass:: rafiki.model.dataset.ModelDatasetUtils
+.. autoclass:: rafiki.model.ModelDatasetUtils
     :members:
 
-.. autoclass:: rafiki.model.dataset.ImageFilesDataset
+.. autoclass:: rafiki.model.ImageFilesDataset
 
-.. autoclass:: rafiki.model.dataset.CorpusDataset
+.. autoclass:: rafiki.model.CorpusDataset
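For reference, the knob classes listed above are constructed directly in a model's ``get_knob_config()``. A minimal sketch, adapted from the example models updated in this commit (``FixedKnob`` is omitted, since its constructor does not appear in this diff):

.. code-block:: python

   from rafiki.model import BaseModel, CategoricalKnob, FloatKnob, IntegerKnob

   class MyModel(BaseModel):
       @staticmethod
       def get_knob_config():
           return {
               'max_iter': IntegerKnob(10, 40),               # an integer in [10, 40]
               'kernel': CategoricalKnob(['rbf', 'linear']),  # one of a fixed set of values
               'C': FloatKnob(1e-2, 1e2, is_exp=True),        # a float sampled on an exponential scale
           }
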
5 changes: 3 additions & 2 deletions docs/src/user/client-create-models.include.rst
@@ -1,6 +1,7 @@
 
-To create a model, you will need to submit a model class that extends :class:`rafiki.model.BaseModel` in a single Python file,
-where the model's implementation conforms to a specific task (see :ref:`tasks`).
+To create a model, you will need to submit a model class that conforms to the specification
+of :class:`rafiki.model.BaseModel`, written in a `single` Python file.
+The model's implementation should conform to a specific task (see :ref:`tasks`).
 
 Refer to the parameters of :meth:`rafiki.client.Client.create_model` for configuring how your model runs on Rafiki,
 and refer to :ref:`creating-models` to understand more about how to write & test models for Rafiki.
49 changes: 35 additions & 14 deletions docs/src/user/creating-models.rst
@@ -6,9 +6,41 @@ Creating Models

 .. contents:: Table of Contents
 
+To create a model, you will need to submit a model class that conforms to the specification
+of :class:`rafiki.model.BaseModel`, written in a `single` Python file.
+The model's implementation should conform to a specific task (see :ref:`tasks`).
+To submit the model to Rafiki, use the :meth:`rafiki.client.Client.create_model` method.
 
-To create a model on Rafiki, use the :meth:`rafiki.client.Client.create_model` method.
+Implementing Models
+--------------------------------------------------------------------
+
+Full details on how to implement a model are located in the documentation of :class:`rafiki.model.BaseModel`,
+and sample model implementations are located in `./examples/models/ <https://github.com/nginyc/rafiki/tree/master/examples/models/>`_.
+
+In defining the hyperparameters (knobs) of a model, refer to :ref:`knob-types` for the full list of knob types.
+
+After implementing your model, it is highly recommended to use :meth:`rafiki.model.test_model_class`
+to test it. This method simulates a full train-inference flow on your model, ensuring that
+it is likely to work on Rafiki.
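
A minimal sketch of such a test harness at the bottom of a model file, following the pattern of the example models in this commit — the dataset URIs and the dependency version are placeholders, and the exact parameter names should be checked against the documentation of :meth:`rafiki.model.test_model_class`:

.. code-block:: python

   from rafiki.model import test_model_class
   from rafiki.constants import TaskType, ModelDependency

   if __name__ == '__main__':
       # Hypothetical invocation; simulates a full train-inference flow.
       test_model_class(
           model_file_path=__file__,
           model_class='SkDt',
           task=TaskType.IMAGE_CLASSIFICATION,
           dependencies={ModelDependency.SCIKIT_LEARN: '0.20.0'},
           train_dataset_uri='data/fashion_mnist_train.zip',  # placeholder
           test_dataset_uri='data/fashion_mnist_test.zip'     # placeholder
       )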

+Logging in Models
+--------------------------------------------------------------------
+
+By importing the global ``logger`` instance in the ``rafiki.model`` module,
+you can log messages and metrics while your model is being trained, and you can
+define plots to visualize your model's training on Rafiki's Admin Web interface.
+
+Refer to :class:`rafiki.model.ModelLogger` for full usage instructions.
+
+.. seealso:: :ref:`using-admin-web`
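
A minimal sketch of the logging call, matching its usage in the example models in this commit:

.. code-block:: python

   from rafiki.model import logger

   # Inside your model's train() method, after computing a metric:
   accuracy = 0.95  # placeholder value for illustration
   logger.log('Train accuracy: {}'.format(accuracy))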

+Dataset Loading in Models
+--------------------------------------------------------------------
+
+The global ``dataset_utils`` instance in the ``rafiki.model`` module provides
+a set of built-in dataset loading methods for common dataset types on Rafiki.
+
+Refer to :class:`rafiki.model.ModelDatasetUtils` for full usage instructions.
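
For instance, loading an image-classification dataset inside ``train()`` or ``evaluate()``, exactly as the example models in this commit do (the dataset URI is a placeholder):

.. code-block:: python

   from rafiki.model import dataset_utils

   dataset_uri = 'data/fashion_mnist_train.zip'  # placeholder URI
   dataset = dataset_utils.load_dataset_of_image_files(dataset_uri)
   (images, classes) = zip(*[(image, image_class) for (image, image_class) in dataset])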

 Model Environment
 --------------------------------------------------------------------
@@ -25,21 +57,20 @@
 prior to model training and inference. This is configurable with the ``dependencies`` option
 during model creation.
 
 Alternatively, you can build a custom Docker image that extends ``rafikiai/rafiki_worker``,
-installing the required dependencies for your model. This is configurable with ``docker_image``) option
+installing the required dependencies for your model. This is configurable with the ``docker_image`` option
 during model creation.
 
+Models should at least run on CPU-only machines and optionally leverage a shared GPU, if one is available.
 
 Refer to the parameters of :meth:`rafiki.client.Client.create_model` for configuring how your model runs on Rafiki.
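
A hypothetical sketch of declaring a model's dependencies at creation time — only the ``dependencies`` and ``docker_image`` options are confirmed here, so treat the remaining parameter names and values as placeholders to be checked against :meth:`rafiki.client.Client.create_model`:

.. code-block:: python

   from rafiki.constants import ModelDependency

   # `client` is assumed to be an authenticated rafiki.client.Client instance.
   client.create_model(
       name='SkDt',
       task='IMAGE_CLASSIFICATION',
       model_file_path='examples/models/image_classification/SkDt.py',
       model_class='SkDt',
       dependencies={ModelDependency.SCIKIT_LEARN: '0.20.0'}  # hypothetical version
   )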

-Testing Models
+Sample Models
 --------------------------------------------------------------------
 
 To illustrate how to write models on Rafiki, we have written the following:
 
 - Sample pre-processing logic to convert common dataset formats to Rafiki's own dataset formats in `./examples/datasets/ <https://github.com/nginyc/rafiki/tree/master/examples/datasets/>`_
 - Sample models in `./examples/models/ <https://github.com/nginyc/rafiki/tree/master/examples/models/>`_
-- A method :meth:`rafiki.model.test_model_class` that simulates a full train-inference flow on any Rafiki model
 
 To start testing your model, first install the Python dependencies at ``rafiki/model/requirements.txt``:

@@ -93,13 +124,3 @@ Example: Testing Models for ``POS_TAGGING``
     python examples/models/pos_tagging/BigramHmm.py
     python examples/models/pos_tagging/PyBiLstm.py
-
-Model Logging & Dataset Loading
---------------------------------------------------------------------
-
-:class:`rafiki.model.BaseModel` has a property ``utils`` that subclasses the model utility classes
-:class:`rafiki.model.log.ModelLogUtils` and :class:`rafiki.model.dataset.ModelDatasetUtils`. They
-help with model logging & dataset loading respectively.
-
-Refer to the sample usage in the implementation of `./examples/models/image_classification/TfSingleHiddenLayer.py <https://github.com/nginyc/rafiki/tree/master/examples/models/image_classification/TfSingleHiddenLayer.py>`_.
20 changes: 8 additions & 12 deletions docs/src/user/quickstart.rst
@@ -7,21 +7,17 @@ Quick Start

 .. note::
 
-    If you're a *Model Developer* just looking to contribute models to a running instance of Rafiki, refer to :ref:`quickstart-model-developers`.
+    - If you're a *Model Developer* just looking to contribute models to a running instance of Rafiki, refer to :ref:`quickstart-model-developers`.
+    - If you're an *Application Developer* just looking to train and deploy models on a running instance of Rafiki, refer to :ref:`quickstart-app-developers`.
+    - If you're an *Application User* just looking to make predictions to deployed models on a running instance of Rafiki, refer to :ref:`quickstart-app-users`.
 
-.. note::
-
-    If you're an *Application Developer* just looking to train and deploy models on a running instance of Rafiki, refer to :ref:`quickstart-app-developers`.
-
-.. note::
-
-    If you're an *Application User* just looking to make predictions to deployed models on a running instance of Rafiki, refer to :ref:`quickstart-app-users`.
-
-This guide assumes you have deployed an empty instance of Rafiki and you want to try a *full* train-inference flow,
-including adding of models, submitting a train job and submitting an inference job to Rafiki.
-Below, the sequence of examples submit the `Fashion MNIST dataset <https://github.com/zalandoresearch/fashion-mnist>`_ for training and inference.
-Alternatively, after installing Rafiki Client's dependencies, you can run `./examples/scripts/client_quickstart.py <https://github.com/nginyc/rafiki/blob/master/examples/scripts/client_quickstart.py>`_.
+This guide assumes you have deployed an empty instance of Rafiki and you want to do a *full* train-inference flow,
+including preparation of the dataset and adding of models to Rafiki.
+The sequence of examples below submits the `Fashion MNIST dataset <https://github.com/zalandoresearch/fashion-mnist>`_ for training and inference.
+Alternatively, after installing the Rafiki Client's dependencies, you can refer to and run the scripted version of this quickstart,
+`./examples/scripts/client_quickstart.py <https://github.com/nginyc/rafiki/blob/master/examples/scripts/client_quickstart.py>`_.

.. note::

39 changes: 15 additions & 24 deletions examples/models/image_classification/SkDt.py
@@ -5,38 +5,29 @@
 import base64
 import numpy as np
 
-from rafiki.model import BaseModel, InvalidModelParamsException, test_model_class
+from rafiki.config import APP_MODE
+from rafiki.model import BaseModel, InvalidModelParamsException, test_model_class, \
+    IntegerKnob, CategoricalKnob, dataset_utils, logger
 from rafiki.constants import TaskType, ModelDependency
 
 class SkDt(BaseModel):
     '''
     Implements a decision tree classifier on Scikit-Learn for simple image classification
     '''
 
-    def get_knob_config(self):
+    @staticmethod
+    def get_knob_config():
         return {
-            'knobs': {
-                'max_depth': {
-                    'type': 'int',
-                    'range': [2, 8]
-                },
-                'criterion': {
-                    'type': 'string',
-                    'values': ['gini', 'entropy']
-                },
-            }
+            'max_depth': IntegerKnob(2, 16 if APP_MODE != 'DEV' else 8),
+            'criterion': CategoricalKnob(['gini', 'entropy'])
         }
 
-    def init(self, knobs):
-        self._max_depth = knobs.get('max_depth')
-        self._criterion = knobs.get('criterion')
-        self._clf = self._build_classifier(
-            self._max_depth,
-            self._criterion
-        )
+    def __init__(self, **knobs):
+        super().__init__(**knobs)
+        self.__dict__.update(knobs)
+        self._clf = self._build_classifier(self.max_depth, self.criterion)
 
     def train(self, dataset_uri):
-        dataset = self.utils.load_dataset_of_image_files(dataset_uri)
+        dataset = dataset_utils.load_dataset_of_image_files(dataset_uri)
         (images, classes) = zip(*[(image, image_class) for (image, image_class) in dataset])
         X = self._prepare_X(images)
         y = classes
@@ -45,10 +36,10 @@ def train(self, dataset_uri):
         # Compute train accuracy
         preds = self._clf.predict(X)
         accuracy = sum(y == preds) / len(y)
-        self.utils.log('Train accuracy: {}'.format(accuracy))
+        logger.log('Train accuracy: {}'.format(accuracy))
 
     def evaluate(self, dataset_uri):
-        dataset = self.utils.load_dataset_of_image_files(dataset_uri)
+        dataset = dataset_utils.load_dataset_of_image_files(dataset_uri)
         (images, classes) = zip(*[(image, image_class) for (image, image_class) in dataset])
         X = self._prepare_X(images)
         y = classes
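The rewritten constructor above relies on each chosen knob value arriving as a keyword argument and becoming an instance attribute. A minimal self-contained illustration of that pattern (not Rafiki code):

.. code-block:: python

   class Example:
       def __init__(self, **knobs):
           # e.g. knobs = {'max_depth': 8, 'criterion': 'gini'}
           self.__dict__.update(knobs)

   m = Example(max_depth=8, criterion='gini')
   assert m.max_depth == 8 and m.criterion == 'gini'
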
49 changes: 15 additions & 34 deletions examples/models/image_classification/SkSvm.py
@@ -5,57 +5,38 @@
 import base64
 import numpy as np
 
-from rafiki.model import BaseModel, InvalidModelParamsException, test_model_class
+from rafiki.config import APP_MODE
+from rafiki.model import BaseModel, InvalidModelParamsException, test_model_class, \
+    IntegerKnob, CategoricalKnob, FloatKnob, dataset_utils
 from rafiki.constants import TaskType, ModelDependency
 
 class SkSvm(BaseModel):
     '''
     Implements an SVM on Scikit-Learn for simple image classification
     '''
 
-    def get_knob_config(self):
+    @staticmethod
+    def get_knob_config():
         return {
-            'knobs': {
-                'max_iter': {
-                    'type': 'int',
-                    'range': [10, 10]
-                },
-                'kernel': {
-                    'type': 'string',
-                    'values': ['rbf', 'linear']
-                },
-                'gamma': {
-                    'type': 'string',
-                    'values': ['scale', 'auto']
-                },
-                'C': {
-                    'type': 'float_exp',
-                    'range': [1e-2, 1e2]
-                }
-            }
+            'max_iter': IntegerKnob(10, 40 if APP_MODE != 'DEV' else 10),
+            'kernel': CategoricalKnob(['rbf', 'linear']),
+            'gamma': CategoricalKnob(['scale', 'auto']),
+            'C': FloatKnob(1e-2, 1e2, is_exp=True)
         }
 
-    def init(self, knobs):
-        self._max_iter = knobs.get('max_iter')
-        self._kernel = knobs.get('kernel')
-        self._gamma = knobs.get('gamma')
-        self._C = knobs.get('C')
-        self._clf = self._build_classifier(
-            self._max_iter,
-            self._kernel,
-            self._gamma,
-            self._C
-        )
+    def __init__(self, **knobs):
+        super().__init__(**knobs)
+        self.__dict__.update(knobs)
+        self._clf = self._build_classifier(self.max_iter, self.kernel, self.gamma, self.C)
 
     def train(self, dataset_uri):
-        dataset = self.utils.load_dataset_of_image_files(dataset_uri)
+        dataset = dataset_utils.load_dataset_of_image_files(dataset_uri)
         (images, classes) = zip(*[(image, image_class) for (image, image_class) in dataset])
         X = self._prepare_X(images)
         y = classes
         self._clf.fit(X, y)
 
     def evaluate(self, dataset_uri):
-        dataset = self.utils.load_dataset_of_image_files(dataset_uri)
+        dataset = dataset_utils.load_dataset_of_image_files(dataset_uri)
         (images, classes) = zip(*[(image, image_class) for (image, image_class) in dataset])
         X = self._prepare_X(images)
         y = classes