diff --git a/.circleci/config.yml b/.circleci/config.yml new file mode 100644 index 0000000..6525aa6 --- /dev/null +++ b/.circleci/config.yml @@ -0,0 +1,127 @@ +version: 2.1 + +jobs: + test_gradient_model_py36: + docker: + - image: circleci/python:3.6.9 + working_directory: ~/project/packages/gradient_boosting_model + steps: + - checkout: + path: ~/project + - run: + name: Run tests with Python 3.6 + command: | + sudo pip install --upgrade pip + pip install --user tox + tox -e py36 + test_gradient_model_py37: + docker: + - image: circleci/python:3.7.6 + working_directory: ~/project/packages/gradient_boosting_model + steps: + - checkout: + path: ~/project + - run: + name: Run tests with Python 3.7 + command: | + sudo pip install --upgrade pip + pip install --user tox + tox -e py37 + test_gradient_model_py38: + docker: + - image: circleci/python:3.8.0 + working_directory: ~/project/packages/gradient_boosting_model + steps: + - checkout: + path: ~/project + - run: + name: Run tests with Python 3.8 + command: | + sudo pip install --upgrade pip + pip install --user tox + tox -e py38 + test_ml_api_py36: + docker: + - image: circleci/python:3.6.9 + - image: postgres + environment: + POSTGRES_USER: test_user + POSTGRES_PASSWORD: password + POSTGRES_DB: ml_api_test + environment: + DB_HOST: localhost + DB_PORT: 5432 + DB_USER: test_user + DB_PASSWORD: password + DB_NAME: ml_api_test + SHADOW_MODE_ACTIVE: true + working_directory: ~/project/packages/ml_api + steps: + - checkout: + path: ~/project + - run: + name: Run API tests with Python 3.6 + command: | + sudo pip install --upgrade pip + pip install --user tox + tox -e py36 + test_ml_api_py37: + docker: + - image: circleci/python:3.7.6 + - image: postgres + environment: + POSTGRES_USER: test_user + POSTGRES_PASSWORD: password + POSTGRES_DB: ml_api_test + environment: + DB_HOST: localhost + DB_PORT: 5432 + DB_USER: test_user + DB_PASSWORD: password + DB_NAME: ml_api_test + SHADOW_MODE_ACTIVE: true + working_directory: 
~/project/packages/ml_api + steps: + - checkout: + path: ~/project + - run: + name: Run API tests with Python 3.7 + command: | + sudo pip install --upgrade pip + pip install --user tox + tox -e py37 + test_ml_api_py38: + docker: + - image: circleci/python:3.8.1 + - image: postgres + environment: + POSTGRES_USER: test_user + POSTGRES_PASSWORD: password + POSTGRES_DB: ml_api_test + environment: + DB_HOST: localhost + DB_PORT: 5432 + DB_USER: test_user + DB_PASSWORD: password + DB_NAME: ml_api_test + SHADOW_MODE_ACTIVE: true + working_directory: ~/project/packages/ml_api + steps: + - checkout: + path: ~/project + - run: + name: Run API tests with Python 3.8 + command: | + sudo pip install --upgrade pip + pip install --user tox + tox -e py38 +workflows: + version: 2 + test-all: + jobs: + - test_gradient_model_py36 + - test_gradient_model_py37 + - test_gradient_model_py38 + - test_ml_api_py36 + - test_ml_api_py37 + - test_ml_api_py38 diff --git a/.dockerignore b/.dockerignore new file mode 100644 index 0000000..c853bc5 --- /dev/null +++ b/.dockerignore @@ -0,0 +1,18 @@ +exercise_notebooks/* +*/env* +*/venv* +.circleci* +packages/gradient_boosting_model +*.env +*.log +.git +.gitignore +.dockerignore +*.mypy_cache +*.pytest_cache + +### Python ### + +# Byte-compiled / optimized / DLL files +__pycache__/ +*.py[cod] diff --git a/.gitignore b/.gitignore index 5346ee9..f887dad 100644 --- a/.gitignore +++ b/.gitignore @@ -89,6 +89,7 @@ venv/ ENV/ env.bak/ venv.bak/ +.tox/ # Spyder project settings .spyderproject @@ -124,3 +125,7 @@ test.csv # trained models packages/gradient_boosting_model/gradient_boosting_model/trained_models/*.pkl *.h5 + +# differential test artifacts +packages/ml_api/differential_tests/expected_results/ +packages/ml_api/differential_tests/actual_results/ diff --git a/README.md b/README.md index 95280ac..304b6fb 100644 --- a/README.md +++ b/README.md @@ -1,2 +1 @@ -# testing-and-monitoring-ml-deployments -WIP +Example project for the course "Testing & 
Monitoring Machine Learning Model Deployments". For setup instructions, see the course lectures. diff --git a/exercise_notebooks/elk_exercise/Dockerfile b/exercise_notebooks/elk_exercise/Dockerfile new file mode 100644 index 0000000..86647f9 --- /dev/null +++ b/exercise_notebooks/elk_exercise/Dockerfile @@ -0,0 +1,23 @@ +FROM python:3.7-alpine +WORKDIR /application + +COPY ./requirements.txt requirements.txt +RUN apk add --no-cache \ + gcc \ + libc-dev \ + linux-headers \ + bash; \ + pip install -r requirements.txt; + +COPY . /application + + +EXPOSE 5000 +VOLUME /application +CMD gunicorn --bind 0.0.0.0:5000 \ + --workers=1 \ + --log-config gunicorn_logging.conf \ + --log-level=DEBUG \ + --access-logfile=- \ + --error-logfile=- \ + application:application diff --git a/exercise_notebooks/elk_exercise/app/__init__.py b/exercise_notebooks/elk_exercise/app/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/exercise_notebooks/elk_exercise/app/flask_app.py b/exercise_notebooks/elk_exercise/app/flask_app.py new file mode 100644 index 0000000..e688b45 --- /dev/null +++ b/exercise_notebooks/elk_exercise/app/flask_app.py @@ -0,0 +1,18 @@ +import logging + +from flask import Flask, current_app + + +def index(): + current_app.logger.info('home') + return 'home' + + +def create_app(): + main_app = Flask(__name__) + main_app.add_url_rule('/', 'index', index) + gunicorn_error_logger = logging.getLogger('gunicorn.error') + main_app.logger.addHandler(gunicorn_error_logger) + main_app.logger.setLevel(logging.DEBUG) + + return main_app diff --git a/exercise_notebooks/elk_exercise/application.py b/exercise_notebooks/elk_exercise/application.py new file mode 100644 index 0000000..e03e2a0 --- /dev/null +++ b/exercise_notebooks/elk_exercise/application.py @@ -0,0 +1,7 @@ +from app.flask_app import create_app + + +application = create_app() + +if __name__ == '__main__': + application.run() diff --git a/exercise_notebooks/elk_exercise/docker-compose.yml 
b/exercise_notebooks/elk_exercise/docker-compose.yml new file mode 100644 index 0000000..7c73164 --- /dev/null +++ b/exercise_notebooks/elk_exercise/docker-compose.yml @@ -0,0 +1,91 @@ +version: '3.2' + +services: + # The environment variable "ELK_VERSION" is used throughout this file to + # specify the version of the images to run. The default is set in the + # '.env' file in this folder. It can be overridden with any normal + # technique for setting environment variables, for example: + # + # ELK_VERSION=7.0.0-beta1 docker-compose up + # + # REF: https://docs.docker.com/compose/compose-file/#variable-substitution + webapp: + build: . + container_name: webapp + expose: + - 5000 + ports: + - 5000:5000 + links: + - logstash + networks: + - elk + depends_on: + - logstash + - kibana + - elasticsearch + volumes: + - ./:/application + elasticsearch: + image: docker.elastic.co/elasticsearch/elasticsearch:${ELK_VERSION} + volumes: + - type: bind + source: ./elasticsearch/config/elasticsearch.yml + target: /usr/share/elasticsearch/config/elasticsearch.yml + read_only: true + - type: volume + source: elasticsearch + target: /usr/share/elasticsearch/data + ports: + - "9200:9200" + - "9300:9300" + environment: + ES_JAVA_OPTS: "-Xmx256m -Xms256m" + ELASTIC_PASSWORD: changeme + # Use single node discovery in order to disable production mode and avoid bootstrap checks + # see https://www.elastic.co/guide/en/elasticsearch/reference/current/bootstrap-checks.html + discovery.type: single-node + networks: + - elk + + logstash: + image: docker.elastic.co/logstash/logstash:${ELK_VERSION} + volumes: + - type: bind + source: ./logstash/config/logstash.yml + target: /usr/share/logstash/config/logstash.yml + read_only: true + - type: bind + source: ./logstash/pipeline + target: /usr/share/logstash/pipeline + read_only: true + ports: + - "5001:5001" + - "9600:9600" + environment: + LS_JAVA_OPTS: "-Xmx256m -Xms256m" + networks: + - elk + depends_on: + - elasticsearch + + kibana: + image: 
docker.elastic.co/kibana/kibana:${ELK_VERSION} + volumes: + - type: bind + source: ./kibana/config/kibana.yml + target: /usr/share/kibana/config/kibana.yml + read_only: true + ports: + - "5601:5601" + networks: + - elk + depends_on: + - elasticsearch + +networks: + elk: + driver: bridge + +volumes: + elasticsearch: \ No newline at end of file diff --git a/exercise_notebooks/elk_exercise/elasticsearch/config/elasticsearch.yml b/exercise_notebooks/elk_exercise/elasticsearch/config/elasticsearch.yml new file mode 100644 index 0000000..cbed5c3 --- /dev/null +++ b/exercise_notebooks/elk_exercise/elasticsearch/config/elasticsearch.yml @@ -0,0 +1,11 @@ +--- +## Default Elasticsearch configuration from Elasticsearch base image. +## https://github.com/elastic/elasticsearch/blob/master/distribution/docker/src/docker/config/elasticsearch.yml +cluster.name: "docker-cluster" +network.host: 0.0.0.0 + +## X-Pack settings +## see https://www.elastic.co/guide/en/elasticsearch/reference/current/setup-xpack.html +xpack.license.self_generated.type: basic +xpack.security.enabled: true +xpack.monitoring.collection.enabled: true diff --git a/exercise_notebooks/elk_exercise/gunicorn_logging.conf b/exercise_notebooks/elk_exercise/gunicorn_logging.conf new file mode 100644 index 0000000..7ec8e8c --- /dev/null +++ b/exercise_notebooks/elk_exercise/gunicorn_logging.conf @@ -0,0 +1,46 @@ +[loggers] +keys=root, logstash.error, logstash.access + +[handlers] +keys=console, logstash + +[formatters] +keys=generic, access, json + +[logger_root] +level=INFO +handlers=console + +[logger_logstash.error] +level=INFO +handlers=logstash +propagate=1 +qualname=gunicorn.error + +[logger_logstash.access] +level=INFO +handlers=logstash +propagate=0 +qualname=gunicorn.access + +[handler_console] +class=StreamHandler +formatter=generic +args=(sys.stdout, ) + +[handler_logstash] +class=logstash.TCPLogstashHandler +formatter=json +args=('logstash', 5001) + +[formatter_generic] +format=%(asctime)s [%(process)d] 
[%(levelname)s] %(message)s +datefmt=%Y-%m-%d %H:%M:%S +class=logging.Formatter + +[formatter_access] +format=%(message)s +class=logging.Formatter + +[formatter_json] +class=pythonjsonlogger.jsonlogger.JsonFormatter \ No newline at end of file diff --git a/exercise_notebooks/elk_exercise/kibana/config/kibana.yml b/exercise_notebooks/elk_exercise/kibana/config/kibana.yml new file mode 100644 index 0000000..93380e9 --- /dev/null +++ b/exercise_notebooks/elk_exercise/kibana/config/kibana.yml @@ -0,0 +1,13 @@ +--- +## Default Kibana configuration from Kibana base image. +## https://github.com/elastic/kibana/blob/master/src/dev/build/tasks/os_packages/docker_generator/templates/kibana_yml.template.js +# +server.name: kibana +server.host: "0" +elasticsearch.hosts: [ "http://elasticsearch:9200" ] +xpack.monitoring.ui.container.elasticsearch.enabled: true + +## X-Pack security credentials +# +elasticsearch.username: elastic +elasticsearch.password: changeme \ No newline at end of file diff --git a/exercise_notebooks/elk_exercise/logstash/config/logstash.yml b/exercise_notebooks/elk_exercise/logstash/config/logstash.yml new file mode 100644 index 0000000..a48c35f --- /dev/null +++ b/exercise_notebooks/elk_exercise/logstash/config/logstash.yml @@ -0,0 +1,12 @@ +--- +## Default Logstash configuration from Logstash base image. 
+## https://github.com/elastic/logstash/blob/master/docker/data/logstash/config/logstash-full.yml +# +http.host: "0.0.0.0" +xpack.monitoring.elasticsearch.hosts: [ "http://elasticsearch:9200" ] + +## X-Pack security credentials +# +xpack.monitoring.enabled: true +xpack.monitoring.elasticsearch.username: elastic +xpack.monitoring.elasticsearch.password: changeme diff --git a/exercise_notebooks/elk_exercise/logstash/pipeline/logstash.conf b/exercise_notebooks/elk_exercise/logstash/pipeline/logstash.conf new file mode 100644 index 0000000..7c273f0 --- /dev/null +++ b/exercise_notebooks/elk_exercise/logstash/pipeline/logstash.conf @@ -0,0 +1,17 @@ +input { + tcp { + port => 5001 + tags => ["webapp_logs"] + type => "webapp_logs" + codec => json + } +} + +output { + elasticsearch { + hosts => "elasticsearch:9200" + user => "elastic" + password => "changeme" + index => "webapp_logs-%{+YYYY.MM.dd}" + } +} \ No newline at end of file diff --git a/exercise_notebooks/elk_exercise/requirements.txt b/exercise_notebooks/elk_exercise/requirements.txt new file mode 100644 index 0000000..6607dd0 --- /dev/null +++ b/exercise_notebooks/elk_exercise/requirements.txt @@ -0,0 +1,5 @@ +Flask>=1.1.1,<1.2.0 +python3-logstash>=0.4.80,<0.5.0 +python-json-logger>=0.1.11,<0.2.0 +gunicorn>=20.0.4,<20.1.0 + diff --git a/exercise_notebooks/prometheus_exercise/Dockerfile b/exercise_notebooks/prometheus_exercise/Dockerfile new file mode 100644 index 0000000..4fc5705 --- /dev/null +++ b/exercise_notebooks/prometheus_exercise/Dockerfile @@ -0,0 +1,17 @@ +FROM python:3.7-alpine +WORKDIR /application + +COPY ./requirements.txt requirements.txt +RUN apk add --no-cache \ + gcc \ + libc-dev \ + linux-headers \ + bash; \ + pip install -r requirements.txt; + +COPY . 
/application + + +EXPOSE 5000 +VOLUME /application +CMD gunicorn --workers=1 --bind 0.0.0.0:5000 application:application diff --git a/exercise_notebooks/prometheus_exercise/app/__init__.py b/exercise_notebooks/prometheus_exercise/app/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/exercise_notebooks/prometheus_exercise/app/flask_app.py b/exercise_notebooks/prometheus_exercise/app/flask_app.py new file mode 100644 index 0000000..9d2357d --- /dev/null +++ b/exercise_notebooks/prometheus_exercise/app/flask_app.py @@ -0,0 +1,45 @@ +import prometheus_client +from flask import Flask +from werkzeug.middleware.dispatcher import DispatcherMiddleware +from app.helpers.middleware import setup_metrics + + +def index(): + return 'home' + + +def cpu(): + # For older machines, you may want to lower + # this range to prevent timeouts. + for i in range(10000): + i**i + + return 'cpu intensive operation complete' + + +def memory(): + d = {} + # For older machines, you may want to lower + # this range to prevent timeouts. 
+ for i in range(10000000): + i = str(i) + i += "xyz" + d[i] = i + + return 'memory intensive operation complete' + + +def create_app(): + main_app = Flask(__name__) + main_app.add_url_rule('/', 'index', index) + main_app.add_url_rule('/cpu', 'cpu', cpu) + main_app.add_url_rule('/memory', 'memory', memory) + setup_metrics(main_app) + + # Add prometheus wsgi middleware to route /metrics requests + app = DispatcherMiddleware( + app=main_app.wsgi_app, + mounts={'/metrics': prometheus_client.make_wsgi_app()} + ) + + return app diff --git a/exercise_notebooks/prometheus_exercise/app/helpers/__init__.py b/exercise_notebooks/prometheus_exercise/app/helpers/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/exercise_notebooks/prometheus_exercise/app/helpers/middleware.py b/exercise_notebooks/prometheus_exercise/app/helpers/middleware.py new file mode 100644 index 0000000..f547ee3 --- /dev/null +++ b/exercise_notebooks/prometheus_exercise/app/helpers/middleware.py @@ -0,0 +1,58 @@ +from flask import request, Flask +from flask.wrappers import Response +from prometheus_client import Counter, Histogram +import time + + +# Counter and Histogram are examples of default metrics +# available from the prometheus Python client. 
+REQUEST_COUNT = Counter( + name='http_request_count', + documentation='App Request Count', + labelnames=['app_name', 'method', 'endpoint', 'http_status'] +) +REQUEST_LATENCY = Histogram( + name='http_request_latency_seconds', + documentation='Request latency', + labelnames=['app_name', 'endpoint'] +) + + +def start_timer() -> None: + """Get start time of a request.""" + request._prometheus_metrics_request_start_time = time.time() + + +def stop_timer(response: Response) -> Response: + """Get stop time of a request..""" + request_latency = time.time() - request._prometheus_metrics_request_start_time + REQUEST_LATENCY.labels( + app_name='webapp', + endpoint=request.path).observe(request_latency) + return response + + +def record_request_data(response: Response) -> Response: + """Capture request data. + + Uses the flask request object to extract information such as + the HTTP request method, endpoint and HTTP status. + """ + REQUEST_COUNT.labels( + app_name='webapp', + method=request.method, + endpoint=request.path, + http_status=response.status_code).inc() + return response + + +def setup_metrics(app: Flask) -> None: + """Setup Prometheus metrics. + + This function uses the flask before_request + and after_request hooks to capture metrics + with each HTTP request to the application. 
+ """ + app.before_request(start_timer) + app.after_request(record_request_data) + app.after_request(stop_timer) diff --git a/exercise_notebooks/prometheus_exercise/application.py b/exercise_notebooks/prometheus_exercise/application.py new file mode 100644 index 0000000..e03e2a0 --- /dev/null +++ b/exercise_notebooks/prometheus_exercise/application.py @@ -0,0 +1,7 @@ +from app.flask_app import create_app + + +application = create_app() + +if __name__ == '__main__': + application.run() diff --git a/exercise_notebooks/prometheus_exercise/config/grafana/basic_cadvisor_dashboard.json b/exercise_notebooks/prometheus_exercise/config/grafana/basic_cadvisor_dashboard.json new file mode 100644 index 0000000..b621f02 --- /dev/null +++ b/exercise_notebooks/prometheus_exercise/config/grafana/basic_cadvisor_dashboard.json @@ -0,0 +1,605 @@ +{ + "annotations": { + "list": [ + { + "builtIn": 1, + "datasource": "-- Grafana --", + "enable": true, + "hide": true, + "iconColor": "rgba(0, 211, 255, 1)", + "name": "Annotations & Alerts", + "type": "dashboard" + } + ] + }, + "description": "Docker monitoring with Prometheus and cAdvisor with node selection", + "editable": true, + "gnetId": 8321, + "graphTooltip": 1, + "id": 6, + "iteration": 1578215128428, + "links": [], + "panels": [ + { + "cacheTimeout": null, + "colorBackground": false, + "colorValue": false, + "colors": [ + "rgba(245, 54, 54, 0.9)", + "rgba(237, 129, 40, 0.89)", + "rgba(50, 172, 45, 0.97)" + ], + "datasource": "Prometheus", + "editable": true, + "error": false, + "format": "none", + "gauge": { + "maxValue": 100, + "minValue": 0, + "show": false, + "thresholdLabels": false, + "thresholdMarkers": true + }, + "gridPos": { + "h": 3, + "w": 8, + "x": 0, + "y": 0 + }, + "height": "20", + "id": 7, + "interval": null, + "isNew": true, + "links": [], + "mappingType": 1, + "mappingTypes": [ + { + "name": "value to text", + "value": 1 + }, + { + "name": "range to text", + "value": 2 + } + ], + "maxDataPoints": 100, + 
"nullPointMode": "connected", + "nullText": null, + "options": {}, + "postfix": "", + "postfixFontSize": "50%", + "prefix": "", + "prefixFontSize": "50%", + "rangeMaps": [ + { + "from": "null", + "text": "N/A", + "to": "null" + } + ], + "sparkline": { + "fillColor": "rgba(31, 118, 189, 0.18)", + "full": false, + "lineColor": "rgb(31, 120, 193)", + "show": false + }, + "tableColumn": "", + "targets": [ + { + "expr": "count(container_last_seen{instance=~\"$node:$port\",job=~\"$job\",image!=\"\"})", + "intervalFactor": 2, + "legendFormat": "", + "metric": "container_last_seen", + "refId": "A", + "step": 240 + } + ], + "thresholds": "", + "title": "Running containers", + "transparent": true, + "type": "singlestat", + "valueFontSize": "80%", + "valueMaps": [ + { + "op": "=", + "text": "N/A", + "value": "null" + } + ], + "valueName": "avg" + }, + { + "cacheTimeout": null, + "colorBackground": false, + "colorValue": false, + "colors": [ + "rgba(245, 54, 54, 0.9)", + "rgba(237, 129, 40, 0.89)", + "rgba(50, 172, 45, 0.97)" + ], + "datasource": "Prometheus", + "editable": true, + "error": false, + "format": "mbytes", + "gauge": { + "maxValue": 100, + "minValue": 0, + "show": false, + "thresholdLabels": false, + "thresholdMarkers": true + }, + "gridPos": { + "h": 3, + "w": 8, + "x": 8, + "y": 0 + }, + "height": "20", + "id": 5, + "interval": null, + "isNew": true, + "links": [], + "mappingType": 1, + "mappingTypes": [ + { + "name": "value to text", + "value": 1 + }, + { + "name": "range to text", + "value": 2 + } + ], + "maxDataPoints": 100, + "nullPointMode": "connected", + "nullText": null, + "options": {}, + "postfix": "", + "postfixFontSize": "50%", + "prefix": "", + "prefixFontSize": "50%", + "rangeMaps": [ + { + "from": "null", + "text": "N/A", + "to": "null" + } + ], + "sparkline": { + "fillColor": "rgba(31, 118, 189, 0.18)", + "full": false, + "lineColor": "rgb(31, 120, 193)", + "show": false + }, + "tableColumn": "", + "targets": [ + { + "expr": 
"sum(container_memory_usage_bytes{instance=~\"$node:$port\",job=~\"$job\",image!=\"\"})/1024/1024", + "intervalFactor": 2, + "legendFormat": "", + "metric": "container_memory_usage_bytes", + "refId": "A", + "step": 240 + } + ], + "thresholds": "", + "title": "Total Memory Usage", + "transparent": true, + "type": "singlestat", + "valueFontSize": "80%", + "valueMaps": [ + { + "op": "=", + "text": "N/A", + "value": "null" + } + ], + "valueName": "current" + }, + { + "cacheTimeout": null, + "colorBackground": false, + "colorValue": false, + "colors": [ + "rgba(245, 54, 54, 0.9)", + "rgba(237, 129, 40, 0.89)", + "rgba(50, 172, 45, 0.97)" + ], + "datasource": "Prometheus", + "editable": true, + "error": false, + "format": "percent", + "gauge": { + "maxValue": 100, + "minValue": 0, + "show": false, + "thresholdLabels": false, + "thresholdMarkers": true + }, + "gridPos": { + "h": 3, + "w": 8, + "x": 16, + "y": 0 + }, + "height": "20", + "id": 6, + "interval": null, + "isNew": true, + "links": [], + "mappingType": 1, + "mappingTypes": [ + { + "name": "value to text", + "value": 1 + }, + { + "name": "range to text", + "value": 2 + } + ], + "maxDataPoints": 100, + "nullPointMode": "connected", + "nullText": null, + "options": {}, + "postfix": "", + "postfixFontSize": "50%", + "prefix": "", + "prefixFontSize": "50%", + "rangeMaps": [ + { + "from": "null", + "text": "N/A", + "to": "null" + } + ], + "sparkline": { + "fillColor": "rgba(31, 118, 189, 0.18)", + "full": false, + "lineColor": "rgb(31, 120, 193)", + "show": false + }, + "tableColumn": "", + "targets": [ + { + "expr": "sum(rate(container_cpu_user_seconds_total{instance=~\"$node:$port\",job=~\"$job\",image!=\"\"}[5m]) * 100)", + "intervalFactor": 2, + "legendFormat": "", + "metric": "container_memory_usage_bytes", + "refId": "A", + "step": 240 + } + ], + "thresholds": "", + "title": "Total CPU Usage", + "transparent": true, + "type": "singlestat", + "valueFontSize": "80%", + "valueMaps": [ + { + "op": "=", + "text": 
"N/A", + "value": "null" + } + ], + "valueName": "current" + }, + { + "aliasColors": {}, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": "Prometheus", + "decimals": 2, + "editable": true, + "error": false, + "fill": 1, + "fillGradient": 0, + "grid": {}, + "gridPos": { + "h": 7, + "w": 24, + "x": 0, + "y": 3 + }, + "hiddenSeries": false, + "id": 2, + "isNew": true, + "legend": { + "alignAsTable": true, + "avg": true, + "current": true, + "max": false, + "min": false, + "rightSide": true, + "show": true, + "total": false, + "values": true + }, + "lines": true, + "linewidth": 2, + "links": [], + "nullPointMode": "connected", + "options": { + "dataLinks": [] + }, + "percentage": false, + "pointradius": 5, + "points": false, + "renderer": "flot", + "seriesOverrides": [], + "spaceLength": 10, + "stack": false, + "steppedLine": false, + "targets": [ + { + "expr": "rate(container_cpu_user_seconds_total{instance=~\"$node:$port\",job=~\"$job\",image!=\"\"}[5m]) * 100", + "intervalFactor": 2, + "legendFormat": "{{name}}", + "metric": "cpu", + "refId": "A", + "step": 10 + } + ], + "thresholds": [], + "timeFrom": null, + "timeRegions": [], + "timeShift": null, + "title": "CPU Usage", + "tooltip": { + "msResolution": false, + "shared": true, + "sort": 0, + "value_type": "cumulative" + }, + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [] + }, + "yaxes": [ + { + "format": "percent", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": true + }, + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": true + } + ], + "yaxis": { + "align": false, + "alignLevel": null + } + }, + { + "aliasColors": {}, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": "Prometheus", + "decimals": 2, + "editable": true, + "error": false, + "fill": 1, + "fillGradient": 0, + "grid": {}, + "gridPos": { + "h": 7, + "w": 24, + "x": 0, + 
"y": 10 + }, + "hiddenSeries": false, + "id": 1, + "isNew": true, + "legend": { + "alignAsTable": true, + "avg": true, + "current": true, + "max": false, + "min": false, + "rightSide": true, + "show": true, + "total": false, + "values": true + }, + "lines": true, + "linewidth": 2, + "links": [], + "nullPointMode": "connected", + "options": { + "dataLinks": [] + }, + "percentage": false, + "pointradius": 5, + "points": false, + "renderer": "flot", + "seriesOverrides": [], + "spaceLength": 10, + "stack": false, + "steppedLine": false, + "targets": [ + { + "expr": "container_memory_usage_bytes{instance=~\"$node:$port\",job=~\"$job\",image!=\"\"}", + "hide": false, + "intervalFactor": 2, + "legendFormat": "{{name}}", + "metric": "container_memory_usage_bytes", + "refId": "A", + "step": 10 + } + ], + "thresholds": [], + "timeFrom": null, + "timeRegions": [], + "timeShift": null, + "title": "Memory Usage", + "tooltip": { + "msResolution": false, + "shared": true, + "sort": 0, + "value_type": "cumulative" + }, + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [] + }, + "yaxes": [ + { + "format": "bytes", + "label": "", + "logBase": 1, + "max": null, + "min": null, + "show": true + }, + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": false + } + ], + "yaxis": { + "align": false, + "alignLevel": null + } + } + ], + "refresh": "5s", + "schemaVersion": 21, + "style": "dark", + "tags": [ + "docker" + ], + "templating": { + "list": [ + { + "allValue": null, + "current": { + "text": "cadvisor", + "value": "cadvisor" + }, + "datasource": "Prometheus", + "definition": "", + "hide": 0, + "includeAll": false, + "label": "Job", + "multi": false, + "name": "job", + "options": [], + "query": "label_values(container_cpu_user_seconds_total, job)", + "refresh": 1, + "regex": "", + "skipUrlSync": false, + "sort": 1, + "tagValuesQuery": "", + "tags": [], + "tagsQuery": "", + 
"type": "query", + "useTags": false + }, + { + "allValue": null, + "current": { + "text": "cadvisor", + "value": "cadvisor" + }, + "datasource": "Prometheus", + "definition": "", + "hide": 0, + "includeAll": false, + "label": "Host:", + "multi": false, + "name": "node", + "options": [], + "query": "label_values(container_cpu_user_seconds_total{job=~\"$job\"}, instance)", + "refresh": 1, + "regex": "/([^:]+):.*/", + "skipUrlSync": false, + "sort": 1, + "tagValuesQuery": "", + "tags": [], + "tagsQuery": "", + "type": "query", + "useTags": false + }, + { + "allValue": null, + "current": { + "text": "8080", + "value": "8080" + }, + "datasource": "Prometheus", + "definition": "", + "hide": 0, + "includeAll": false, + "label": "Port", + "multi": false, + "name": "port", + "options": [], + "query": "label_values(container_cpu_user_seconds_total{instance=~\"$node:(.*)\"}, instance)", + "refresh": 1, + "regex": "/[^:]+:(.*)/", + "skipUrlSync": false, + "sort": 3, + "tagValuesQuery": "", + "tags": [], + "tagsQuery": "", + "type": "query", + "useTags": false + } + ] + }, + "time": { + "from": "now-5m", + "to": "now" + }, + "timepicker": { + "refresh_intervals": [ + "5s", + "10s", + "30s", + "1m", + "5m", + "15m", + "30m", + "1h", + "2h", + "1d" + ], + "time_options": [ + "5m", + "15m", + "1h", + "6h", + "12h", + "24h", + "2d", + "7d", + "30d" + ] + }, + "timezone": "browser", + "title": "Docker monitoring with node selection", + "uid": "pHUTSjLZk", + "version": 2 +} \ No newline at end of file diff --git a/exercise_notebooks/prometheus_exercise/config/grafana/grafana_flask_basic_dashboard.json b/exercise_notebooks/prometheus_exercise/config/grafana/grafana_flask_basic_dashboard.json new file mode 100644 index 0000000..b5a52ca --- /dev/null +++ b/exercise_notebooks/prometheus_exercise/config/grafana/grafana_flask_basic_dashboard.json @@ -0,0 +1,224 @@ +{ + "annotations": { + "list": [ + { + "builtIn": 1, + "datasource": "-- Grafana --", + "enable": true, + "hide": true, + 
"iconColor": "rgba(0, 211, 255, 1)", + "name": "Annotations & Alerts", + "type": "dashboard" + } + ] + }, + "editable": true, + "gnetId": null, + "graphTooltip": 0, + "id": 1, + "links": [], + "panels": [ + { + "aliasColors": {}, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": "Prometheus", + "fill": 1, + "fillGradient": 0, + "gridPos": { + "h": 9, + "w": 12, + "x": 0, + "y": 0 + }, + "hiddenSeries": false, + "id": 2, + "legend": { + "avg": false, + "current": false, + "max": false, + "min": false, + "show": true, + "total": false, + "values": false + }, + "lines": true, + "linewidth": 1, + "nullPointMode": "null", + "options": { + "dataLinks": [] + }, + "percentage": false, + "pointradius": 2, + "points": false, + "renderer": "flot", + "seriesOverrides": [], + "spaceLength": 10, + "stack": false, + "steppedLine": false, + "targets": [ + { + "expr": "rate(http_request_count_total{job=\"webapp\"}[1m])", + "legendFormat": "{{app_name}} {{endpoint}} {{http_status}}", + "refId": "A" + } + ], + "thresholds": [], + "timeFrom": null, + "timeRegions": [], + "timeShift": null, + "title": "Requests Rate", + "tooltip": { + "shared": true, + "sort": 0, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [] + }, + "yaxes": [ + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": true + }, + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": true + } + ], + "yaxis": { + "align": false, + "alignLevel": null + } + }, + { + "aliasColors": {}, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": "Prometheus", + "fill": 1, + "fillGradient": 0, + "gridPos": { + "h": 9, + "w": 12, + "x": 12, + "y": 0 + }, + "hiddenSeries": false, + "id": 3, + "legend": { + "avg": false, + "current": false, + "max": false, + "min": false, + "show": true, + "total": false, + 
"values": false + }, + "lines": true, + "linewidth": 1, + "nullPointMode": "null", + "options": { + "dataLinks": [] + }, + "percentage": false, + "pointradius": 2, + "points": false, + "renderer": "flot", + "seriesOverrides": [], + "spaceLength": 10, + "stack": false, + "steppedLine": false, + "targets": [ + { + "expr": "rate(http_request_latency_seconds_sum{job=\"webapp\"}[1m]) / rate(http_request_latency_seconds_count{job=\"webapp\"}[1m])", + "legendFormat": "{{endpoint}} (seconds)", + "refId": "B" + } + ], + "thresholds": [], + "timeFrom": null, + "timeRegions": [], + "timeShift": null, + "title": "Latency", + "tooltip": { + "shared": true, + "sort": 0, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [] + }, + "yaxes": [ + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": true + }, + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": true + } + ], + "yaxis": { + "align": false, + "alignLevel": null + } + } + ], + "schemaVersion": 21, + "style": "dark", + "tags": [], + "templating": { + "list": [] + }, + "time": { + "from": "now-5m", + "to": "now" + }, + "timepicker": { + "refresh_intervals": [ + "5s", + "10s", + "30s", + "1m", + "5m", + "15m", + "30m", + "1h", + "2h", + "1d" + ] + }, + "timezone": "", + "title": "Really Simple Flask Dashboard", + "uid": "q8vgEpLZk", + "version": 4 +} \ No newline at end of file diff --git a/exercise_notebooks/prometheus_exercise/config/prometheus/prometheus.yml b/exercise_notebooks/prometheus_exercise/config/prometheus/prometheus.yml new file mode 100644 index 0000000..19d7bd8 --- /dev/null +++ b/exercise_notebooks/prometheus_exercise/config/prometheus/prometheus.yml @@ -0,0 +1,42 @@ +# my global config +global: + scrape_interval: 15s # By default, scrape targets every 15 seconds. 
+ evaluation_interval: 15s # Evaluate rules every 15 seconds. + # scrape_timeout is set to the global default (10s). + + # Attach these labels to any time series or alerts when communicating with + # external systems (federation, remote storage, Alertmanager). + external_labels: + monitor: 'my-project' + +# Scrape configurations for the three endpoints monitored here: +# Prometheus itself, the webapp, and cAdvisor. +scrape_configs: + # The job name is added as a label `job=<job_name>` to any timeseries scraped from this config. + - job_name: 'prometheus' + + # Override the global default and scrape targets from this job every 5 seconds. + scrape_interval: 5s + + # metrics_path defaults to '/metrics' + # scheme defaults to 'http'. + + static_configs: + - targets: ['prometheus:9090'] + - job_name: 'webapp' + + # Override the global default and scrape targets from this job every 5 seconds. + scrape_interval: 5s + + # metrics_path defaults to '/metrics' + # scheme defaults to 'http'. + static_configs: + - targets: ['webapp:5000'] + + - job_name: 'cadvisor' + + # Override the global default and scrape targets from this job every 5 seconds. + scrape_interval: 5s + + static_configs: + - targets: ['cadvisor:8080'] diff --git a/exercise_notebooks/prometheus_exercise/docker-compose.yml b/exercise_notebooks/prometheus_exercise/docker-compose.yml new file mode 100644 index 0000000..522e59b --- /dev/null +++ b/exercise_notebooks/prometheus_exercise/docker-compose.yml @@ -0,0 +1,51 @@ +version: '3' + +volumes: + prometheus_data: {} + grafana_data: {} + +services: + webapp: + build: .
+ container_name: webapp + expose: + - 5000 + ports: + - 5000:5000 + volumes: + - ./:/application + prometheus: + image: prom/prometheus + container_name: prometheus + volumes: + - ./config/prometheus/:/etc/prometheus/ + - prometheus_data:/prometheus + command: + - '--config.file=/etc/prometheus/prometheus.yml' + expose: + - 9090 + ports: + - 9090:9090 + depends_on: + - cadvisor + grafana: + image: grafana/grafana + depends_on: + - prometheus + ports: + - 3000:3000 + volumes: + - grafana_data:/var/lib/grafana + environment: + - GF_SECURITY_ADMIN_PASSWORD=foobar + - GF_USERS_ALLOW_SIGN_UP=false + + cadvisor: + image: google/cadvisor + volumes: + - /:/rootfs:ro + - /var/run:/var/run:rw + - /sys:/sys:ro + - /var/lib/docker/:/var/lib/docker:ro + ports: + - 8080:8080 diff --git a/exercise_notebooks/prometheus_exercise/requirements.txt b/exercise_notebooks/prometheus_exercise/requirements.txt new file mode 100644 index 0000000..0fbe48f --- /dev/null +++ b/exercise_notebooks/prometheus_exercise/requirements.txt @@ -0,0 +1,4 @@ +Flask>=1.1.1,<1.2.0 +prometheus_client>=0.7.1,<0.8.0 +gunicorn>=20.0.4,<20.1.0 + diff --git a/exercise_notebooks/shadow_mode_exercise/assessing_model_results.ipynb b/exercise_notebooks/shadow_mode_exercise/assessing_model_results.ipynb new file mode 100755 index 0000000..1733a2d --- /dev/null +++ b/exercise_notebooks/shadow_mode_exercise/assessing_model_results.ipynb @@ -0,0 +1,1740 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Setup\n", + "\n", + "1. Checkout the code at commit: **\"Shadow Mode ML Code - Analyse Results\"**\n", + "\n", + "\n", + "2. Make sure your virtualenv is active, and you have installed the dependencies listed in the requirements.txt located in the same directory as this notebook.\n", + "\n", + "\n", + "3. Make sure you have copied the Kaggle houseprice.csv into the same directory as this notebook. 
By this point in the course, you should have this csv file here:\n", + "\n", + "testing-and-monitoring-ml-deployments/packages/gradient_boosting_model/gradient_boosting_model/datasets/\n", + "\n", + "\n", + "4. Before running any cells in the notebook, start your docker containers: Open the terminal/Command prompt and navigate to this directory:\n", + "\n", + "testing-and-monitoring-ml-deployments/packages/ml_api\n", + "\n", + "5. Then run: `docker-compose -f docker/docker-compose.yml up -d --build`\n", + "\n", + "(Users of older Windows versions, remember to run: `docker-machine start default` followed by `docker-machine env` before the docker compose command)\n", + "\n", + "6. Populate the DB with simulated shadow data by running\n", + "`tox -e generate_predictions` (also from the ml_api directory)\n", + "\n", + "Note: Feel free to run the populate DB command multiple times, however if you generate more than 10k requests then some of the tests below will fail.\n", + "\n", + "**Keep in mind that the populate_database.py script has some element of randomness, and that you and I may run it a different number of times, so the results shown in this notebook are indicative, and yours may not be identical**\n", + "\n", + "Focus instead on the take-home messages derived from the tests."
+ ] + }, + { + "cell_type": "code", + "execution_count": 1, + "metadata": {}, + "outputs": [], + "source": [ + "import matplotlib.pyplot as plt\n", + "import numpy as np\n", + "import pandas as pd\n", + "import scipy.stats as stats\n", + "import seaborn as sns\n", + "from sqlalchemy import create_engine\n", + "\n", + "import json" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "A reminder that SQLAlchemy DB URIs look like this:\n", + "`postgres+psycopg2://myuser:mypassword@hackersdb.example.com:5432/mydatabase`" + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "postgres+psycopg2://user:password@localhost:6609/ml_api_dev\n" + ] + } + ], + "source": [ + "# note that this connection string can be found in the app DevelopmentConfig.\n", + "\n", + "# to save hassle with updating the PATH so we can import the config object,\n", + "# we write it out in full here:\n", + "\n", + "db_uri = \"postgres+psycopg2://user:password@localhost:6609/ml_api_dev\"\n", + "print(db_uri)" + ] + }, + { + "cell_type": "code", + "execution_count": 3, + "metadata": {}, + "outputs": [], + "source": [ + "# connect to the database\n", + "\n", + "engine = create_engine(db_uri)" + ] + }, + { + "cell_type": "code", + "execution_count": 4, + "metadata": {}, + "outputs": [], + "source": [ + "# Let's load our live data, that is, the predictions we generated and are stored\n", + "# in our database (takes about 30 seconds to run)\n", + "\n", + "sql_df = pd.read_sql_table(\"gradient_boosting_model_predictions\", con=engine)\n", + "\n", + "# munge json array of inputs from postgres jsonb field.\n", + "inputs_df = sql_df.inputs.apply(\n", + " lambda row: pd.DataFrame(json.loads(row))).tolist()\n", + "\n", + "live_data = pd.concat(inputs_df, sort=False)\n", + "outputs_df = sql_df.outputs.apply(lambda row: pd.Series(json.loads(row)))\n", + "live_data['SalePrice'] =
outputs_df.values\n", + "live_data.reset_index(inplace=True)" + ] + }, + { + "cell_type": "code", + "execution_count": 5, + "metadata": {}, + "outputs": [], + "source": [ + "# rename live columns to match training data file\n", + "\n", + "SECONDARY_VARIABLES_TO_RENAME = {\n", + " \"FirstFlrSF\": \"1stFlrSF\",\n", + " \"SecondFlrSF\": \"2ndFlrSF\",\n", + " \"ThreeSsnPortch\": \"3SsnPorch\",\n", + "}\n", + "live_data.rename(columns=SECONDARY_VARIABLES_TO_RENAME, inplace=True)" + ] + }, + { + "cell_type": "code", + "execution_count": 6, + "metadata": {}, + "outputs": [], + "source": [ + "# columns ==> inputs to the new model\n", + "\n", + "model_features = ['LotArea', 'OverallQual', 'YearRemodAdd',\n", + " 'BsmtQual', 'BsmtFinSF1','TotalBsmtSF',\n", + " '1stFlrSF', '2ndFlrSF', 'GrLivArea',\n", + " 'GarageCars', 'YrSold']\n", + "\n", + "# From the live data, we select only those variables that\n", + "# are actually used in the model\n", + "# and the predictions (SalePrice)\n", + "\n", + "live_data = live_data[model_features + ['SalePrice']]" + ] + }, + { + "cell_type": "code", + "execution_count": 7, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
LotAreaOverallQualYearRemodAddBsmtQualBsmtFinSF1TotalBsmtSF1stFlrSF2ndFlrSFGrLivAreaGarageCarsYrSoldSalePrice
1228378151969TA280.0720.0212562713082.02008124359.797157
12291529961976TA0.01444.04860120414442.02008157332.471992
12302205051974Gd0.0896.02384174817922.02008167039.397270
1231805241969Gd252.0936.0459511639361.02008121015.582065
12322850351971Gd119.0864.014033698642.02008145224.114862
\n", + "
" + ], + "text/plain": [ + " LotArea OverallQual YearRemodAdd BsmtQual BsmtFinSF1 TotalBsmtSF \\\n", + "1228 3781 5 1969 TA 280.0 720.0 \n", + "1229 15299 6 1976 TA 0.0 1444.0 \n", + "1230 22050 5 1974 Gd 0.0 896.0 \n", + "1231 8052 4 1969 Gd 252.0 936.0 \n", + "1232 28503 5 1971 Gd 119.0 864.0 \n", + "\n", + " 1stFlrSF 2ndFlrSF GrLivArea GarageCars YrSold SalePrice \n", + "1228 2125 627 1308 2.0 2008 124359.797157 \n", + "1229 4860 1204 1444 2.0 2008 157332.471992 \n", + "1230 2384 1748 1792 2.0 2008 167039.397270 \n", + "1231 4595 1163 936 1.0 2008 121015.582065 \n", + "1232 1403 369 864 2.0 2008 145224.114862 " + ] + }, + "execution_count": 7, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "live_data.tail()" + ] + }, + { + "cell_type": "code", + "execution_count": 8, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
LotAreaOverallQualYearRemodAddBsmtQualBsmtFinSF1TotalBsmtSF1stFlrSF2ndFlrSFGrLivAreaGarageCarsYrSoldSalePrice
0845072003Gd706856856854171022008208500
1960061976Gd978126212620126222007181500
21125072002Gd486920920866178622008223500
3955071970TA216756961756171732006140000
41426082000Gd655114511451053219832008250000
\n", + "
" + ], + "text/plain": [ + " LotArea OverallQual YearRemodAdd BsmtQual BsmtFinSF1 TotalBsmtSF \\\n", + "0 8450 7 2003 Gd 706 856 \n", + "1 9600 6 1976 Gd 978 1262 \n", + "2 11250 7 2002 Gd 486 920 \n", + "3 9550 7 1970 TA 216 756 \n", + "4 14260 8 2000 Gd 655 1145 \n", + "\n", + " 1stFlrSF 2ndFlrSF GrLivArea GarageCars YrSold SalePrice \n", + "0 856 854 1710 2 2008 208500 \n", + "1 1262 0 1262 2 2007 181500 \n", + "2 920 866 1786 2 2008 223500 \n", + "3 961 756 1717 3 2006 140000 \n", + "4 1145 1053 2198 3 2008 250000 " + ] + }, + "execution_count": 8, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "# Now let's load the data we used to train the model\n", + "\n", + "# remember to copy the houseprice.csv data to the directory of this Jupyter notebook\n", + "# or alternatively change the path below to find the file\n", + "\n", + "\n", + "# load needed columns + target\n", + "train_data = pd.read_csv('houseprice.csv',\n", + " usecols=model_features + ['SalePrice'])\n", + "\n", + "\n", + "train_data.head()" + ] + }, + { + "cell_type": "code", + "execution_count": 9, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "((1460, 12), (1233, 12))" + ] + }, + "execution_count": 9, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "# let's compare the shapes of both live and training data:\n", + "\n", + "train_data.shape, live_data.shape" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Take this outputs as a demo of what to expect, and don't worry too much if the results are not identical. The more times you run the populate_database.py script, the larger the live_data will be." 
+ ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Data Checks\n", + "\n", + "### Input checks - categorical variables\n", + "\n", + "We only have one categorical variable among our features:\n", + "\n", + "BsmtQual (Categorical): Evaluates the height of the basement\n", + "\n", + " Ex\tExcellent (100+ inches)\t\n", + " Gd\tGood (90-99 inches)\n", + " TA\tTypical (80-89 inches)\n", + " Fa\tFair (70-79 inches)\n", + " Po\tPoor (<70 inches)\n", + " NA\tNo Basement\n", + " \n", + "These are the values allowed according to how the variable was defined, and as we can see it can also take missing values.\n", + "\n", + "\n", + "You can find more details about the variable definitions and their permitted values here:\n", + "\n", + "[Source](http://bee-fore.s3-eu-west-1.amazonaws.com/datasets/62.txt)\n", + "\n", + "The first test aims to corroborate that live data takes only the permitted values. We can do as follows:" + ] + }, + { + "cell_type": "code", + "execution_count": 10, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "array(['Gd', 'TA', 'Ex', nan, 'Fa'], dtype=object)" + ] + }, + "execution_count": 10, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "# let's evaluate the unique values in our training data\n", + "\n", + "train_data['BsmtQual'].unique()" + ] + }, + { + "cell_type": "code", + "execution_count": 11, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "array(['TA', 'Gd', 'Ex', 'Fa'], dtype=object)" + ] + }, + "execution_count": 11, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "# let's now evaluate the unique values in our live data\n", + "\n", + "live_data['BsmtQual'].unique()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "We can see that both our training and live data take only the permitted values. We also see that the category Po is not present in either of the data sets, which is curious. 
Probably there were not that many basements in Poor conditions.\n", + "\n", + "If we wanted, we could write a short test as follows, and any number bigger than 0 would fail:" + ] + }, + { + "cell_type": "code", + "execution_count": 12, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "0" + ] + }, + "execution_count": 12, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "len([x for x in live_data['BsmtQual'] if x not in ['Gd', 'TA', 'Ex', np.nan, 'Fa']])" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Value range checks - Numerical variables\n", + "\n", + "We have a numerical and discrete variable in our data set that can only take 1 of the permitted values:\n", + "\n", + "OverallQual (Ordinal): Rates the overall material and finish of the house\n", + "\n", + " 10\tVery Excellent\n", + " 9\tExcellent\n", + " 8\tVery Good\n", + " 7\tGood\n", + " 6\tAbove Average\n", + " 5\tAverage\n", + " 4\tBelow Average\n", + " 3\tFair\n", + " 2\tPoor\n", + " 1\tVery Poor\n", + " \n", + "Given that the number of different permitted values is small, we could do an input check as we did with BsmtQual, or, for the sake of the demo, we can check value ranges:" + ] + }, + { + "cell_type": "code", + "execution_count": 13, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "min 1\n", + "max 10\n", + "Name: OverallQual, dtype: int64" + ] + }, + "execution_count": 13, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "# we know that the min and max values are 1 and 10 according to the\n", + "# variable definition, we can indeed check that as follows:\n", + "\n", + "train_data['OverallQual'].agg(['min', 'max'])" + ] + }, + { + "cell_type": "code", + "execution_count": 14, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "min 2\n", + "max 10\n", + "Name: OverallQual, dtype: int64" + ] + }, + "execution_count": 14, + "metadata": {}, + "output_type": 
"execute_result" + } + ], + "source": [ + "# let's repeat for the live variable \n", + "\n", + "live_data['OverallQual'].agg(['min', 'max'])" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "The values are within the range, so we have no reason to worry.\n", + "\n", + "We could write a small test as follows and any return bigger than 0 should fail:" + ] + }, + { + "cell_type": "code", + "execution_count": 15, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "0" + ] + }, + "execution_count": 15, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "len([x for x in live_data['OverallQual'] if x >10 or x <1])" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Excellent, we see that the variables in both our train and live data take values within the permitted range.\n", + "\n", + "**Note** we could also check value ranges for the remaining of the numerical variables, we just need to be confident that the variables can take only values within the range we want to evaluate.\n", + "\n", + "\n", + "### Missing value checks\n", + "\n", + "We know from our research phase, that these variables should not take missing data:\n", + "\n", + "numerical_na_not_allowed:\n", + " - LotArea\n", + " - OverallQual\n", + " - YearRemodAdd\n", + " - BsmtFinSF1\n", + " - TotalBsmtSF\n", + " - FirstFlrSF\n", + " - SecondFlrSF\n", + " - GrLivArea\n", + " - GarageCars\n", + " - YrSold" + ] + }, + { + "cell_type": "code", + "execution_count": 16, + "metadata": {}, + "outputs": [], + "source": [ + "# let's capture the above in a list\n", + "\n", + "numerical_na_not_allowed = ['LotArea', 'OverallQual', 'YearRemodAdd',\n", + " 'BsmtFinSF1', 'TotalBsmtSF', 'GarageCars',\n", + " 'YrSold']" + ] + }, + { + "cell_type": "code", + "execution_count": 17, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "LotArea 0.0\n", + "OverallQual 0.0\n", + "YearRemodAdd 0.0\n", + "BsmtFinSF1 0.0\n", + 
"TotalBsmtSF 0.0\n", + "GarageCars 0.0\n", + "YrSold 0.0\n", + "dtype: float64" + ] + }, + "execution_count": 17, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "# they should not take missing values in our train data\n", + "# let's check that:\n", + "\n", + "train_data[numerical_na_not_allowed].isnull().mean()" + ] + }, + { + "cell_type": "code", + "execution_count": 18, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "LotArea 0.0\n", + "OverallQual 0.0\n", + "YearRemodAdd 0.0\n", + "BsmtFinSF1 0.0\n", + "TotalBsmtSF 0.0\n", + "GarageCars 0.0\n", + "YrSold 0.0\n", + "dtype: float64" + ] + }, + "execution_count": 18, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "# let's check if that is the case in our live data\n", + "# as well\n", + "\n", + "live_data[numerical_na_not_allowed].isnull().mean()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Perfect, as we can see, none of the variables that we are receiving live takes missing data where missing data is not expected. \n", + "\n", + "If we had gotten a value other than zero, we should probably investigate what's going on. We could have a bug in our code, or the variable could have changed its definition." + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Distribution checks\n", + "\n", + "### First, check proportion of missing values\n", + "\n", + "We know that BsmtQual can take missing data. We can, and should, test whether the proportion of missing data that we are getting live is the same as the one we observed in our training data set. We can do so using Fisher's exact test, as follows:" + ] + }, + { + "cell_type": "code", + "execution_count": 19, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "0 False\n", + "1 False\n", + "2 False\n", + "3 False\n", + "4 False\n", + " ...
\n", + "1455 False\n", + "1456 False\n", + "1457 False\n", + "1458 False\n", + "1459 False\n", + "Name: BsmtQual, Length: 1460, dtype: bool" + ] + }, + "execution_count": 19, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "# check number of na in the variable in the train set\n", + "\n", + "train_data['BsmtQual'].isnull()" + ] + }, + { + "cell_type": "code", + "execution_count": 20, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "0 False\n", + "1 False\n", + "2 False\n", + "3 False\n", + "4 False\n", + " ... \n", + "1228 False\n", + "1229 False\n", + "1230 False\n", + "1231 False\n", + "1232 False\n", + "Name: BsmtQual, Length: 1233, dtype: bool" + ] + }, + "execution_count": 20, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "# check number of na in the live data\n", + "\n", + "live_data['BsmtQual'].isnull()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "We see that we get zero, that already tells us that something may not be quite right.\n", + "\n", + "We can go ahead and test this with a statistical test. The fisher exact test as implemented in Scipy. In order to run this test, we need to create a 2x2 table, where live and train data are the columns and 0, or 1 indicating missing values are the rows, and the numbers represent the number of observations within each cell.\n", + "\n", + "Let's go ahead and do that:" + ] + }, + { + "cell_type": "code", + "execution_count": 21, + "metadata": {}, + "outputs": [], + "source": [ + "# first, make 2 binary variables where we indicate if the variable took\n", + "# a missing value or not:\n", + "\n", + "train_data['BsmtQual_na'] = np.where(train_data['BsmtQual'].isnull(), 1, 0)\n", + "live_data['BsmtQual_na'] = np.where(live_data['BsmtQual'].isnull(), 1, 0)" + ] + }, + { + "cell_type": "code", + "execution_count": 22, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
trainlive
BsmtQual_na
014231233.0
137NaN
\n", + "
" + ], + "text/plain": [ + " train live\n", + "BsmtQual_na \n", + "0 1423 1233.0\n", + "1 37 NaN" + ] + }, + "execution_count": 22, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "# now let's generate the 2x2 table:\n", + "\n", + "ct = pd.concat([\n", + " train_data.groupby('BsmtQual_na')['BsmtQual_na'].count(),\n", + " live_data.groupby('BsmtQual_na')['BsmtQual_na'].count()\n", + "], axis=1)\n", + "\n", + "ct.columns = ['train', 'live']\n", + "\n", + "ct" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "As expected, the table contains missing data in the second row of the live data, because as we saw, there were no missing values. We can't pass a table with np.nan to the test, so we need to fill it out." + ] + }, + { + "cell_type": "code", + "execution_count": 23, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
trainlive
BsmtQual_na
014231233.0
1370.0
\n", + "
" + ], + "text/plain": [ + " train live\n", + "BsmtQual_na \n", + "0 1423 1233.0\n", + "1 37 0.0" + ] + }, + "execution_count": 23, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "# now we need to build a contingency table for this test\n", + "\n", + "ct.fillna(0, inplace=True)\n", + "ct" + ] + }, + { + "cell_type": "code", + "execution_count": 24, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "1.2719419961163252e-10" + ] + }, + "execution_count": 24, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "# and now we compare frequencies with Fisher's exact test\n", + "\n", + "oddsratio, pvalue = stats.fisher_exact(ct)\n", + "\n", + "pvalue" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Aha, the p-value indicates that the distributions are significantly different, which is not surprising given that we got 0 missing values in our live data. \n", + "\n", + "Is this a problem?\n", + "\n", + "We should start digging in our production code to see if we introduced either a bug, an exception or some sort of data filtering. If none of this happened, maybe the variable definition changed, for example.\n", + "\n", + "\n", + "### Categorical distribution test\n", + "\n", + "Similarly, we can use the same test to evaluate the proportion of categories we are getting in the categorical variables of our live data.
\n", + "\n", + "As this contingency table is not a 2x2 table, we need to use a different implementation of the test, as follows:" + ] + }, + { + "cell_type": "code", + "execution_count": 25, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "TA 649\n", + "Gd 618\n", + "Ex 121\n", + "Missing 37\n", + "Fa 35\n", + "Name: BsmtQual, dtype: int64" + ] + }, + "execution_count": 25, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "# we first need the expected frequencies in each category\n", + "# that is the number of observations per category in the train data\n", + "\n", + "# we fill missing values with the string \"Missing\" as we did in our\n", + "# preprocessing steps\n", + "\n", + "train_data['BsmtQual'].fillna('Missing').value_counts()" + ] + }, + { + "cell_type": "code", + "execution_count": 26, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "TA 555\n", + "Gd 489\n", + "Ex 129\n", + "Fa 60\n", + "Name: BsmtQual, dtype: int64" + ] + }, + "execution_count": 26, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "# now we need the received frequencies in each category\n", + "# in the live data data\n", + "\n", + "# we also fill in missing values with the string \"Missing\"\n", + "\n", + "live_data['BsmtQual'].fillna('Missing').value_counts()" + ] + }, + { + "cell_type": "code", + "execution_count": 27, + "metadata": {}, + "outputs": [], + "source": [ + "# we need to create 2 series of the same size\n", + "# with the counts we displayed in the previous cells\n", + "\n", + "ct = train_data['BsmtQual'].fillna('Missing').value_counts()\n", + "cl = live_data['BsmtQual'].fillna('Missing').value_counts()" + ] + }, + { + "cell_type": "code", + "execution_count": 28, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "TA 555\n", + "Gd 489\n", + "Ex 129\n", + "Fa 60\n", + "Name: BsmtQual, dtype: int64" + ] + }, + "execution_count": 28, + "metadata": {}, + 
"output_type": "execute_result" + } + ], + "source": [ + "cl" + ] + }, + { + "cell_type": "code", + "execution_count": 29, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "TA 555.0\n", + "Gd 489.0\n", + "Ex 129.0\n", + "Fa 60.0\n", + "Missing 0.1\n", + "Name: BsmtQual, dtype: float64" + ] + }, + "execution_count": 29, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "# let's add the missing category to the live data\n", + "# (I add 0.1 to avoid divide by zero errors in the test below)\n", + "\n", + "cl['Missing'] = 0.1\n", + "cl" + ] + }, + { + "cell_type": "code", + "execution_count": 30, + "metadata": {}, + "outputs": [], + "source": [ + "# let's sort the index\n", + "\n", + "ct.sort_index(inplace=True)\n", + "cl.sort_index(inplace=True)" + ] + }, + { + "cell_type": "code", + "execution_count": 31, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "Ex 121\n", + "Fa 35\n", + "Gd 618\n", + "Missing 37\n", + "TA 649\n", + "Name: BsmtQual, dtype: int64" + ] + }, + "execution_count": 31, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "ct" + ] + }, + { + "cell_type": "code", + "execution_count": 32, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "Ex 129.0\n", + "Fa 60.0\n", + "Gd 489.0\n", + "Missing 0.1\n", + "TA 555.0\n", + "Name: BsmtQual, dtype: float64" + ] + }, + "execution_count": 32, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "cl" + ] + }, + { + "cell_type": "code", + "execution_count": 33, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "Power_divergenceResult(statistic=13676.964186265019, pvalue=0.0)" + ] + }, + "execution_count": 33, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "# and now we compare frequencies with chi-square\n", + "\n", + "stats.chisquare(f_obs=ct,\n", + " f_exp=cl)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": 
[ + "As we can see from the test, the p-value is 0, so the distributions in the live and train data are significantly different, which is what we expect given that we are not getting missing data in our live variable.\n", + "\n", + "For variables that are discrete in nature, like OverallQual, we could also use the above test, to compare the distributions. Let's do that:" + ] + }, + { + "cell_type": "code", + "execution_count": 34, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "Power_divergenceResult(statistic=137.5069575787079, pvalue=3.3726853043045783e-25)" + ] + }, + "execution_count": 34, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "# create value counts series\n", + "\n", + "ct = train_data['OverallQual'].value_counts()\n", + "cl = live_data['OverallQual'].value_counts()\n", + "\n", + "cl[1] = 0.1\n", + "\n", + "ct.sort_index(inplace=True)\n", + "cl.sort_index(inplace=True)\n", + "\n", + "# and now we compare frequencies with chi-square\n", + "\n", + "stats.chisquare(f_obs=ct,\n", + " f_exp=cl)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "This variable as well shows a different category distribution in the live data, compared to train data." + ] + }, + { + "cell_type": "code", + "execution_count": 35, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "" + ] + }, + "execution_count": 35, + "metadata": {}, + "output_type": "execute_result" + }, + { + "data": { + "image/png": "\n", + "text/plain": [ + "
" + ] + }, + "metadata": { + "needs_background": "light" + }, + "output_type": "display_data" + } + ], + "source": [ + "# to investigate this further we can plot the number of observations\n", + "# per value, in train and live data, as follows:\n", + "\n", + "tmp = pd.concat([ct,cl], axis=1)\n", + "tmp.columns = ['train', 'live']\n", + "tmp.plot.bar()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + " We do see that there is a slight difference in the distribution of the 2 sources of data: we expected less observations for the values 2 and 9, and more for the values 1 and 10 in the live data.\n", + " \n", + "As a follow up, we need to dig out the source of these discrepancies. Either we have a bug, or some sort of data filtering in our production code, or the variables may have changed their definitions\n", + "\n", + "For continuous variables, we can use Kolmogorov-Smirnov as follows:\n", + "\n", + "### Numerical variable tests" + ] + }, + { + "cell_type": "code", + "execution_count": 36, + "metadata": {}, + "outputs": [ + { + "data": { + "image/png": "\n", + "text/plain": [ + "
" + ] + }, + "metadata": { + "needs_background": "light" + }, + "output_type": "display_data" + } + ], + "source": [ + "# let's inspect the distributions of our variables first\n", + "\n", + "train_data.hist(bins=30, figsize=(10,10))\n", + "plt.show()" + ] + }, + { + "cell_type": "code", + "execution_count": 37, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "Ks_2sampResult(statistic=0.6634714306347143, pvalue=5.1793802892230377e-256)" + ] + }, + "execution_count": 37, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "# let's test a few variables with the KS test:\n", + "\n", + "stats.ks_2samp(train_data['LotArea'], live_data['LotArea'])" + ] + }, + { + "cell_type": "code", + "execution_count": 38, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "Ks_2sampResult(statistic=0.08687964536879646, pvalue=7.606756582567709e-05)" + ] + }, + "execution_count": 38, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "stats.ks_2samp(train_data['GrLivArea'], live_data['GrLivArea'])" + ] + }, + { + "cell_type": "code", + "execution_count": 39, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "Ks_2sampResult(statistic=0.04766523347665234, pvalue=0.09196049437201348)" + ] + }, + "execution_count": 39, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "stats.ks_2samp(train_data['BsmtFinSF1'], live_data['BsmtFinSF1'])" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "The p-values may differ slightly between your run and this notebook, but the distributions seem statistically different for most of the variables (here LotArea and GrLivArea; for BsmtFinSF1 the p-value of about 0.09 is not significant at the 5% level).\n", + "\n", + "So let's inspect this in more detail:" + ] + }, + { + "cell_type": "code", + "execution_count": 40, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "" + ] + }, + "execution_count": 40, + "metadata": {}, + "output_type": "execute_result" + }, + { + "data": { + "image/png": 
"\n", + "text/plain": [ + "
" + ] + }, + "metadata": { + "needs_background": "light" + }, + "output_type": "display_data" + } + ], + "source": [ + "fig, ax = plt.subplots()\n", + "\n", + "sns.kdeplot(train_data['LotArea'], ax=ax, label='train')\n", + "sns.kdeplot(live_data['LotArea'], ax=ax, label='live')" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "As we can see from this plot, the variable distribution is dramatically different!\n", + "\n", + "This should send all sort of alerts, and we should investigate further the reason of this data shift, as the performance of our model could be impaired." + ] + }, + { + "cell_type": "code", + "execution_count": 41, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "" + ] + }, + "execution_count": 41, + "metadata": {}, + "output_type": "execute_result" + }, + { + "data": { + "image/png": "iVBORw0KGgoAAAANSUhEUgAAAYYAAAD4CAYAAADo30HgAAAABHNCSVQICAgIfAhkiAAAAAlwSFlzAAALEgAACxIB0t1+/AAAADh0RVh0U29mdHdhcmUAbWF0cGxvdGxpYiB2ZXJzaW9uMy4xLjMsIGh0dHA6Ly9tYXRwbG90bGliLm9yZy+AADFEAAAgAElEQVR4nO3dd3hc1Zn48e87o16sLhfJtmRb7jYusjGdYIqBBNMxCQlsKOnZJPtLYrIhm7DJLiS7KSQQSnBCWIJxgASHDgk9YFs27lVYstV776M5vz/ulZHkkTS2Jd2Z0ft5Hj9z59xzz7znYdA79557zhVjDEoppVQPl9MBKKWUCiyaGJRSSvWhiUEppVQfmhiUUkr1oYlBKaVUH2FOBzAcUlNTTVZWltNhKKVUUNm6dWu1MSatf3lIJIasrCzy8vKcDkMppYKKiBzxVa6XkpRSSvWhiUEppVQfmhiUUkr1ERJjDEopdaK6urooLi6mvb3d6VBGXFRUFJmZmYSHh/tVXxODUmpMKi4uJj4+nqysLETE6XBGjDGGmpoaiouLyc7O9usYvZSklBqT2tvbSUlJCemkACAipKSknNCZkSYGpdSYFepJoceJ9lMTg/KfLtGu1JjgV2IQkVUickBE8kVkrY/9kSLylL1/k4hk9dp3p11+QEQu6VW+TkQqRWR3v7aSReQ1ETlkvyadfPfUsNnxFNwzFX57NhTrZEKlTlV9fT0PPPDACR932WWXUV9fPwIRfWzIxCAibuB+4FJgLnCjiMztV+1WoM4YMwP4BXCvfexcYA0wD1gFPGC3B/AHu6y/tcDfjTE5wN/t98pJhe/Bc1+GlOnQVgvP3g5dbU5HpVRQGygxeDyeQY978cUXSUxMHKmwAP/OGJYD+caYw8aYTmA9sLpfndXAY/b208BKsS5qrQbWG2M6jDEFQL7dHsaYt4FaH5/Xu63HgCtPoD9qJLzxXxA/ET73V7jyt1B7GN75udNRKRXU1q5dy0cffcSiRYtYtmwZ55xzDldccQVz51q/u6+88kqWLl3KvHnzePjhh48dl5WVRXV1NYWFhcyZM4fbb7+defPmcfHFF9PWNjw/2Py5XTUDKOr1vhg4faA6xhiPi
DQAKXb5B/2OzRji88YbY8rs7XJgvK9KInIHcAfAlClThu6FOjkVe+HIu3DR3RCVANPOg1mXwbbH4LzvglvveFbB70d/28Pe0sZhbXPupHH8x6fmDbj/nnvuYffu3Wzfvp0333yTyy+/nN27dx+7pXTdunUkJyfT1tbGsmXLuOaaa0hJSenTxqFDh3jyySd55JFHuP7663nmmWe46aabTjn2gB58NtYDqX2OeBpjHjbG5BpjctPSjlscUA2XvEfBHQmLen3ZFn0amivg8JuOhaVUqFm+fHmfeQb33Xcfp512GitWrKCoqIhDhw4dd0x2djaLFi0CYOnSpRQWFg5LLP783CsBJvd6n2mX+apTLCJhQAJQ4+ex/VWIyERjTJmITAQq/YhRjQSvF/Y+B7Mvh9hev1RyLoboJNjxJ8i50Ln4lBomg/2yHy2xsbHHtt98801ef/113n//fWJiYjj//PN9zkOIjIw8tu12u4ftUpI/ZwxbgBwRyRaRCKzB5I396mwEbra3rwX+Yf/a3wisse9aygZygM1DfF7vtm4GnvMjRjUSSj+EliqYdWnf8rBImLsaDr4C3V3OxKZUkIuPj6epqcnnvoaGBpKSkoiJiWH//v188MEHPuuNlCETgzHGA3wVeAXYB2wwxuwRkbtF5Aq72qNAiojkA9/CvpPIGLMH2ADsBV4GvmKM6QYQkSeB94FZIlIsIrfabd0DXCQih4AL7ffKCQdfBnHBDB9nBdNXQmez3rqq1ElKSUnhrLPOYv78+Xz729/us2/VqlV4PB7mzJnD2rVrWbFixajGJiYEJi3l5uYafVDPCHjoXAiPgc+/fPy+tjq4NxvO+w584nujH5tSp2jfvn3MmTPH6TBGja/+ishWY0xu/7oBPfisHNRSDWU7fJ8tgDXGMGmxDkArFYI0MSjfCt+xXrPPG7jO9E9Yl5Lah/c2P6WUszQxKN8K34WIOJi0aOA6U88E0w2l20YvLqXUiNPEoHwreAemrAD3IA/2yFhqveoAtFIhRRODOl5TBVQfgKxzBq8XnQQpMzQxKBViNDGo4x1513odKjEAZC6DkjxdklupEKKJQR2v4B2IiIeJpw1dN2OpNQmu/ujIx6VUiImLiwOgtLSUa6+91uFoPqaJQR2v8B2YeoZ/C+Rl2rdAl2wd2ZiUCmGTJk3i6aefdjqMYzQxqL4ay6Am37/LSADpc8EVBhW7h66rlPKpsLCQ+fPnA7BixQr27NlzbN/5559PXl4eLS0tfP7zn2f58uUsXryY554budWCdM1k1VehPb6Q7WdiCIuE1JlQrolBBbGX1kL5ruFtc8ICuPTEV/S54YYb2LBhAz/60Y8oKyujrKyM3Nxcvve973HBBRewbt066uvrWb58ORdeeGGfxfeGi54xqL4K34bIBJiw0P9jxs/XMwalhsn1119/7LLShg0bjo09vPrqq9xzzz0sWrTo2GqrR4+OzNienjGovgrftSauudxD1+0xYT7s2gCttRCTPHKxKTVSTuKX/UjJyMggJSWFnTt38tRTT/Hggw8CYIzhmWeeYdasWSMeg54xqI81lFiP7fT3MlKP8da1UT1rUGp43HDDDfz0pz+loaGBhQuts/dLLrmEX//61/QsfPrhhx+O2OdrYlAfKzyB+Qu9TVhgvVbsGbyeUsov1157LevXr+f6668/VnbXXXfR1dXFwoULmTdvHnfdddeIfb5eSlIfK3gbohI/PgPwV1w6xKRqYlDqBDU3NwOQlZXF7t0fn3GPHz8ej8fTp250dDQPPfTQqMSlZwzK4vXCoVdgxkpwncTXIm0WVB//TFqlVPDRxKAsJVvtx3hednLHp8601lfSpTGUCnqaGJTl4EsgbuuM4WSkzrSe6tZSPbxxKTWCQuEJlv440X5qYlDWr/z9L1i3qUYnnVwbaTOt1+qDwxeXUiMoKiqKmpqakE8OxhhqamqIiory+xgdfFZQ+iFU7Yfld5x8G6n2vdXVByDrrOGJS
6kRlJmZSXFxMVVVVU6HMuKioqLIzMz0u74mBgXbn4CwKJh/zcm3MS4DwmOgSs8YVHAIDw8nOzvb6TACkl5KGuu62mDX0zD7kxCdePLtuFyQmmOdMSilgpomhrFu2+PQXg+5/3LqbaXMsGZOK6WCmiaGsczTAe/9EqacAVOHYVwgeZr1wB5P56m3pZRyjCaGseyD30JjCZz7bRA59faSp4HxQkPRqbellHKMJoaxqnI/vPETmPMpmH7B8LSZPM161ctJSgU1TQxjUbcH/vpFiIyHy38xPGcLoIlBqRCht6uORe/90pq7cN0fIC5t+NqNTYOIOE0MSgU5PWMYa8p3w5v3wLyrYd5Vw9u2CCRna2JQKshpYhhLuj3w1y9Z8xUu+5+R+YzkaZoYlApymhjGkrx1UL4TLvsZxKaMzGckZUPdEfB2j0z7SqkRp4lhrGipgTd+DNPOh7lXjtznJE4Bbxc0lY/cZyilRpRfiUFEVonIARHJF5G1PvZHishT9v5NIpLVa9+ddvkBEblkqDZFZKWIbBOR7SLyrojMOLUuKgA+eADaG2HVPcN3F5IviVOsV53LoFTQGjIxiIgbuB+4FJgL3Cgic/tVuxWoM8bMAH4B3GsfOxdYA8wDVgEPiIh7iDZ/C3zGGLMI+BPw/VProqK9ETY/Ys1ZSJ8zsp+VMNl6rdfEoFSw8ueMYTmQb4w5bIzpBNYDq/vVWQ08Zm8/DawUEbHL1xtjOowxBUC+3d5gbRpgnL2dAJSeXNfUMR8+Dh0NcM63TrqJbq+hpcNDQ2vX4OvXJ9qJoeHoSX+WUspZ/sxjyAB6//wrBk4fqI4xxiMiDUCKXf5Bv2Mz7O2B2rwNeFFE2oBGYIWvoETkDuAOgClTpvjRjTHKGGuhvMxlMGnxCR9+qKKJX7x+kJd2lx97aueiyYn8v4tncdaMFKT/ZamIWIhO1jMGpYJYIE5w+yZwmTFmk4h8G/g5VrLowxjzMPAwQG5ubmg/gulUlG6Dqn3wyV+e0GGdHi93P7+HJzYdJSbczS1nZjFhXBQer+GJD45w06ObuHBOOvd/ZgmRYe6+BydO1jEGpYKYP4mhBJjc632mXearTrGIhGFdAqoZ4tjjykUkDTjNGLPJLn8KeNmPGNVAtv/JfgjP1X4fUt/ayRce38qmglpuOTOLr6/MITk24tj+287J5vfvFXLPS/v59p938ssbFuFy9TpzSJgM1YeGsxdKqVHkzxjDFiBHRLJFJAJrMHljvzobgZvt7WuBfxjrQvRGYI1911I2kANsHqTNOiBBROwHCHMRsO/kuzfGeb2w72+QczFEJfh1SHFdK1c/8E8+PFrPr9Ys4odXzOuTFAAiw9x88bzpfGfVLDbuKOWnr/R7OE/iVOuMIcSfpatUqBryjMEeM/gq8ArgBtYZY/aIyN1AnjFmI/Ao8LiI5AO1WH/osettAPYCHuArxphuAF9t2uW3A8+IiBcrUXx+WHs8lpRug+YK6+lsfiiua2XNwx/Q2NbFE7efzrKs5EHrf+m86ZTWt/HgWx+xLCuJlXPGWzsSJ0NXK7TWjtxEOqXUiPFrjMEY8yLwYr+yH/TabgeuG+DYnwA/8adNu/wvwF/8iUsNYf/zIG6YefGQVXsnhf+77XQWZg79mE8R4T8+NY93D1Xzs1cO8IlZ6dYlpYRedyZpYlAq6OjM51B24CXIOhuikwatdjJJoUe428U3L5rJ/vImXtxdZhUm6lwGpYKZJoZQ1VgKVfsh56JBqxXVnnxS6PHJhZOYOT6On792EE+3t9cZgyYGpYKRJoZQdfgt63Xa+QNWKapt5cZH7DGF21acVFIAcLuEb100k8NVLfx1e6l1hhIRp2cMSgUpTQyhquAtiEmB9Hk+d/dPCgsy/btraSCXzJvA7Anx/PH9QmstpgSdy6BUsNLEEIqMgcNvQva54Dr+P3Fx3fAmBbAGoq9anMHO4gaKalutcYZ6XRZDqWCkiSEU1R6GpjLIPu+4XTXNH
Xzu0c00DGNS6HHZgokAvLCrTM8YlApimhhCUdFm63VK32Wmmjs83PL7LZQ2tPH7W5YNa1IAmJwcw2mTE3lhZ5l1xtBWBx3Nw/oZSqmRp4khFBVvhshxkDrrWFG31/Cl/9vK3rJGHvjMEnKHmLx2sj65YCK7ShqocqdbBXrWoFTQ0cQQioq3QMbSPuML9/39EO8cqubHV87ngtnjR+yjL10wAYC3K6OtAh1nUCroaGIINR3NULEHJi8/VvTOoSru+8chrlmSyZplkwc5+NRlJsWweEoify2wv1qaGJQKOpoYQk3pNjBe6/kLQFVTB99Yv52c9Dj+88p5xz8/YQRcNn8i75a7Ma4wa6KdUiqoaGIINcVbrNeMpQDc89J+Gtu7uP/TS4iJGJ3Hb5w5IwWDi7bIdE0MSgUhTQyhpmgLpORATDJbj9TyzLZibjtnGjnj40cthNkTxhEfFUaFpEJj/0d3KKUCnSaGUGKMdUfS5OV0ew0/eG4PExOi+OonZoxqGG6XsDwrmYLOBE0MSgUhTQyhpK4AWmsgcxnrtxxlT2kj/375HGIjR/8JrsuzkznUPg7TWKoP7FEqyGhiCCVF1viCZ9JSHnjjI5ZOTeJyezbyaFuenUyZSUE87dYDe5RSQUMTQygpyYOIOF6sTKKkvo0vnjd9VO5C8mV+RgK17lTrjV5OUiqoaGIIJWU7MRMW8PA7hUxLi2Xl7HTHQgl3u0iakGW90cSgVFDRxBAqvF6o2E159Ax2lzRy+znTrMdsOmhq9kwAWqt1kptSwUQTQ6ioK4DOZl6sTic1LpKrFmc4HRHzZ86gy7gpL/rI6VCUUidAE0OoKN8JwLOlydy0YgpR4W6HA4LTpiZTSRItesagVFDRxBAqynbilTAOmUxWL3L+bAEgMsxNQ3g6riad/axUMNHEECrKd3HEPZmcSSlkp8Y6Hc0xXbETieuowOhcBqWChiaGENFdtpNtHZlcvtCZeQsDiUiezHhTw9GaFqdDUUr5SRNDKGiuxN1SwV7vVD65YJLT0fSROGEqUdLF/oIjToeilPKTJoZQYA88t6XMY0pKjMPB9JU6aRoAxYX5DkeilPKXJoYQUP/RVgByFp7hcCTHC0+yHgxUW1bgcCRKKX+N/upqatjVHt5Ks0ll5eJZQ1cebeOsS1vttUUYYxxbokMp5T89YwgBMbV7OOyeHnCXkQCIS8crYSR5qjhS0+p0NEopP2hiCHKms4X0rhLakuc4HYpvLjee2PFMlBp2ljQ4HY1Syg+aGIJc8aEduDDET1nodCgDCkvMYJLUsVsTg1JBQRNDkCs6sA2AqbOXOBzJwFwJGUwJq2NXsSYGpYKBX4lBRFaJyAERyReRtT72R4rIU/b+TSKS1WvfnXb5ARG5ZKg2xfITETkoIvtE5Oun1sXQ1lK8Bw9uJk2b53QoAxuXQbqpZndpvc6AVioIDJkYRMQN3A9cCswFbhSRuf2q3QrUGWNmAL8A7rWPnQusAeYBq4AHRMQ9RJu3AJOB2caYOcD6U+phCDPGEFl3iKqIyUhYhNPhDGxcBuGmE3d7PcV1bU5Ho5Qagj9nDMuBfGPMYWNMJ9Yf6tX96qwGHrO3nwZWinVf4mpgvTGmwxhTAOTb7Q3W5peAu40xXgBjTOXJdy+0FdW2Mbn7KJ1JOU6HMrgEa1G/iVLLntJGh4NRSg3Fn8SQART1el9sl/msY4zxAA1AyiDHDtbmdOAGEckTkZdExOdfPRG5w66TV1VV5Uc3Qs+W/BKmSCVxmQF8GQlgnPWfNsNVw95SHWdQKtAF4uBzJNBujMkFHgHW+apkjHnYGJNrjMlNS0sb1QADxdEDO3CLISnrNKdDGZw9yW1BfAt7y/SMQalA509iKMG65t8j0y7zWUdEwoAEoGaQYwdrsxh41t7+CxC492E6rK1sLwCu9NkORzKEuPEgbubENumlJKWCgD+JYQuQIyLZIhKBNZi8sV+djcDN9va1wD+MdfvJRmCNfddSN
pADbB6izb8Cn7C3zwMOnlzXQlt7VzcJzR/hxQ0p050OZ3AuN8RPJCu8nrKGdmpbOp2OSCk1iCHXSjLGeETkq8ArgBtYZ4zZIyJ3A3nGmI3Ao8DjIpIP1GL9oceutwHYC3iArxhjugF8tWl/5D3AEyLyTaAZuG34uhs68iubmU4JrXFTiAuLdDqcoY2bRHpXDQB7Sxs5OyfV4YCUUgPxaxE9Y8yLwIv9yn7Qa7sduG6AY38C/MSfNu3yeuByf+Iay/aVNbJEipH0xU6H4p+EDOJLdwCwt6xBE4NSASwQB5+VHw6V1pAl5URn9J9SEqDGZeBuKmPiuEgdZ1AqwGliCFKNJftwi8GVHqCL5/U3bhJ42lg+wbqUpJQKXJoYgpAxBioPWG/SAvAZDL7YcxmWJrXxUVUzbZ3dDgeklBqIJoYgVNXcwYSuIxgEUgJ81nMPOzHMi2vBa2B/uZ41KBWoNDEEof1lTUyVCjpjJ0J4lNPh+MdeFiM7og6A3Xo5SamApYkhCO0vbyRLKnClzHA6FP/Zk9ySPNUkxoSzW5fgVipgaWIIQvvLmsh2VRCeFuAT23pzuSF+AtJYwoKMBHbpQ3uUCliaGIJQUWkZiTRB8jSnQzkx4zKgsYT5GQkcrGiivUsHoJUKRJoYgkxXtxdPTb71JugSwyRoLGVBRgIer+FAeZPTESmlfNDEEGQKqlvI9JZbb4IuMWRAQwkLJo0DYLcuwa1UQNLEEGQOV7UwVSqsN0lZjsZywhIywNNGZnQHCdHh7NZxBqUCkiaGIFNY00KWqwJv3ESIiHE6nBNjP5dBB6CVCmyaGIJMYXUL092VuFKD6FbVHvYkNxqsAegD5U10eHQAWqlAo4khyPScMZCc7XQoJy7BfjZTQxHzM8bR1W04WN7sbExKqeNoYggylVU1JHnrgm/gGaxJbu5IqD/CgowEAL2cpFQA0sQQRNo6u4lqPmK9CcbE4HJB4mSoO8KU5BjGRYVpYlAqAGliCCKFNS1kSZDeqtojcSrUH0FEOG1yIh8erXM6IqVUP5oYgsiRmhayjt2qGoRjDABJU6H+KACLpyRxsKKJ5g6Pw0EppXrTxBBECqpbmSoVeGPTITLO6XBOTuJUaKuD9kaWTk3Ca2BHUb3TUSmletHEEEQKq1uYEVYZXKuq9pc01XqtP8KiyYkAbDuil5OUCiSaGIJIQTDfqtojcYr1WneEhOhwctLj2KbjDEoFFE0MQaSkqp4Ub411OSZYJWZZr/Y4w5IpSXxYVG89rlQpFRA0MQSJ1k4PYS2l1pvEyc4GcypikiEiDuqt226XTE2kvrWLw9UtDgemlOqhiSFIFFa3kiHV1puEIE4MItYZT52VGJZOTQJgq44zKBUwNDEEicKaFjKlynrTc50+WCVNPXbGMC01jnFRYTqfQakAookhSBRUt5Ah1RhxHVulNGglTrHGGIzB5RIWT0li2xG9ZVWpQKGJIUgcrWllengtEj8J3OFOh3NqEqdCZzO01gLW5aSDlU00tHU5HJhSCjQxBI2iulaywmqDe+C5x7G5DIUALM9OxhjYXFDrXExKqWM0MQSJo7WtTDSVwT++AB/fbmsPQC+ekkhkmIv3P6pxMCilVA9NDEHA0+2lsqGFJE91cN+R1KMnudlzGSLD3CydmsT7hzUxKBUINDEEgbKGdlK9tbjoDo1LSVHjIDrp2J1JAGdMS2FfWSN1LZ0OBqaUAk0MQaGotvXjW1VD4YwB+sxlADhjegoAmwr0rEEpp/mVGERklYgcEJF8EVnrY3+kiDxl798kIlm99t1plx8QkUtOoM37RESf+4g18HxsclswL4fRW3I21B4+9nZhZiLR4W4dZ1AqAAyZGETEDdwPXArMBW4Ukbn9qt0K1BljZgC/AO61j50LrAHmAauAB0TEPVSbIpILJJ1i30JGUW0bk109s54znQ1muKTkWJeSPNalo4gwF7lZOs6gVCDw54xhOZBvjDlsjOkE1gOr+9VZDTxmb
z8NrBQRscvXG2M6jDEFQL7d3oBt2knjZ8B3Tq1roeNobSszIusgNh3Co5wOZ3ikzADjhbrCY0VnTE/hYEUz1c0dzsWllPIrMWQARb3eF9tlPusYYzxAA5AyyLGDtflVYKMxpmywoETkDhHJE5G8qqoqP7oRvIrqWsl214TGwHOPnmdK1Bw6VnTGNGuc4QM9a1DKUQE1+Cwik4DrgF8PVdcY87AxJtcYk5uWljbywTmoqLaNCVSFxhyGHinTrdea/GNFCzISiI8K452D1Q4FpZQC/xJDCdD7p2qmXeazjoiEAQlAzSDHDlS+GJgB5ItIIRAjIvmMYW2d3dQ0t5HUVRk6dyQBRCdCbFqfxBDmdnH2jFTeOlilz2dQykH+JIYtQI6IZItIBNZg8sZ+dTYCN9vb1wL/MNb/2RuBNfZdS9lADrB5oDaNMS8YYyYYY7KMMVlAqz2gPWYV17WSSiNhpjO0zhjAupxU3TfvnzczjfLGdg5W6A1pSjllyMRgjxl8FXgF2AdsMMbsEZG7ReQKu9qjQIr96/5bwFr72D3ABmAv8DLwFWNM90BtDm/XQsPRUJzD0CNlRp8zBoDzZlmXBd86WOlEREopIMyfSsaYF4EX+5X9oNd2O9bYgK9jfwL8xJ82fdSJ8ye+UFZU23sOQ4idMaTmwIePQ1udNRMamJgQzazx8bx1sIo7zp3ucIBKjU0BNfisjldU18bUMPsunVC6KwkgbY71Wrm/T/F5s9LYUlBHS4fHgaCUUpoYAlxRbSuzIushKhEi450OZ3il9ySGvX2Kz5uZRme3V2dBK+UQTQwBzjpjqA69swWwZnFHxENV3zOG3KwkosPdvHUwtOenKBWoNDEEMGMMRbWtTDBVobNGUm8ikD4bKvf1KY4Mc3Pm9BTeOFCpt60q5QBNDAGsvrWL5o4ukroqQu+OpB7pc467lASwcs54iuva9LZVpRygiSGAFdW1kkgzEd2toXkpCawB6NYaaO572WjlnHQAXt9X4URUSo1pmhgCWFFt28e3qobyGQNAxe4+xePHRXFaZoImBqUcoIkhgBXVtZIZqnMYekw8zXot33ncrgvnjGd7UT2VTe2jHJRSY5smhgBW1LPcNoRuYohJhoQpULr9uF0r54zHGHhjv86CVmo0aWIIYEdrW5kZWQfhscdmBoekiQuhbMdxxXMmxpORGM1rezUxKDWaNDEEsOK6Nqa6q62zBRGnwxk5ExdB7UfQ3tinWES4cE467+ZX0dbZ7VBwSo09mhgClNdrKKlrs+cwhOhlpB7Hxhl2Hbfrwrnjae/y8vYhneym1GjRxBCgKpra6ez2ktRZFvqJYdIi67X0w+N2rZiWQmJMOC/vLh/loJQauzQxBKii2jbG0UKkpyn0E0NcujUAXbz5uF3hbhcXzx3P63sr6PDo5SSlRoMmhgB1NJSX2/ZlyulwdBP4WALj0gUTaerw8O4hfeSnUqNBE0OAKqptZbLLvhtnLCSGyadDcznUHz1u11nTUxkXFcaLu/RyklKjQRNDgCqqa2VOVL31JinL0VhGxeTTrdeiTcftighzcdHcCby2t5xOj3eUA1Nq7NHEEKCKa9uYGVkLEXGhPYehx/h5Vl+PfuBz92ULJtDY7uG9j/RyklIjTRNDgCqqa2WKuyb05zD0cLlhygoofMfn7rNzUomPDOOFnWWjHJhSY48mhgDU4emmvLGd8d6KsTG+0GPa+VB9EBpKjtsVGeZm1fwJvLy7XCe7KTXCNDEEoJK6NoyBxI7ysZcYAAre8rn76iWZNHd4eHWvDkIrNZI0MQSgojprDkPEWJjD0Fv6PIhNg4/e8Ln79OxkMhKjeWbb8WcUSqnho4khAB2taSFT7CUgxlJicLkg+zw4/AZ4j7/7yOUSrlqcwbuHqqho1KW4lRopmhgC0OHqFqaF1VpvxlJiAJi5ClqqoCTP5+6rlmTgNfDcdj1rUGqkaGIIQAXVLSyIa7DeJIYgsykAABVSSURBVE51N
pjRlnMRuMJg/ws+d09Pi2PR5ESe2VqC8TFLWil16jQxBKCC6paxNYeht+hEmHoWHHhxwCrXLs3kQEUT247Wj2JgSo0dmhgCTIen214Oo2rszGHob/bl1m2rVQd97r5qcQbxkWH84Z+FoxuXUmOEJoYAU1TbitdAqmeMzWHobc4VgMDup33ujo0M49rcTF7aVaaD0EqNAE0MAeZwVQsA8e1j4DkMAxk3EbLPgV1/9rnaKsDnzsjC4zU8sen4RfeUUqdGE0OAKahuIYFm3J2NYzcxACy4DmoPQ+k2n7uzU2P5xKw0/rTpqC6sp9Qw08QQYA5XtbAo1l4oLnm6s8E4ac4VEBYF2x4fsMrNZ2ZR3dzB33aUjmJgSoU+TQwBpqC6haVx9t02ydOcDcZJ0Ykw72rrclJHk88q5+akMXtCPPe/kY+nW88alBoufiUGEVklIgdEJF9E1vrYHykiT9n7N4lIVq99d9rlB0TkkqHaFJEn7PLdIrJORMJPrYvB5XB1C3MiqgAZG89hGEzuv0BnM+zyPQjtcgnfuHAmh6tbeG67njUoNVyGTAwi4gbuBy4F5gI3isjcftVuBeqMMTOAXwD32sfOBdYA84BVwAMi4h6izSeA2cACIBq47ZR6GEQa27uobu5gqqscEiZDeJTTITkrc5m1ftLW3w9Y5ZJ545k3aRz3/eMQXXrWoNSw8OeMYTmQb4w5bIzpBNYDq/vVWQ08Zm8/DawUEbHL1xtjOowxBUC+3d6AbRpjXjQ2YDOQeWpdDB4F9h1J47tKIDnb4WgCgIh11lC2A0p8D0KLCN+8cCZHalr5iy6up9Sw8CcxZABFvd4X22U+6xhjPEADkDLIsUO2aV9C+izwsq+gROQOEckTkbyqqio/uhH4CqqtxBDXchRSxvDAc28Lr4fwGMhbN2CVlXPSOS0zgV+8fpDWTs8oBqdUaArkwecHgLeNMT4f6WWMedgYk2uMyU1LSxvl0EbG4eoWkqQZd0f92L4jqbeoBOvW1V1/hmbfPwBEhO9/ci5lDe3c9/f8UQ5QqdDjT2IoASb3ep9pl/msIyJhQAJQM8ixg7YpIv8BpAHf8qcToeJwVTOnJ+gdScc54yvgaYctjwxYZVlWMtcuzeR37xwmv9L3XUxKKf/4kxi2ADkiki0iEViDyRv71dkI3GxvXwv8wx4j2Aisse9aygZysMYNBmxTRG4DLgFuNMaMqdHE/MpmcmPtX8WpM50NJpCkzYKZl8LmR6CzdcBqay+dTUyEm7v+ukdXXlXqFAyZGOwxg68CrwD7gA3GmD0icreIXGFXexRIEZF8rF/5a+1j9wAbgL1YYwVfMcZ0D9Sm3daDwHjgfRHZLiI/GKa+BrT2rm4OVTazMLIM3BF6q2p/Z30d2mph+xMDVkmNi+Tbq2bz/uEa1m8pGrCeUmpwEgq/rHJzc01enu8HuwSLncX1XPGb99ic9RDppga+9J7TIQUWY+B3F0JrNXxtG7jcPqt5vYbPrtvEtiP1PP/1s5meFjfKgSoVPERkqzEmt395IA8+jyl7ShsBSG49bF06UX2JwFn/CnWFsPuZAau5XML/XreIyHAX31i/XddRUuokaGIIEHtKG0iP9BDWWARps50OJzDN/iSMnw9v/Bd0dw1YbUJCFPdes5BdJQ3c+/L+UQxQqdCgiSFA7Clt5BNp9uM8NTH45nLBBd+HuoJBxxoALpk3gVvOzOLRdwv4c56ONyh1IjQxBIBur2F/WRMr4iqtAk0MA5u5yloq462fQtfgD+n5/uVzOHtGKt/7yy62FNaOUoBKBT9NDAGgoLqZtq5u5rmKwB2py2EMRgQuuAsaSyDv0UGrhrld3P/pJUxOiuELj2/lUIXOb1DKH5oYAkDPwHNG+wEYPw/cY2pB2RM37TyY9gl4694BZ0P3SIgJZ90tywhzCTc+/AFlW56Dpz9v/ds58BPilBrLNDEEgD2ljUSGQUzNbpi0yOlwgsOlP7Umu71215BVs1Jje
fKWBfy393+Z+MLn8Hz0Fhz9AJ69DZ5cA56OUQhYqeChiSEA7Clt4LzUZqSjCSZqYvBL2kw4+xuw40nY97fB67Y3Mv2lm7iQTdwnn2FF+6/Ju+ptuOS/4eDL8Ozt4NXbWpXqoYnBYcYY9pQ2cn68vVSUnjH479zvWIl049es50P70t4I/3cNlG5DrvsDq7/6M+JjY/j0o1vYGHMlXPSfsPc52PTg6MauVADTxOCwkvo26lu7OM1daC2FkTbH6ZCCR1gEXGsvx/341dDQb23Hpgp4/Coo3QbX/QHmrmZqSizPfulMTstM4OtPfshP6lbizVkFr/8QKveNdg+UCkiaGBz2/kc1AGR17Lcmb4VFOBxRkEmZDp95Glqq4KFzIO/3UL7ben3oXKjYA9f/EeZ86tghSbERPHHbCj53xlQeebeQLzTcgjc81jrz8HY72BmlAoMmBoe9m19NRqwhpvJDmHqm0+EEp8xcuP0NSMiE578BD55lvY6bCLe9BrMvP+6QiDAXd6+ezy9vWMQ7ZfDDrpugeAtsGfwWWKXGgjCnAxjLjDG8l1/NzZMqkaJOyDrH6ZCCV9pMuOMtKNturaeUMAUylljzHgZx5eIMZk2I50uPR/J285uc8ep/ED77MivJKDVG6RmDg/aXN1Hd3MkFUQdBXDBlhdMhBTcRmLQY5l0FmUuHTAo95kwcx3NfO4fnJ3+HLk83B9fdQXe33qWkxi5NDA56L78agBmt22HCAohOdDiisSshOpz/vvWTvJ35BWY2vMfvHvpf2rt0vEGNTZoYHPTOoWrmpwqR5Vv1MlIAcLuEVbf+kKpx87im4j6+9Yc3NDmoMUkTg0M6PN1sLqjl5tSD0N1pLSmtnOdyk/bph0h2tfKJI7/i9j/maXJQY44mBodsO1JPW1c353g+gNh0mLzc6ZBUjwkLcJ39r1wX9jbhh1/j9j/m0dapyUGNHZoYHPLWwSqiXV2Mr3wbZl824KMqlUPO+y6MX8ADsb/jQP4hbvvjFk0OaszQxOCArm4vz2wr5puT9iOdLTD3SqdDUv2FRcK164gyHfwt43He/6iKWx/T5KDGBk0MDvj7vgqqmjq43rwCydMh+zynQ1K+pM2ES+9lfPUHbFy0lfcP1+iYgxoTNDE44E+bizg3vozEmm2Q+3nrkZUqMC3+LMy9kvkHfs3vVgrvfVStyUGFPP2LNMqKalt551AVd417HiLiYdGnnQ5JDUYEPvUriJ/Eyj138vMrpvFufjU3r9tMU3uX09EpNSI0MYyyJzcf5TTJJ6fmDTjzaxCT7HRIaijRiXDNI1B/lKtKf84vb1jE1iN13PS7TdS1dDodnVLDThPDKGps7+LpzYX8PP5PEJsGZ3zZ6ZCUv6asgPPvhF0bWC3v8OBNS9lX3sQ1v/0nh6uanY5OqWGliWEU3fvSfq7u/CvTOvbDpfdCZLzTIakTcc6/wdSz4IV/48L0Rp647XTq27q48v73ji1volQo0MQwSrYU1rJv8+t8O3yD9WyAeVc7HZI6US43XP2IdSvrkzeybIKb575yFhMSovjso5v4n1cO0KWL76kQoIlhFHR4uvnV06/xcOSvkIQpcMVv/F75UwWYhAy4/nGoK4BnbmNyYiTPfvksrlmSyW/eyOfqB/7JjqJ6p6NU6pRoYhhhnR4vP/y/1/ivxn8nIcKL68Y/6SqqwS7rLLj0p3DoVXj1+8RFuPnZdafx4E1LKK1vY/X973HHH/PYUVSPMcbpaJU6YfqgnhHU3tXN2j/+na8d+QYTwlsJv/lvMH6u02Gp4bDsVqg+CB88AN1dsOq/WTV/ImfNSGXdu4X87p3DvLq3gsykaC6eO4E5E+OZlhZHSmwEUeFuosPdRIa7iAxzIXr2qAKMhMIvmtzcXJOXl+d0GH3sKKrnoWde4rt1PyAjrImwz/0Fpp7hdFhqOBkDr34f3v8NZC6zbijIWApAQ2sXr+wp58XdZfwzv4bOAcYeRCAqzE1UuIukmAgyk
qKZnBzDjLQ4ZqTHkTM+jgnjojR5qBEhIluNMbnHlWtiGD7GGHaVNPDHfxZidjzJ3eF/ICwylsjPbrCeS6xC0+5n4IV/g7Y66wlyWWdDygxIngYJk/HETaSosZuC6mYa2rpo6/TS3tVNW1c3HfZrW1c3tS2dFNe1cbS2lfrWjyfPxUWGMT09jpx0K1lMGBdFSlwEKbGRpMZHkBwTQZhbrwqrE3dKiUFEVgG/AtzA74wx9/TbHwn8EVgK1AA3GGMK7X13ArcC3cDXjTGvDNamiGQD64EUYCvwWWPMoLOInEwMDa1dbDtax6aCWl7fW05ydR7fDH+WM1x78GSuIOy6R/X5wWNBeyNs/QPsfQ7Kd0F3R6+dAnHpkJJj/UDIzLXOMOIn+GzKGENNSyf5lc0cqmwmv6KJ/KpmDlU0U9nUcVx9EesJdMkxESTFRpAUE0FafASZSTFkJEaTmRRNRlI04+OjcLn0zEN97KQTg4i4gYPARUAxsAW40Rizt1edLwMLjTFfFJE1wFXGmBtEZC7wJLAcmAS8Dsy0D/PZpohsAJ41xqwXkQeBHcaY3w4W43AkBmMM3V6Dx/vxa4enm8a2Lupbu2hos/5VNnVQWttIeXUd5ZVVhDeXkClVnOYuZFXEDiZ1l+CNTsF1wfdg6b/octpjkbcbGkug9jA0FENDCTQchcp9ULYTvPbZQPwkyFgCExZCcjYkTrVmwkfEWXNcwmOOW0erqb2LqqYOqps7qWnuoLq5g6rmTupbO6lt6aSutZPali4qG9up6TcrO8LtOnaparL9mhYXSVJsOIkxVkJJiA4n3C24XYJLhDCXtd1zKavn/5OubkNntxdPt5eubkNXt5e2rm5aOjy0dHTT0umxtjutstZe2z2vHq/1t0ewkpv1KgjgcgkRbhcRYS7C3WK/unqVWa/R4W5iItzERIQRE+kmNiLMfv9xWVSY2+4Pp3RJzhiDMeA1BoP9aqwrigaD10CXx0uHx0uHp5vOXtsdPdtdXjq7vXR0ddPZ7SXMZfUtwu3u09fIXmV9ynuVuYchyQ+UGPwZfF4O5BtjDtsNrQdWA3t71VkN/NDefhr4jVj/BVYD640xHUCBiOTb7eGrTRHZB1wA9Cwg9Jjd7qCJ4WR94fE83jxQdSwR+ONP4T/mi+5eXY+0XkxYFDL5dJj/XVwLroOImBGIWAUFlxsSp1j/+utqt84oirdA6TYo2Qb7nx+kMYFbXrDuhALio8KJjwpnWtrQYbR1dlNS30ZxXSvFdW0U1bVSXGtdqtpZXN/nctVQRMAtQrf9x/BEiXDsj3ZspPUa7nZhAOw/tD1/YAE8drLp6jZ0erx0dXvp9Fh/VDu7vScVQ+9YXPJxonDZ73v/gcdHAgg0bpcQ7hae/9o5zEiPG9a2/UkMGUBRr/fFwOkD1THGeESkAetSUAbwQb9jM+xtX22mAPXGGI+P+n2IyB3AHfbbZhE54EdfBpIK+DV19awB9zQCf7P/3XIKoZwyv/sS4EKlH3CqffnR2cMXyanR/yYBKOfHp9SXqb4Kg/Z2VWPMw8DDw9GWiOT5Op0KRqHSl1DpB4ROX0KlH6B9GYo/tzKUAJN7vc+0y3zWEZEwIAFrEHqgYwcqrwES7TYG+iyllFIjyJ/EsAXIEZFsEYkA1gAb+9XZCNxsb18L/MNYo9obgTUiEmnfbZQDbB6oTfuYN+w2sNt87uS7p5RS6kQNeSnJHjP4KvAK1q2l64wxe0TkbiDPGLMReBR43B5crsX6Q49dbwPWQLUH+IoxphvAV5v2R34XWC8iPwY+tNseacNySSpAhEpfQqUfEDp9CZV+gPZlUCExwU0ppdTw0emSSiml+tDEoJRSqo8xnxhEZJWIHBCRfBFZ63Q8/YnIOhGpFJHdvcqSReQ1ETlkvybZ5SIi99l92SkiS3odc7Nd/5CI3Ozrs0a4H5NF5A0R2Ssie0TkX4O4L1EisllEdth9+ZFdni0im+yYn7JvrMC++eIpu3yTi
GT1autOu/yAiFwy2n2xY3CLyIci8nyQ96NQRHaJyHYRybPLgu77ZceQKCJPi8h+EdknImeMal+sad5j8x/WwPdHwDQgAtgBzHU6rn4xngssAXb3KvspsNbeXgvca29fBryEtbrACmCTXZ4MHLZfk+ztpFHux0Rgib0dj7Ukytwg7YsAcfZ2OLDJjnEDsMYufxD4kr39ZeBBe3sN8JS9Pdf+zkUC2fZ30e3Ad+xbwJ+A5+33wdqPQiC1X1nQfb/sOB4DbrO3I4DE0ezLqHY20P4BZwCv9Hp/J3Cn03H5iDOLvonhADDR3p4IHLC3H8Jac6pPPeBG4KFe5X3qOdSn57DWygrqvgAxwDasmfvVQFj/7xbW3Xdn2Nthdj3p/33rXW8U488E/o61FM3zdlxB1w/7cws5PjEE3fcLax5YAfbNQU70ZaxfSvK13IfPJTgCzHhjTJm9XQ6Mt7cH6k9A9dO+BLEY65d2UPbFvvyyHagEXsP6lTzQci59lowBei8Z43Rffgl8B+h5YMRgy9IEcj8ADPCqiGwVa8kcCM7vVzZQBfzevsT3OxGJZRT7MtYTQ9Az1k+BoLnnWETigGeAbxhjGnvvC6a+GGO6jTGLsH5xLwdmOxzSCRORTwKVxpitTscyTM42xiwBLgW+IiLn9t4ZRN+vMKzLx781xiwGWrAuHR0z0n0Z64nBn+U+AlGFiEwEsF8r7fITXYJkVIlIOFZSeMIY86xdHJR96WGMqcearX8GAy/ncqJLxoyWs4ArRKQQ6xkoF2A9IyXY+gGAMabEfq0E/oKVsIPx+1UMFBtjNtnvn8ZKFKPWl7GeGPxZ7iMQ9V6CpPeyIRuBz9l3KawAGuxTz1eAi0Ukyb6T4WK7bNSIiGDNYt9njPl5r13B2Jc0EUm0t6Oxxkr2MfByLie6ZMyoMMbcaYzJNMZkYX33/2GM+QxB1g8AEYkVkfiebazvxW6C8PtljCkHikRkll20Emv1iNHry2gPEAXaP6wR/YNY14j/3el4fMT3JFAGdGH9krgV67ru34FDWA8/SrbrCnC/3ZddQG6vdj4P5Nv//sWBfpyNdeq7E9hu/7ssSPuyEGu5lp1Yf3x+YJdPw/qDmA/8GYi0y6Ps9/n2/mm92vp3u48HgEsd/J6dz8d3JQVdP+yYd9j/9vT8vxyM3y87hkVAnv0d+yvWXUWj1hddEkMppVQfY/1SklJKqX40MSillOpDE4NSSqk+NDEopZTqQxODUkqpPjQxKKWU6kMTg1JKqT7+PxkBGEVCUmiTAAAAAElFTkSuQmCC\n", + "text/plain": [ + "
" + ] + }, + "metadata": { + "needs_background": "light" + }, + "output_type": "display_data" + } + ], + "source": [ + "fig, ax = plt.subplots()\n", + "\n", + "sns.kdeplot(train_data['GrLivArea'], ax=ax, label='train')\n", + "sns.kdeplot(live_data['GrLivArea'], ax=ax, label='live')" + ] + }, + { + "cell_type": "code", + "execution_count": 42, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "" + ] + }, + "execution_count": 42, + "metadata": {}, + "output_type": "execute_result" + }, + { + "data": { + "image/png": "\n", + "text/plain": [ + "
" + ] + }, + "metadata": { + "needs_background": "light" + }, + "output_type": "display_data" + } + ], + "source": [ + "fig, ax = plt.subplots()\n", + "\n", + "sns.kdeplot(train_data['BsmtFinSF1'], ax=ax, label='train')\n", + "sns.kdeplot(live_data['BsmtFinSF1'], ax=ax, label='live')" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Finally - compare predictions\n", + "\n", + "Finally, we compare the predictions of our model.\n", + "\n", + "Given that we observed differences in the amount of missing data in BsmtQual and differences in the distribution of the other variables, there is sufficient reason to believe that the distribution of the predictions will not hold.\n", + "\n", + "Let's test that anyhow.\n", + "\n", + "In this particular scenario, we do not have the real value of the Sale Price during shadow mode, because houses take some time to sell.\n", + "\n", + "So in order to evaluate the performance of the model, we can compare the distributions of the predictions between the live and train data, again using the KS test." + ] + }, + { + "cell_type": "code", + "execution_count": 43, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "Ks_2sampResult(statistic=0.15722705507227056, pvalue=4.440892098500626e-15)" + ] + }, + "execution_count": 43, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "stats.ks_2samp(train_data['SalePrice'], live_data['SalePrice'])" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "As expected, the distributions are significantly different. And we can visualize that below:" + ] + }, + { + "cell_type": "code", + "execution_count": 44, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "" + ] + }, + "execution_count": 44, + "metadata": {}, + "output_type": "execute_result" + }, + { + "data": { + "image/png": "\n", + "text/plain": [ + "
" + ] + }, + "metadata": { + "needs_background": "light" + }, + "output_type": "display_data" + } + ], + "source": [ + "fig, ax = plt.subplots()\n", + "\n", + "sns.kdeplot(train_data['SalePrice'], ax=ax, label='train')\n", + "sns.kdeplot(live_data['SalePrice'], ax=ax, label='live')" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Since in the live environment we are getting houses with bigger LotArea, the predictions naturally yield higher Sale Prices. \n", + "\n", + "Does this mean that the model will not perform well? Unclear at this stage. Higher prices are expected for bigger houses, so in principle that should not worry us too much.\n", + "\n", + "But we should definitely investigate the reasons behind the distribution changes in the input variables.\n", + "\n", + "I hope we could give you a flavour of what we should be looking at in shadow mode, and how we should be thinking or reacting when things do not go as planned." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.7.6" + }, + "toc": { + "nav_menu": {}, + "number_sections": true, + "sideBar": true, + "skip_h1_title": false, + "toc_cell": false, + "toc_position": {}, + "toc_section_display": "block", + "toc_window_display": false + } + }, + "nbformat": 4, + "nbformat_minor": 2 +} diff --git a/exercise_notebooks/shadow_mode_exercise/requirements.txt b/exercise_notebooks/shadow_mode_exercise/requirements.txt new file mode 100644 index 0000000..0e55c35 --- /dev/null +++ b/exercise_notebooks/shadow_mode_exercise/requirements.txt @@ -0,0 +1,15 @@ +# ML 
requirements +numpy>=1.18.1,<1.19.0 +scikit-learn>=0.22.1,<0.23.0 +pandas>=0.25.3,<0.26.0 +feature_engine>=0.3.1,<0.4.0 +joblib>=0.14.1,<0.15.0 +matplotlib>=3.1.3,<3.2.0 +seaborn>=0.10.0,<0.11.0 +jupyter>=1.0.0,<1.1.0 + +# Persistence +sqlalchemy>=1.3.11,<1.4.0 # ORM +psycopg2>=2.8.4,<2.9.0 # DB Driver +alembic>=1.3.1,<1.4.0 # DB Migrations +sqlalchemy_utils>=0.36.0,<0.37.0 # DB Utils \ No newline at end of file diff --git a/packages/gradient_boosting_model/mypy.ini b/packages/gradient_boosting_model/mypy.ini new file mode 100644 index 0000000..97e52a5 --- /dev/null +++ b/packages/gradient_boosting_model/mypy.ini @@ -0,0 +1,11 @@ +[mypy] +warn_unused_ignores = True +follow_imports = skip +show_error_context = True +warn_incomplete_stub = True +ignore_missing_imports = True +check_untyped_defs = True +cache_dir = /dev/null +warn_redundant_casts = True +warn_unused_configs = True +strict_optional = True diff --git a/packages/gradient_boosting_model/test_requirements.txt b/packages/gradient_boosting_model/test_requirements.txt new file mode 100644 index 0000000..c44378e --- /dev/null +++ b/packages/gradient_boosting_model/test_requirements.txt @@ -0,0 +1,16 @@ +-r requirements.txt + +# testing requirements +pytest>=5.3.2,<6.0.0 + +# old model for testing purposes +# source code: https://github.com/trainindata/deploying-machine-learning-models/tree/master/packages/regression_model +tid-regression-model>=2.0.20,<2.1.0 + +# repo maintenance tooling +black>=19.10b0,<20.0 +flake8>=3.7.9,<4.0 +mypy>=0.740 + +# kaggle cli +kaggle>=1.5.6,<1.6.0 diff --git a/packages/gradient_boosting_model/tests/conftest.py b/packages/gradient_boosting_model/tests/conftest.py new file mode 100644 index 0000000..c896d72 --- /dev/null +++ b/packages/gradient_boosting_model/tests/conftest.py @@ -0,0 +1,34 @@ +import pytest +from sklearn.model_selection import train_test_split + +from gradient_boosting_model.config.core import config +from gradient_boosting_model.processing.data_management import 
load_dataset + + +@pytest.fixture(scope="session") +def pipeline_inputs(): + # For larger datasets, here we would use a testing sub-sample. + data = load_dataset(file_name=config.app_config.training_data_file) + + # Divide train and test + X_train, X_test, y_train, y_test = train_test_split( + data[config.model_config.features], # predictors + data[config.model_config.target], + test_size=config.model_config.test_size, + # we are setting the random seed here + # for reproducibility + random_state=config.model_config.random_state, + ) + + return X_train, X_test, y_train, y_test + + +@pytest.fixture() +def raw_training_data(): + # For larger datasets, here we would use a testing sub-sample. + return load_dataset(file_name=config.app_config.training_data_file) + + +@pytest.fixture() +def sample_input_data(): + return load_dataset(file_name=config.app_config.test_data_file) diff --git a/packages/gradient_boosting_model/tests/test_config.py b/packages/gradient_boosting_model/tests/test_config.py new file mode 100644 index 0000000..5a82241 --- /dev/null +++ b/packages/gradient_boosting_model/tests/test_config.py @@ -0,0 +1,124 @@ +from pathlib import Path + +from gradient_boosting_model.config.core import ( + create_and_validate_config, + fetch_config_from_yaml, +) + +import pytest +from pydantic import ValidationError + + +TEST_CONFIG_TEXT = """ +package_name: gradient_boosting_model +training_data_file: houseprice.csv +test_data_file: test.csv +drop_features: YrSold +pipeline_name: gb_regression +pipeline_save_file: gb_regression_output_v +target: SalePrice +variables_to_rename: + foo: bar +test_size: 0.1 +features: + - LotArea +numerical_vars: + - LotArea +categorical_vars: + - BsmtQual +temporal_vars: YearRemodAdd +numerical_vars_with_na: + - LotFrontage +numerical_na_not_allowed: + - LotArea +random_state: 0 +n_estimators: 50 +rare_label_tol: 0.01 +rare_label_n_categories: 5 +loss: ls +allowed_loss_functions: + - ls + - huber +""" + +INVALID_TEST_CONFIG_TEXT = """ 
+package_name: gradient_boosting_model +training_data_file: houseprice.csv +test_data_file: test.csv +drop_features: YrSold +pipeline_name: gb_regression +pipeline_save_file: gb_regression_output_v +target: SalePrice +features: + - LotArea +numerical_vars: + - LotArea +categorical_vars: + - BsmtQual +temporal_vars: YearRemodAdd +numerical_vars_with_na: + - LotFrontage +numerical_na_not_allowed: + - LotArea +random_state: 0 +n_estimators: 50 +rare_label_tol: 0.01 +rare_label_n_categories: 5 +loss: ls +allowed_loss_functions: + - huber +""" + + +def test_fetch_config_structure(tmpdir): + # Given + # We make use of the pytest built-in tmpdir fixture + configs_dir = Path(tmpdir) + config_1 = configs_dir / "sample_config.yml" + config_1.write_text(TEST_CONFIG_TEXT) + parsed_config = fetch_config_from_yaml(cfg_path=config_1) + + # When + config = create_and_validate_config(parsed_config=parsed_config) + + # Then + assert config.model_config + assert config.app_config + + +def test_config_validation_raises_error_for_invalid_config(tmpdir): + # Given + # We make use of the pytest built-in tmpdir fixture + configs_dir = Path(tmpdir) + config_1 = configs_dir / "sample_config.yml" + + # invalid config attempts to set a prohibited loss + # function which we validate against an allowed set of + # loss function parameters. 
+ config_1.write_text(INVALID_TEST_CONFIG_TEXT) + parsed_config = fetch_config_from_yaml(cfg_path=config_1) + + # When + with pytest.raises(ValidationError) as excinfo: + create_and_validate_config(parsed_config=parsed_config) + + # Then + assert "not in the allowed set" in str(excinfo.value) + + +def test_missing_config_field_raises_validation_error(tmpdir): + # Given + # We make use of the pytest built-in tmpdir fixture + configs_dir = Path(tmpdir) + config_1 = configs_dir / "sample_config.yml" + TEST_CONFIG_TEXT = """package_name: gradient_boosting_model""" + config_1.write_text(TEST_CONFIG_TEXT) + parsed_config = fetch_config_from_yaml(cfg_path=config_1) + + # When + with pytest.raises(ValidationError) as excinfo: + create_and_validate_config(parsed_config=parsed_config) + + # Then + assert "field required" in str(excinfo.value) + assert "pipeline_name" in str(excinfo.value) diff --git a/packages/gradient_boosting_model/tests/test_pipeline.py b/packages/gradient_boosting_model/tests/test_pipeline.py new file mode 100644 index 0000000..3820995 --- /dev/null +++ b/packages/gradient_boosting_model/tests/test_pipeline.py @@ -0,0 +1,54 @@ +from gradient_boosting_model import pipeline +from gradient_boosting_model.config.core import config +from gradient_boosting_model.processing.validation import validate_inputs + + +def test_pipeline_drops_unnecessary_features(pipeline_inputs): + # Given + X_train, X_test, y_train, y_test = pipeline_inputs + assert config.model_config.drop_features in X_train.columns + + # When + # We use the scikit-learn Pipeline private method `_fit` which is called + # by the `fit` method, since this allows us to access the transformed + # dataframe. For other models we could use the `transform` method, but + # the GradientBoostingRegressor does not have a `transform` method. 
+ X_transformed, _ = pipeline.price_pipe._fit(X_train, y_train) + + # Then + assert config.model_config.drop_features in X_train.columns + assert config.model_config.drop_features not in X_transformed.columns + + +def test_pipeline_transforms_temporal_features(pipeline_inputs): + # Given + X_train, X_test, y_train, y_test = pipeline_inputs + + # When + # We use the scikit-learn Pipeline private method `_fit` which is called + # by the `fit` method, since this allows us to access the transformed + # dataframe. For other models we could use the `transform` method, but + # the GradientBoostingRegressor does not have a `transform` method. + X_transformed, _ = pipeline.price_pipe._fit(X_train, y_train) + + # Then + assert ( + X_transformed.iloc[0]["YearRemodAdd"] + == X_train.iloc[0]["YrSold"] - X_train.iloc[0]["YearRemodAdd"] + ) + + +def test_pipeline_predict_takes_validated_input(pipeline_inputs, sample_input_data): + # Given + X_train, X_test, y_train, y_test = pipeline_inputs + pipeline.price_pipe.fit(X_train, y_train) + + # When + validated_inputs, errors = validate_inputs(input_data=sample_input_data) + predictions = pipeline.price_pipe.predict( + validated_inputs[config.model_config.features] + ) + + # Then + assert predictions is not None + assert errors is None diff --git a/packages/gradient_boosting_model/tests/test_predict.py b/packages/gradient_boosting_model/tests/test_predict.py new file mode 100644 index 0000000..5be393e --- /dev/null +++ b/packages/gradient_boosting_model/tests/test_predict.py @@ -0,0 +1,62 @@ +from gradient_boosting_model.predict import make_prediction +from gradient_boosting_model.config.core import config + +from sklearn.metrics import mean_squared_error + +from regression_model.predict import make_prediction as alt_make_prediction + + +def test_prediction_quality_against_benchmark(raw_training_data, sample_input_data): + # Given + input_df = raw_training_data.drop(config.model_config.target, axis=1) + output_df = 
raw_training_data[config.model_config.target] + + # Generate rough benchmarks (you would tweak depending on your model) + benchmark_flexibility = 50000 + # setting ndigits to -4 will round the value to the nearest 10,000 i.e. 210,000 + benchmark_lower_boundary = ( + round(output_df.iloc[0], ndigits=-4) - benchmark_flexibility + ) # 210,000 - 50000 = 160000 + benchmark_upper_boundary = ( + round(output_df.iloc[0], ndigits=-4) + benchmark_flexibility + ) # 210000 + 50000 = 260000 + + # When + subject = make_prediction(input_data=input_df[0:1]) + + # Then + assert subject is not None + prediction = subject.get("predictions")[0] + assert isinstance(prediction, float) + assert prediction > benchmark_lower_boundary + assert prediction < benchmark_upper_boundary + + +def test_prediction_quality_against_another_model(raw_training_data, sample_input_data): + # Given + input_df = raw_training_data.drop(config.model_config.target, axis=1) + output_df = raw_training_data[config.model_config.target] + current_predictions = make_prediction(input_data=input_df) + + # the older model has these variable names reversed + input_df.rename( + columns={ + "FirstFlrSF": "1stFlrSF", + "SecondFlrSF": "2ndFlrSF", + "ThreeSsnPortch": "3SsnPorch", + }, + inplace=True, + ) + alternative_predictions = alt_make_prediction(input_data=input_df) + + # When + current_mse = mean_squared_error( + y_true=output_df.values, y_pred=current_predictions["predictions"] + ) + + alternative_mse = mean_squared_error( + y_true=output_df.values, y_pred=alternative_predictions["predictions"] + ) + + # Then + assert current_mse < alternative_mse diff --git a/packages/gradient_boosting_model/tests/test_preprocessors.py b/packages/gradient_boosting_model/tests/test_preprocessors.py new file mode 100644 index 0000000..11a4900 --- /dev/null +++ b/packages/gradient_boosting_model/tests/test_preprocessors.py @@ -0,0 +1,37 @@ +from gradient_boosting_model.config.core import config +from gradient_boosting_model.processing 
+ import preprocessors as pp + + +def test_drop_unnecessary_features_transformer(pipeline_inputs): + # Given + X_train, X_test, y_train, y_test = pipeline_inputs + assert config.model_config.drop_features in X_train.columns + + transformer = pp.DropUnecessaryFeatures( + variables_to_drop=config.model_config.drop_features, + ) + + # When + X_transformed = transformer.transform(X_train) + + # Then + assert config.model_config.drop_features not in X_transformed.columns + + +def test_temporal_variable_estimator(pipeline_inputs): + # Given + X_train, X_test, y_train, y_test = pipeline_inputs + + transformer = pp.TemporalVariableEstimator( + variables=config.model_config.temporal_vars, + reference_variable=config.model_config.drop_features, + ) + + # When + X_transformed = transformer.transform(X_train) + + # Then + assert ( + X_transformed.iloc[0]["YearRemodAdd"] + == X_train.iloc[0]["YrSold"] - X_train.iloc[0]["YearRemodAdd"] + ) diff --git a/packages/gradient_boosting_model/tests/test_validation.py b/packages/gradient_boosting_model/tests/test_validation.py new file mode 100644 index 0000000..b636674 --- /dev/null +++ b/packages/gradient_boosting_model/tests/test_validation.py @@ -0,0 +1,30 @@ +from gradient_boosting_model.processing.validation import validate_inputs + + +def test_validate_inputs(sample_input_data): + # When + validated_inputs, errors = validate_inputs(input_data=sample_input_data) + + # Then + assert not errors + + # we expect that 2 rows are removed due to missing vars + # 1459 is the total number of rows in the test data set (test.csv) + # and 1457 is the number returned after 2 rows are filtered out. 
+ assert len(sample_input_data) == 1459 + assert len(validated_inputs) == 1457 + + +def test_validate_inputs_identifies_errors(sample_input_data): + # Given + test_inputs = sample_input_data.copy() + + # introduce errors + test_inputs.at[1, "BldgType"] = 50 # we expect a string + + # When + validated_inputs, errors = validate_inputs(input_data=test_inputs) + + # Then + assert errors + assert errors[1] == {"BldgType": ["Not a valid string."]} diff --git a/packages/gradient_boosting_model/tox.ini b/packages/gradient_boosting_model/tox.ini new file mode 100644 index 0000000..7c9059d --- /dev/null +++ b/packages/gradient_boosting_model/tox.ini @@ -0,0 +1,65 @@ +[tox] +envlist = unit_tests,typechecks,stylechecks +skipsdist = True + + +[testenv] +install_command = pip install {opts} {packages} +deps = + -rtest_requirements.txt + +passenv = + KAGGLE_USERNAME + KAGGLE_KEY + +setenv = + PYTHONPATH=. + +commands= + kaggle competitions download -c house-prices-advanced-regression-techniques -p gradient_boosting_model/datasets/ + unzip -o gradient_boosting_model/datasets/house-prices-advanced-regression-techniques.zip -d gradient_boosting_model/datasets + mv gradient_boosting_model/datasets/train.csv gradient_boosting_model/datasets/houseprice.csv + python gradient_boosting_model/train_pipeline.py + pytest \ + -s \ + -vv \ + {posargs:tests/} + + +[testenv:unit_tests] +envdir = {toxworkdir}/unit_tests +deps = + {[testenv]deps} + +setenv = + PYTHONPATH=. 
+ +commands = + python gradient_boosting_model/train_pipeline.py + pytest \ + -s \ + -vv \ + {posargs:tests/} + + +[testenv:typechecks] +envdir = {toxworkdir}/unit_tests + +deps = + {[testenv:unit_tests]deps} + +commands = {posargs:mypy gradient_boosting_model} + + +[testenv:stylechecks] +envdir = {toxworkdir}/unit_tests + +deps = + {[testenv:unit_tests]deps} + +commands = {posargs:flake8 gradient_boosting_model tests} + + +[flake8] +exclude = .git,env +max-line-length = 90 \ No newline at end of file diff --git a/packages/ml_api/.dockerignore b/packages/ml_api/.dockerignore new file mode 100644 index 0000000..26ab026 --- /dev/null +++ b/packages/ml_api/.dockerignore @@ -0,0 +1,20 @@ +exercise_notebooks/* +*env* +*venv* +.circleci* +packages/gradient_boosting_model +*.env +*.log +.git +.gitignore +.dockerignore +*.mypy_cache +*.pytest_cache +*.tox + +# alembic +!alembic/env.py + +# Byte-compiled / optimized / DLL files +*__pycache__* +*.py[cod] \ No newline at end of file diff --git a/packages/ml_api/Makefile b/packages/ml_api/Makefile new file mode 100644 index 0000000..a544913 --- /dev/null +++ b/packages/ml_api/Makefile @@ -0,0 +1,40 @@ +# For details on Makefiles, see the section notes. 
+NAME=ml_api +VERSION=$(shell git rev-parse HEAD) +REPO=UPDATEME +PASSWORD=UPDATEME + +# Specify phony list to ensure make recipes do not conflict with real file names +.PHONY: run-service-development tag-push-master tag-push-local db-migrations + + +tag-push-local: + @echo "+ $@" + docker login --username $(REPO) --password $(PASSWORD) + env TARGET=$(VERSION) docker-compose -f docker/docker-compose-ci-candidate.yml build + docker push $(REPO)/$(NAME):$(VERSION) + +tag-push-master: + @echo "+ $@" + docker login --username $(REPO) --password $(PASSWORD) + env TARGET=master docker-compose -f docker/docker-compose-ci-master.yml build + docker push $(REPO)/$(NAME):master + +# start up Flask API service +run-service-development: + @echo "+ $@" + python run.py + +run-service-wsgi: + @echo "+ $@" + gunicorn --bind 0.0.0.0:5000 \ + --workers=1 \ + --log-config gunicorn_logging.conf \ + --log-level=DEBUG \ + --access-logfile=- \ + --error-logfile=- \ + run:application + +db-migrations: + @echo "+ $@" + PYTHONPATH=. alembic -c alembic.ini upgrade head diff --git a/packages/ml_api/__init__.py b/packages/ml_api/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/packages/ml_api/alembic.ini b/packages/ml_api/alembic.ini new file mode 100644 index 0000000..604e701 --- /dev/null +++ b/packages/ml_api/alembic.ini @@ -0,0 +1,49 @@ +# A generic, single database configuration. + +[alembic] +# path to migration scripts +script_location = alembic + +# timezone to use when rendering the date +# within the migration file as well as the filename. 
+# string value is passed to dateutil.tz.gettz() +# leave blank for localtime +timezone = UTC + +sqlalchemy.url = VALUE_IS_SET_AT_RUNTIME + + +# Logging configuration +[loggers] +keys = root,sqlalchemy,alembic + +[handlers] +keys = console + +[formatters] +keys = generic + +[logger_root] +level = WARN +handlers = console +qualname = + +[logger_sqlalchemy] +level = WARN +handlers = +qualname = sqlalchemy.engine + +[logger_alembic] +level = INFO +handlers = +qualname = alembic + +[handler_console] +class = StreamHandler +args = (sys.stderr,) +level = NOTSET +formatter = generic + +[formatter_generic] +format = %(levelname)-5.5s [%(name)s] %(message)s +datefmt = %H:%M:%S diff --git a/packages/ml_api/alembic/env.py b/packages/ml_api/alembic/env.py new file mode 100644 index 0000000..377cb2e --- /dev/null +++ b/packages/ml_api/alembic/env.py @@ -0,0 +1,63 @@ +import os + +from alembic import context +from sqlalchemy import engine_from_config, pool + +# Import the models so the changes in them are automatically reflected in the +# generated migrations. +from api.persistence import models # noqa +from api.config import DevelopmentConfig as user_config +from api.persistence.core import Base + +# this is the Alembic Config object, which provides +# access to the values within the .ini file in use. +config = context.config +database_url = os.environ.get("ALEMBIC_DB_URI", user_config.SQLALCHEMY_DATABASE_URI) +config.set_main_option("sqlalchemy.url", database_url) + +# add your model's MetaData object here +# for 'autogenerate' support +target_metadata = Base.metadata + + +def run_migrations_offline(): + """Run migrations in 'offline' mode. + This configures the context with just a URL + and not an Engine, though an Engine is acceptable + here as well. By skipping the Engine creation + we don't even need a DBAPI to be available. + Calls to context.execute() here emit the given string to the + script output. 
+ """ + url = config.get_main_option("sqlalchemy.url") + context.configure( + url=url, target_metadata=target_metadata, literal_binds=True, + ) + + with context.begin_transaction(): + context.run_migrations() + + +def run_migrations_online(): + """Run migrations in 'online' mode. + In this scenario we need to create an Engine + and associate a connection with the context. + """ + alembic_config = config.get_section(config.config_ini_section) + connectable = engine_from_config( + alembic_config, prefix="sqlalchemy.", poolclass=pool.NullPool, + ) + + with connectable.connect() as connection: + context.configure( + connection=connection, target_metadata=target_metadata, + ) + + with context.begin_transaction(): + context.run_migrations() + + +if context.is_offline_mode(): + run_migrations_offline() +else: + run_migrations_online() diff --git a/packages/ml_api/alembic/script.py.mako b/packages/ml_api/alembic/script.py.mako new file mode 100644 index 0000000..2c01563 --- /dev/null +++ b/packages/ml_api/alembic/script.py.mako @@ -0,0 +1,24 @@ +"""${message} + +Revision ID: ${up_revision} +Revises: ${down_revision | comma,n} +Create Date: ${create_date} + +""" +from alembic import op +import sqlalchemy as sa +${imports if imports else ""} + +# revision identifiers, used by Alembic. 
+revision = ${repr(up_revision)} +down_revision = ${repr(down_revision)} +branch_labels = ${repr(branch_labels)} +depends_on = ${repr(depends_on)} + + +def upgrade(): + ${upgrades if upgrades else "pass"} + + +def downgrade(): + ${downgrades if downgrades else "pass"} diff --git a/packages/ml_api/alembic/versions/cf4abb13368d_create_prediction_tables.py b/packages/ml_api/alembic/versions/cf4abb13368d_create_prediction_tables.py new file mode 100644 index 0000000..a26fb19 --- /dev/null +++ b/packages/ml_api/alembic/versions/cf4abb13368d_create_prediction_tables.py @@ -0,0 +1,78 @@ +"""create prediction tables + +Revision ID: cf4abb13368d +Revises: +Create Date: 2019-12-15 14:54:07.857500+00:00 + +""" +from alembic import op +import sqlalchemy as sa +from sqlalchemy.dialects import postgresql + +# revision identifiers, used by Alembic. +revision = "cf4abb13368d" +down_revision = None +branch_labels = None +depends_on = None + + +def upgrade(): + # ### commands auto generated by Alembic - please adjust! 
### + op.create_table( + "gradient_boosting_model_predictions", + sa.Column("id", sa.Integer(), nullable=False), + sa.Column("user_id", sa.String(length=36), nullable=False), + sa.Column( + "datetime_captured", + sa.DateTime(timezone=True), + server_default=sa.text("now()"), + nullable=True, + ), + sa.Column("model_version", sa.String(length=36), nullable=False), + sa.Column("inputs", postgresql.JSONB(astext_type=sa.Text()), nullable=True), + sa.Column("outputs", postgresql.JSONB(astext_type=sa.Text()), nullable=True), + sa.PrimaryKeyConstraint("id"), + ) + op.create_index( + op.f("ix_gradient_boosting_model_predictions_datetime_captured"), + "gradient_boosting_model_predictions", + ["datetime_captured"], + unique=False, + ) + op.create_table( + "regression_model_predictions", + sa.Column("id", sa.Integer(), nullable=False), + sa.Column("user_id", sa.String(length=36), nullable=False), + sa.Column( + "datetime_captured", + sa.DateTime(timezone=True), + server_default=sa.text("now()"), + nullable=True, + ), + sa.Column("model_version", sa.String(length=36), nullable=False), + sa.Column("inputs", postgresql.JSONB(astext_type=sa.Text()), nullable=True), + sa.Column("outputs", postgresql.JSONB(astext_type=sa.Text()), nullable=True), + sa.PrimaryKeyConstraint("id"), + ) + op.create_index( + op.f("ix_regression_model_predictions_datetime_captured"), + "regression_model_predictions", + ["datetime_captured"], + unique=False, + ) + # ### end Alembic commands ### + + +def downgrade(): + # ### commands auto generated by Alembic - please adjust! 
### + op.drop_index( + op.f("ix_regression_model_predictions_datetime_captured"), + table_name="regression_model_predictions", + ) + op.drop_table("regression_model_predictions") + op.drop_index( + op.f("ix_gradient_boosting_model_predictions_datetime_captured"), + table_name="gradient_boosting_model_predictions", + ) + op.drop_table("gradient_boosting_model_predictions") + # ### end Alembic commands ### diff --git a/packages/ml_api/api/__init__.py b/packages/ml_api/api/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/packages/ml_api/api/app.py b/packages/ml_api/api/app.py new file mode 100644 index 0000000..f8f74e0 --- /dev/null +++ b/packages/ml_api/api/app.py @@ -0,0 +1,33 @@ +import logging + +import connexion +from sqlalchemy.orm import scoped_session + +from api.config import Config +from api.monitoring.middleware import setup_metrics +from api.persistence.core import init_database + +_logger = logging.getLogger('mlapi') + + +def create_app( + *, config_object: Config, db_session: scoped_session = None +) -> connexion.App: + """Create app instance.""" + + connexion_app = connexion.App( + __name__, debug=config_object.DEBUG, specification_dir="spec/" + ) + flask_app = connexion_app.app + flask_app.config.from_object(config_object) + + # Setup database + init_database(flask_app, config=config_object, db_session=db_session) + + # Setup prometheus monitoring + setup_metrics(flask_app) + + connexion_app.add_api("api.yaml") + _logger.info("Application instance created") + + return connexion_app diff --git a/packages/ml_api/api/config.py b/packages/ml_api/api/config.py new file mode 100644 index 0000000..1f528c4 --- /dev/null +++ b/packages/ml_api/api/config.py @@ -0,0 +1,101 @@ +import logging +import os +import pathlib +import sys +from logging.config import fileConfig + +import api + +# logging format +FORMATTER = logging.Formatter( + "%(asctime)s — %(name)s — %(levelname)s —" "%(funcName)s:%(lineno)d — %(message)s" +) + +# Project Directories 
+ROOT = pathlib.Path(api.__file__).resolve().parent.parent + +APP_NAME = 'ml_api' + + +class Config: + DEBUG = False + TESTING = False + ENV = os.getenv("FLASK_ENV", "production") + SERVER_PORT = int(os.getenv("SERVER_PORT", 5000)) + SERVER_HOST = os.getenv("SERVER_HOST", "0.0.0.0") + LOGGING_LEVEL = os.getenv("LOGGING_LEVEL", logging.INFO) + SHADOW_MODE_ACTIVE = os.getenv('SHADOW_MODE_ACTIVE', True) + SQLALCHEMY_DATABASE_URI = ( + f"postgresql+psycopg2://{os.getenv('DB_USER')}:" + f"{os.getenv('DB_PASSWORD')}@{os.getenv('DB_HOST')}/{os.getenv('DB_NAME')}" + ) + # DB config matches docker container + DB_USER = os.getenv("DB_USER", "user") + DB_PASSWORD = os.getenv("DB_PASSWORD", "password") + DB_PORT = os.getenv("DB_PORT", 6609) + DB_HOST = os.getenv("DB_HOST", "0.0.0.0") + DB_NAME = os.getenv("DB_NAME", "ml_api_dev") + + +class DevelopmentConfig(Config): + DEBUG = True + ENV = "development" # do not use in production! + LOGGING_LEVEL = logging.DEBUG + + +class TestingConfig(Config): + DEBUG = True + TESTING = True + LOGGING_LEVEL = logging.DEBUG + + # DB config matches test docker container + DB_USER = os.getenv("DB_USER", "test_user") + DB_PASSWORD = os.getenv("DB_PASSWORD", "password") + DB_PORT = os.getenv("DB_PORT", 6608) + DB_HOST = os.getenv("DB_HOST", "0.0.0.0") + DB_NAME = "ml_api_test" + SQLALCHEMY_DATABASE_URI = ( + f"postgresql+psycopg2://{DB_USER}:" + f"{DB_PASSWORD}@{DB_HOST}:{DB_PORT}/{DB_NAME}" + ) + + +class ProductionConfig(Config): + DB_USER = os.getenv("DB_USER", "user") + DB_PASSWORD = os.getenv("DB_PASSWORD", "password") + DB_PORT = os.getenv("DB_PORT", 6609) + DB_HOST = os.getenv("DB_HOST", "database") + DB_NAME = os.getenv("DB_NAME", "ml_api") + SQLALCHEMY_DATABASE_URI = ( + f"postgresql+psycopg2://{DB_USER}:" + f"{DB_PASSWORD}@{DB_HOST}:{DB_PORT}/{DB_NAME}" + ) + + +def get_console_handler(): + """Setup console logging handler.""" + console_handler = logging.StreamHandler(sys.stdout) + console_handler.setFormatter(FORMATTER) + return 
console_handler + + +def setup_app_logging(config: Config) -> None: + """Prepare custom logging for our application.""" + _disable_irrelevant_loggers() + fileConfig(ROOT / 'gunicorn_logging.conf') + logger = logging.getLogger('mlapi') + logger.setLevel(config.LOGGING_LEVEL) + + +def _disable_irrelevant_loggers() -> None: + """Disable loggers created by packages which create a lot of noise.""" + for logger_name in ( + "connexion.apis.flask_api", + "connexion.apis.abstract", + "connexion.decorators", + "connexion.operation", + "connexion.operations", + "connexion.app", + "openapi_spec_validator", + ): + logging.getLogger(logger_name).level = logging.WARNING diff --git a/packages/ml_api/api/controller.py b/packages/ml_api/api/controller.py new file mode 100644 index 0000000..47d9f07 --- /dev/null +++ b/packages/ml_api/api/controller.py @@ -0,0 +1,131 @@ +import json +import logging +import threading + +from flask import request, jsonify, Response, current_app +from prometheus_client import Histogram, Gauge, Info +from regression_model import __version__ as live_version + +from api.config import APP_NAME +from api.persistence.data_access import PredictionPersistence, ModelType +from gradient_boosting_model import __version__ as shadow_version +from gradient_boosting_model.predict import make_prediction + +_logger = logging.getLogger('mlapi') + + +PREDICTION_TRACKER = Histogram( + name='house_price_prediction_dollars', + documentation='ML Model Prediction on House Price', + labelnames=['app_name', 'model_name', 'model_version'] +) + +PREDICTION_GAUGE = Gauge( + name='house_price_gauge_dollars', + documentation='ML Model Prediction on House Price for min max calcs', + labelnames=['app_name', 'model_name', 'model_version'] +) + +PREDICTION_GAUGE.labels( + app_name=APP_NAME, + model_name=ModelType.LASSO.name, + model_version=live_version) + +MODEL_VERSIONS = Info( + 'model_version_details', + 'Capture model version information', +) + +MODEL_VERSIONS.info({ + 'live_model': 
ModelType.LASSO.name, + 'live_version': live_version, + 'shadow_model': ModelType.GRADIENT_BOOSTING.name, + 'shadow_version': shadow_version}) + + +def health(): + if request.method == "GET": + status = {"status": "ok"} + _logger.debug(status) + return jsonify(status) + + +def predict(): + if request.method == "POST": + # Step 1: Extract POST data from request body as JSON + json_data = request.get_json() + for entry in json_data: + _logger.info(entry) + + # Step 2a: Get and save live model predictions + persistence = PredictionPersistence(db_session=current_app.db_session) + result = persistence.make_save_predictions( + db_model=ModelType.LASSO, input_data=json_data + ) + + # Step 2b: Get and save shadow predictions asynchronously + if current_app.config.get("SHADOW_MODE_ACTIVE"): + _logger.debug( + f"Calling shadow model asynchronously: " + f"{ModelType.GRADIENT_BOOSTING.value}" + ) + thread = threading.Thread( + target=persistence.make_save_predictions, + kwargs={ + "db_model": ModelType.GRADIENT_BOOSTING, + "input_data": json_data, + }, + ) + thread.start() + + # Step 3: Handle errors + if result.errors: + _logger.warning(f"errors during prediction: {result.errors}") + return Response(json.dumps(result.errors), status=400) + + # Step 4: Monitoring + for _prediction in result.predictions: + PREDICTION_TRACKER.labels( + app_name=APP_NAME, + model_name=ModelType.LASSO.name, + model_version=live_version).observe(_prediction) + PREDICTION_GAUGE.labels( + app_name=APP_NAME, + model_name=ModelType.LASSO.name, + model_version=live_version).set(_prediction) + _logger.info( + f'Prediction results for model: {ModelType.LASSO.name} ' + f'version: {result.model_version} ' + f'Output values: {result.predictions}') + + # Step 5: Prepare prediction response + return jsonify( + { + "predictions": result.predictions, + "version": result.model_version, + "errors": result.errors, + } + ) + + +def predict_previous(): + if request.method == "POST": + # Step 1: Extract POST data from 
request body as JSON + json_data = request.get_json() + + # Step 2: Access the model prediction function (also validates data) + result = make_prediction(input_data=json_data) + + # Step 3: Handle errors + errors = result.get("errors") + if errors: + return Response(json.dumps(errors), status=400) + + # Step 4: Split out results + predictions = result.get("predictions").tolist() + version = result.get("version") + + # Step 5: Prepare prediction response + return jsonify( + {"predictions": predictions, "version": version, "errors": errors} + ) diff --git a/packages/ml_api/api/monitoring/__init__.py b/packages/ml_api/api/monitoring/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/packages/ml_api/api/monitoring/middleware.py b/packages/ml_api/api/monitoring/middleware.py new file mode 100644 index 0000000..31712ef --- /dev/null +++ b/packages/ml_api/api/monitoring/middleware.py @@ -0,0 +1,60 @@ +from flask import request, Flask +from flask.wrappers import Response +from prometheus_client import Counter, Histogram +import time + +from api.config import APP_NAME + + +# Counter and Histogram are examples of default metrics +# available from the prometheus Python client. 
+REQUEST_COUNT = Counter( + name='http_request_count', + documentation='App Request Count', + labelnames=['app_name', 'method', 'endpoint', 'http_status'] +) +REQUEST_LATENCY = Histogram( + name='http_request_latency_seconds', + documentation='Request latency', + labelnames=['app_name', 'endpoint'] +) + + +def start_timer() -> None: + """Get start time of a request.""" + request._prometheus_metrics_request_start_time = time.time() + + +def stop_timer(response: Response) -> Response: + """Get stop time of a request.""" + request_latency = time.time() - request._prometheus_metrics_request_start_time + REQUEST_LATENCY.labels( + app_name=APP_NAME, + endpoint=request.path).observe(request_latency) + return response + + +def record_request_data(response: Response) -> Response: + """Capture request data. + + Uses the flask request object to extract information such as + the HTTP request method, endpoint and HTTP status. + """ + REQUEST_COUNT.labels( + app_name=APP_NAME, + method=request.method, + endpoint=request.path, + http_status=response.status_code).inc() + return response + + +def setup_metrics(app: Flask) -> None: + """Setup Prometheus metrics. + + This function uses the flask before_request + and after_request hooks to capture metrics + with each HTTP request to the application. 
+ """ + app.before_request(start_timer) + app.after_request(record_request_data) + app.after_request(stop_timer) diff --git a/packages/ml_api/api/persistence/__init__.py b/packages/ml_api/api/persistence/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/packages/ml_api/api/persistence/core.py b/packages/ml_api/api/persistence/core.py new file mode 100644 index 0000000..aab5555 --- /dev/null +++ b/packages/ml_api/api/persistence/core.py @@ -0,0 +1,67 @@ +import logging +import os + +import alembic.config +from flask import Flask +from sqlalchemy import create_engine +from sqlalchemy.engine import Engine +from sqlalchemy.ext.declarative import declarative_base +from sqlalchemy.orm import scoped_session, sessionmaker +from sqlalchemy_utils import database_exists, create_database + +from api.config import Config, ROOT + +_logger = logging.getLogger('mlapi') + +# Base class for SQLAlchemy models +Base = declarative_base() + + +def create_db_engine_from_config(*, config: Config) -> Engine: + """The Engine is the starting point for any SQLAlchemy application. + + It’s “home base” for the actual database and its DBAPI, delivered to the SQLAlchemy + application through a connection pool and a Dialect, which describes how to talk to + a specific kind of database / DBAPI combination. + """ + + db_url = config.SQLALCHEMY_DATABASE_URI + if not database_exists(db_url): + create_database(db_url) + engine = create_engine(db_url) + + _logger.info(f"creating DB conn with URI: {db_url}") + return engine + + +def create_db_session(*, engine: Engine) -> scoped_session: + """Broadly speaking, the Session establishes all conversations with the database. + + It represents a “holding zone” for all the objects which you’ve loaded or + associated with it during its lifespan. 
+ """ + return scoped_session(sessionmaker(autocommit=False, autoflush=False, bind=engine)) + + +def init_database(app: Flask, config: Config, db_session=None) -> None: + """Connect to the database and attach DB session to the app.""" + + if not db_session: + engine = create_db_engine_from_config(config=config) + db_session = create_db_session(engine=engine) + + app.db_session = db_session + + @app.teardown_appcontext + def shutdown_session(exception=None): + db_session.remove() + + +def run_migrations(): + """Run the DB migrations prior to the tests.""" + + # alembic looks for the migrations in the current + # directory so we change to the correct directory. + os.chdir(str(ROOT)) + alembicArgs = ["--raiseerr", "upgrade", "head"] + alembic.config.main(argv=alembicArgs) diff --git a/packages/ml_api/api/persistence/data_access.py b/packages/ml_api/api/persistence/data_access.py new file mode 100644 index 0000000..f46d24f --- /dev/null +++ b/packages/ml_api/api/persistence/data_access.py @@ -0,0 +1,113 @@ +import enum +import json +import logging +import typing as t + +import numpy as np +import pandas as pd +from regression_model.predict import make_prediction as make_live_prediction +from sqlalchemy.orm.session import Session + +from api.persistence.models import ( + LassoModelPredictions, + GradientBoostingModelPredictions, +) +from gradient_boosting_model.predict import make_prediction as make_shadow_prediction + +_logger = logging.getLogger('mlapi') + + +SECONDARY_VARIABLES_TO_RENAME = { + "FirstFlrSF": "1stFlrSF", + "SecondFlrSF": "2ndFlrSF", + "ThreeSsnPortch": "3SsnPorch", +} + + +class ModelType(enum.Enum): + LASSO = "lasso" + GRADIENT_BOOSTING = "gradient_boosting" + + +class PredictionResult(t.NamedTuple): + errors: t.Any + predictions: np.array + model_version: str + + +MODEL_PREDICTION_MAP = { + ModelType.GRADIENT_BOOSTING: make_shadow_prediction, + ModelType.LASSO: make_live_prediction, +} + + +class PredictionPersistence: + def __init__(self, *, 
db_session: Session, user_id: str = None) -> None: + self.db_session = db_session + if not user_id: + # in reality, here we would use something like a UUID for anonymous users + # and if we had user logins, we would record the user ID. + self.user_id = "007" + + def make_save_predictions( + self, *, db_model: ModelType, input_data: t.List + ) -> PredictionResult: + """Get the prediction from a given model and persist it.""" + # Access the model prediction function via mapping + if db_model == ModelType.LASSO: + # we have to rename a few of the columns for backwards + # compatibility with the regression model package. + live_frame = pd.DataFrame(input_data) + input_data = live_frame.rename( + columns=SECONDARY_VARIABLES_TO_RENAME + ).to_dict(orient="records") + + result = MODEL_PREDICTION_MAP[db_model](input_data=input_data) + errors = None + try: + errors = result["errors"] + except KeyError: + # regression model `make_prediction` does not include errors + pass + + prediction_result = PredictionResult( + errors=errors, + predictions=result.get("predictions").tolist() if not errors else None, + model_version=result.get("version"), + ) + + if prediction_result.errors: + return prediction_result + + self.save_predictions( + inputs=input_data, prediction_result=prediction_result, db_model=db_model + ) + + return prediction_result + + def save_predictions( + self, + *, + inputs: t.List, + prediction_result: PredictionResult, + db_model: ModelType, + ) -> None: + """Persist model predictions to storage.""" + if db_model == db_model.LASSO: + prediction_data = LassoModelPredictions( + user_id=self.user_id, + model_version=prediction_result.model_version, + inputs=json.dumps(inputs), + outputs=json.dumps(prediction_result.predictions), + ) + else: + prediction_data = GradientBoostingModelPredictions( + user_id=self.user_id, + model_version=prediction_result.model_version, + inputs=json.dumps(inputs), + outputs=json.dumps(prediction_result.predictions), + ) + + 
self.db_session.add(prediction_data) + self.db_session.commit() + _logger.debug(f"saved data for model: {db_model}") diff --git a/packages/ml_api/api/persistence/models.py b/packages/ml_api/api/persistence/models.py new file mode 100644 index 0000000..65da0b8 --- /dev/null +++ b/packages/ml_api/api/persistence/models.py @@ -0,0 +1,29 @@ +from sqlalchemy import Column, String, DateTime, Integer +from sqlalchemy.dialects.postgresql import JSONB +from sqlalchemy.sql import func + +from api.persistence.core import Base + + +class LassoModelPredictions(Base): + __tablename__ = "regression_model_predictions" + id = Column(Integer, primary_key=True) + user_id = Column(String(36), nullable=False) + datetime_captured = Column( + DateTime(timezone=True), server_default=func.now(), index=True + ) + model_version = Column(String(36), nullable=False) + inputs = Column(JSONB) + outputs = Column(JSONB) + + +class GradientBoostingModelPredictions(Base): + __tablename__ = "gradient_boosting_model_predictions" + id = Column(Integer, primary_key=True) + user_id = Column(String(36), nullable=False) + datetime_captured = Column( + DateTime(timezone=True), server_default=func.now(), index=True + ) + model_version = Column(String(36), nullable=False) + inputs = Column(JSONB) + outputs = Column(JSONB) diff --git a/packages/ml_api/api/spec/__init__.py b/packages/ml_api/api/spec/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/packages/ml_api/api/spec/api.yaml b/packages/ml_api/api/spec/api.yaml new file mode 100644 index 0000000..84c3075 --- /dev/null +++ b/packages/ml_api/api/spec/api.yaml @@ -0,0 +1,147 @@ +openapi: 3.0.0 + +info: + title: Spec for House Price Prediction API + version: '1' + +servers: +- url: http://{base}:5000/ + description: API for performing house price predictions. 
+ variables: + base: + default: 0.0.0.0 + +paths: + /: + get: + operationId: api.controller.health + responses: + '200': + description: API Health Status + + /v1/predictions/regression: + post: + operationId: api.controller.predict + requestBody: + description: House details used to make price prediction + required: true + content: + application/json: + schema: + type: array + items: + $ref: '#/components/schemas/HouseDetails' + responses: + '200': + description: House Price Predictions + '400': + description: Bad request, house data validation failed + '5XX': + description: Unexpected error + + /v1/predictions/gradient: + post: + operationId: api.controller.predict_previous + requestBody: + description: House details used to make price prediction + required: true + content: + application/json: + schema: + type: array + items: + $ref: '#/components/schemas/HouseDetails' + responses: + '200': + description: House Price Predictions + '400': + description: Bad request, house data validation failed + '5XX': + description: Unexpected error + +components: + schemas: + HouseDetails: + type: object + description: "List of the houses to get predictions for." 
+ example: + Id: 1461 + MSSubClass: 20 + MSZoning: RH + LotFrontage: 80.0 + LotArea: 11622 + Street: Pave + Alley: null + LotShape: Reg + LandContour: Lvl + Utilities: AllPub + LotConfig: Inside + LandSlope: Gtl + Neighborhood: NAmes + Condition1: Feedr + Condition2: Norm + BldgType: 1Fam + HouseStyle: 1Story + OverallQual: 5 + OverallCond: 6 + YearBuilt: 1961 + YearRemodAdd: 1961 + RoofStyle: Gable + RoofMatl: CompShg + Exterior1st: VinylSd + Exterior2nd: VinylSd + MasVnrType: None + MasVnrArea: 0.0 + ExterQual: TA + ExterCond: TA + Foundation: CBlock + BsmtQual: TA + BsmtCond: TA + BsmtExposure: null + BsmtFinType1: Rec + BsmtFinSF1: 468.0 + BsmtFinType2: LwQ + BsmtFinSF2: 144.0 + BsmtUnfSF: 270.0 + TotalBsmtSF: 882.0 + Heating: GasA + HeatingQC: TA + CentralAir: Y + Electrical: SBrkr + 1stFlrSF: 896 + 2ndFlrSF: 0 + LowQualFinSF: 0 + GrLivArea: 896 + BsmtFullBath: 0.0 + BsmtHalfBath: 0.0 + FullBath: 1 + HalfBath: 0 + BedroomAbvGr: 2 + KitchenAbvGr: 1 + KitchenQual: TA + TotRmsAbvGrd: 5 + Functional: Typ + Fireplaces: 0 + FireplaceQu: null + GarageType: Attchd + GarageYrBlt: 1961.0 + GarageFinish: Unf + GarageCars: 1.0 + GarageArea: 730.0 + GarageQual: TA + GarageCond: TA + PavedDrive: Y + WoodDeckSF: 140 + OpenPorchSF: 0 + EnclosedPorch: 0 + 3SsnPorch: 0 + ScreenPorch: 120 + PoolArea: 0 + PoolQC: null + Fence: MnPrv + MiscFeature: null + MiscVal: 0 + MoSold: 6 + YrSold: 2010 + SaleType: WD + SaleCondition: Normal diff --git a/packages/ml_api/differential_tests/__init__.py b/packages/ml_api/differential_tests/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/packages/ml_api/differential_tests/__main__.py b/packages/ml_api/differential_tests/__main__.py new file mode 100644 index 0000000..cf86222 --- /dev/null +++ b/packages/ml_api/differential_tests/__main__.py @@ -0,0 +1,100 @@ +import json +from argparse import ArgumentParser, Namespace +from pathlib import Path +from typing import Mapping + +from differential_tests.compare import 
def parse_args() -> Namespace:
    """Parse the command line of the differential-test runner.

    Two sub-commands are defined; the chosen one is stored in
    ``args.command``:

    * ``compute [--base-url URL] tests_dir results_dir`` -- compute the
      predictions for a test set.
    * ``compare [--absolute-tolerance X] [--relative-tolerance X]
      expected_results_dir actual_results_dir`` -- compare actual results
      with expected results.

    :return: the parsed arguments.
    """
    parser = ArgumentParser()

    subparsers = parser.add_subparsers(dest="command")
    # Without this a bare invocation parses to command=None and main()
    # silently does nothing. Set as an attribute because the ``required``
    # keyword of add_subparsers is only available from Python 3.7.
    subparsers.required = True

    compute_parser = subparsers.add_parser(
        "compute", help="Compute the predictions for a test set"
    )
    compute_parser.add_argument(
        "--base-url",
        default=URL("http://0.0.0.0:5000"),
        type=URL,
        help="Base URL of the service to test",
    )
    compute_parser.add_argument(
        "tests_dir", type=Path, help="Directory containing the test set to use"
    )
    compute_parser.add_argument(
        "results_dir", type=Path, help="Directory to save the prediction results to"
    )

    compare_parser = subparsers.add_parser(
        "compare", help="Compare the actual results with the expected results"
    )
    compare_parser.add_argument(
        "--absolute-tolerance",
        dest="abs_tol",
        metavar="X",
        type=float,
        help="math.isclose(a, b, abs_tol=X)",
        default=1e-5,
    )
    compare_parser.add_argument(
        "--relative-tolerance",
        dest="rel_tol",
        metavar="X",
        type=float,
        default=1e-5,
        help="math.isclose(a, b, rel_tol=X)",
    )
    compare_parser.add_argument(
        "expected_results_dir",
        type=Path,
        help="Directory containing the expected results",
    )
    compare_parser.add_argument(
        "actual_results_dir", type=Path, help="Directory containing the actual results"
    )

    return parser.parse_args()
def compare_differences(
    *,
    expected_predictions: t.List,
    actual_predictions: t.List,
    rel_tol: t.Optional[float] = None,
    abs_tol: t.Optional[float] = None,
) -> None:
    """Check that two prediction lists agree element-wise within tolerance.

    The lists must have the same length; each pair is then compared with
    ``math.isclose``. Nothing is returned when everything matches.

    :param expected_predictions: the reference predictions.
    :param actual_predictions: the predictions being checked.
    :param rel_tol: is the relative tolerance – it is the maximum allowed
        difference between a and b, relative to the larger absolute value of
        a or b. For example, to set a tolerance of 5%, pass rel_tol=0.05.
        When left as ``None`` the ``math.isclose`` default (1e-09) applies,
        which assures that the two values are the same within about 9
        decimal digits. rel_tol must be greater than zero.
    :param abs_tol: abs_tol is the minimum absolute tolerance – useful for
        comparisons near zero. abs_tol must be at least zero. When left as
        ``None`` the ``math.isclose`` default applies.
    :raises ValueError: if the lists differ in length, or if any pair of
        predictions differs by more than the thresholds.
    """
    # The difference is signed: only a positive value means predictions are
    # missing. Testing plain truthiness would report surplus predictions as
    # "Missing -N predictions".
    only_in_expected = len(expected_predictions) - len(actual_predictions)

    if only_in_expected > 0:
        raise ValueError(f"Missing {only_in_expected} predictions")

    only_in_actual = len(actual_predictions) - len(expected_predictions)

    if only_in_actual > 0:
        raise ValueError(f"Found {only_in_actual} unexpected predictions")

    # Only forward the tolerances that were given so math.isclose keeps its
    # own defaults for the others.
    thresholds = {}

    if abs_tol is not None:
        thresholds["abs_tol"] = abs_tol

    if rel_tol is not None:
        thresholds["rel_tol"] = rel_tol

    for index, (actual_prediction, expected_prediction) in enumerate(
        zip(actual_predictions, expected_predictions)
    ):
        if not math.isclose(expected_prediction, actual_prediction, **thresholds):
            raise ValueError(
                f"Price prediction {index} has changed by more "
                f"than the thresholds: {thresholds}: "
                f"{expected_prediction} (expected) vs "
                f"{actual_prediction} (actual)"
            )
", + end="", + ) + + with expected_results_filename.open() as f: + expected_results = json.load(f) + + with actual_results_filename.open() as f: + actual_results = json.load(f) + + try: + compare_differences( + expected_predictions=expected_results["predictions"], + actual_predictions=actual_results["predictions"], + rel_tol=args.rel_tol, + abs_tol=args.abs_tol, + ) + except ValueError as exc: + cprint("ERROR", "red") + cprint(f" • {exc}", "red") + else: + cprint("OK", "green") diff --git a/packages/ml_api/differential_tests/sample_payloads/sample_input1.json b/packages/ml_api/differential_tests/sample_payloads/sample_input1.json new file mode 100644 index 0000000..61f96e6 --- /dev/null +++ b/packages/ml_api/differential_tests/sample_payloads/sample_input1.json @@ -0,0 +1,488 @@ +[{ + "Id": 1461, + "MSSubClass": 20, + "MSZoning": "RH", + "LotFrontage": 80.0, + "LotArea": 11622, + "Street": "Pave", + "Alley": null, + "LotShape": "Reg", + "LandContour": "Lvl", + "Utilities": "AllPub", + "LotConfig": "Inside", + "LandSlope": "Gtl", + "Neighborhood": "NAmes", + "Condition1": "Feedr", + "Condition2": "Norm", + "BldgType": "1Fam", + "HouseStyle": "1Story", + "OverallQual": 5, + "OverallCond": 6, + "YearBuilt": 1961, + "YearRemodAdd": 1961, + "RoofStyle": "Gable", + "RoofMatl": "CompShg", + "Exterior1st": "VinylSd", + "Exterior2nd": "VinylSd", + "MasVnrType": "None", + "MasVnrArea": 0.0, + "ExterQual": "TA", + "ExterCond": "TA", + "Foundation": "CBlock", + "BsmtQual": "TA", + "BsmtCond": "TA", + "BsmtExposure": "No", + "BsmtFinType1": "Rec", + "BsmtFinSF1": 468.0, + "BsmtFinType2": "LwQ", + "BsmtFinSF2": 144.0, + "BsmtUnfSF": 270.0, + "TotalBsmtSF": 882.0, + "Heating": "GasA", + "HeatingQC": "TA", + "CentralAir": "Y", + "Electrical": "SBrkr", + "1stFlrSF": 896, + "2ndFlrSF": 0, + "LowQualFinSF": 0, + "GrLivArea": 896, + "BsmtFullBath": 0.0, + "BsmtHalfBath": 0.0, + "FullBath": 1, + "HalfBath": 0, + "BedroomAbvGr": 2, + "KitchenAbvGr": 1, + "KitchenQual": "TA", + 
"TotRmsAbvGrd": 5, + "Functional": "Typ", + "Fireplaces": 0, + "FireplaceQu": null, + "GarageType": "Attchd", + "GarageYrBlt": 1961.0, + "GarageFinish": "Unf", + "GarageCars": 1.0, + "GarageArea": 730.0, + "GarageQual": "TA", + "GarageCond": "TA", + "PavedDrive": "Y", + "WoodDeckSF": 140, + "OpenPorchSF": 0, + "EnclosedPorch": 0, + "3SsnPorch": 0, + "ScreenPorch": 120, + "PoolArea": 0, + "PoolQC": null, + "Fence": "MnPrv", + "MiscFeature": null, + "MiscVal": 0, + "MoSold": 6, + "YrSold": 2010, + "SaleType": "WD", + "SaleCondition": "Normal" +}, { + "Id": 1461, + "MSSubClass": 20, + "MSZoning": "RH", + "LotFrontage": 80.0, + "LotArea": 11689, + "Street": "Pave", + "Alley": null, + "LotShape": "Reg", + "LandContour": "Lvl", + "Utilities": "AllPub", + "LotConfig": "Inside", + "LandSlope": "Gtl", + "Neighborhood": "NAmes", + "Condition1": "Feedr", + "Condition2": "Norm", + "BldgType": "1Fam", + "HouseStyle": "1Story", + "OverallQual": 5, + "OverallCond": 6, + "YearBuilt": 1969, + "YearRemodAdd": 1961, + "RoofStyle": "Gable", + "RoofMatl": "CompShg", + "Exterior1st": "VinylSd", + "Exterior2nd": "VinylSd", + "MasVnrType": "None", + "MasVnrArea": 0.0, + "ExterQual": "TA", + "ExterCond": "TA", + "Foundation": "CBlock", + "BsmtQual": "TA", + "BsmtCond": "TA", + "BsmtExposure": "No", + "BsmtFinType1": "Rec", + "BsmtFinSF1": 468.0, + "BsmtFinType2": "LwQ", + "BsmtFinSF2": 144.0, + "BsmtUnfSF": 270.0, + "TotalBsmtSF": 882.0, + "Heating": "GasA", + "HeatingQC": "TA", + "CentralAir": "Y", + "Electrical": "SBrkr", + "1stFlrSF": 752, + "2ndFlrSF": 0, + "LowQualFinSF": 0, + "GrLivArea": 896, + "BsmtFullBath": 0.0, + "BsmtHalfBath": 0.0, + "FullBath": 1, + "HalfBath": 0, + "BedroomAbvGr": 2, + "KitchenAbvGr": 1, + "KitchenQual": "TA", + "TotRmsAbvGrd": 5, + "Functional": "Typ", + "Fireplaces": 0, + "FireplaceQu": null, + "GarageType": "Attchd", + "GarageYrBlt": 1961.0, + "GarageFinish": "Unf", + "GarageCars": 1.0, + "GarageArea": 730.0, + "GarageQual": "TA", + "GarageCond": "TA", + 
"PavedDrive": "Y", + "WoodDeckSF": 140, + "OpenPorchSF": 0, + "EnclosedPorch": 0, + "3SsnPorch": 0, + "ScreenPorch": 120, + "PoolArea": 0, + "PoolQC": null, + "Fence": "MnPrv", + "MiscFeature": null, + "MiscVal": 0, + "MoSold": 6, + "YrSold": 2010, + "SaleType": "WD", + "SaleCondition": "Normal" +}, +{ + "Id": 1461, + "MSSubClass": 20, + "MSZoning": "RH", + "LotFrontage": 80.0, + "LotArea": 22689, + "Street": "Pave", + "Alley": null, + "LotShape": "Reg", + "LandContour": "Lvl", + "Utilities": "AllPub", + "LotConfig": "Inside", + "LandSlope": "Gtl", + "Neighborhood": "NAmes", + "Condition1": "Feedr", + "Condition2": "Norm", + "BldgType": "1Fam", + "HouseStyle": "1Story", + "OverallQual": 5, + "OverallCond": 6, + "YearBuilt": 1969, + "YearRemodAdd": 1961, + "RoofStyle": "Gable", + "RoofMatl": "CompShg", + "Exterior1st": "VinylSd", + "Exterior2nd": "VinylSd", + "MasVnrType": "None", + "MasVnrArea": 0.0, + "ExterQual": "TA", + "ExterCond": "TA", + "Foundation": "CBlock", + "BsmtQual": "TA", + "BsmtCond": "TA", + "BsmtExposure": "No", + "BsmtFinType1": "Rec", + "BsmtFinSF1": 468.0, + "BsmtFinType2": "LwQ", + "BsmtFinSF2": 144.0, + "BsmtUnfSF": 270.0, + "TotalBsmtSF": 882.0, + "Heating": "GasA", + "HeatingQC": "TA", + "CentralAir": "Y", + "Electrical": "SBrkr", + "1stFlrSF": 752, + "2ndFlrSF": 0, + "LowQualFinSF": 0, + "GrLivArea": 896, + "BsmtFullBath": 0.0, + "BsmtHalfBath": 0.0, + "FullBath": 1, + "HalfBath": 0, + "BedroomAbvGr": 2, + "KitchenAbvGr": 1, + "KitchenQual": "TA", + "TotRmsAbvGrd": 5, + "Functional": "Typ", + "Fireplaces": 0, + "FireplaceQu": null, + "GarageType": "Attchd", + "GarageYrBlt": 1961.0, + "GarageFinish": "Unf", + "GarageCars": 1.0, + "GarageArea": 730.0, + "GarageQual": "TA", + "GarageCond": "TA", + "PavedDrive": "Y", + "WoodDeckSF": 140, + "OpenPorchSF": 0, + "EnclosedPorch": 0, + "3SsnPorch": 0, + "ScreenPorch": 120, + "PoolArea": 0, + "PoolQC": null, + "Fence": "MnPrv", + "MiscFeature": null, + "MiscVal": 0, + "MoSold": 6, + "YrSold": 2010, 
+ "SaleType": "WD", + "SaleCondition": "Normal" +},{ + "Id": 1461, + "MSSubClass": 20, + "MSZoning": "RH", + "LotFrontage": 80.0, + "LotArea": 11689, + "Street": "Pave", + "Alley": null, + "LotShape": "Reg", + "LandContour": "Lvl", + "Utilities": "AllPub", + "LotConfig": "Inside", + "LandSlope": "Gtl", + "Neighborhood": "NAmes", + "Condition1": "Feedr", + "Condition2": "Norm", + "BldgType": "1Fam", + "HouseStyle": "1Story", + "OverallQual": 5, + "OverallCond": 6, + "YearBuilt": 1969, + "YearRemodAdd": 1961, + "RoofStyle": "Gable", + "RoofMatl": "CompShg", + "Exterior1st": "VinylSd", + "Exterior2nd": "VinylSd", + "MasVnrType": "None", + "MasVnrArea": 0.0, + "ExterQual": "TA", + "ExterCond": "TA", + "Foundation": "CBlock", + "BsmtQual": "TA", + "BsmtCond": "TA", + "BsmtExposure": "No", + "BsmtFinType1": "Rec", + "BsmtFinSF1": 468.0, + "BsmtFinType2": "LwQ", + "BsmtFinSF2": 144.0, + "BsmtUnfSF": 270.0, + "TotalBsmtSF": 882.0, + "Heating": "GasA", + "HeatingQC": "TA", + "CentralAir": "Y", + "Electrical": "SBrkr", + "1stFlrSF": 988, + "2ndFlrSF": 0, + "LowQualFinSF": 0, + "GrLivArea": 896, + "BsmtFullBath": 0.0, + "BsmtHalfBath": 0.0, + "FullBath": 1, + "HalfBath": 0, + "BedroomAbvGr": 2, + "KitchenAbvGr": 1, + "KitchenQual": "TA", + "TotRmsAbvGrd": 5, + "Functional": "Typ", + "Fireplaces": 0, + "FireplaceQu": null, + "GarageType": "Attchd", + "GarageYrBlt": 1961.0, + "GarageFinish": "Unf", + "GarageCars": 1.0, + "GarageArea": 730.0, + "GarageQual": "TA", + "GarageCond": "TA", + "PavedDrive": "Y", + "WoodDeckSF": 140, + "OpenPorchSF": 0, + "EnclosedPorch": 0, + "3SsnPorch": 0, + "ScreenPorch": 120, + "PoolArea": 0, + "PoolQC": null, + "Fence": "MnPrv", + "MiscFeature": null, + "MiscVal": 0, + "MoSold": 6, + "YrSold": 2010, + "SaleType": "WD", + "SaleCondition": "Normal" +},{ + "Id": 1461, + "MSSubClass": 20, + "MSZoning": "RH", + "LotFrontage": 80.0, + "LotArea": 11689, + "Street": "Pave", + "Alley": null, + "LotShape": "Reg", + "LandContour": "Lvl", + "Utilities": 
"AllPub", + "LotConfig": "Inside", + "LandSlope": "Gtl", + "Neighborhood": "NAmes", + "Condition1": "Feedr", + "Condition2": "Norm", + "BldgType": "1Fam", + "HouseStyle": "1Story", + "OverallQual": 5, + "OverallCond": 6, + "YearBuilt": 1969, + "YearRemodAdd": 1961, + "RoofStyle": "Gable", + "RoofMatl": "CompShg", + "Exterior1st": "VinylSd", + "Exterior2nd": "VinylSd", + "MasVnrType": "None", + "MasVnrArea": 0.0, + "ExterQual": "TA", + "ExterCond": "TA", + "Foundation": "CBlock", + "BsmtQual": "TA", + "BsmtCond": "TA", + "BsmtExposure": "No", + "BsmtFinType1": "Rec", + "BsmtFinSF1": 468.0, + "BsmtFinType2": "LwQ", + "BsmtFinSF2": 144.0, + "BsmtUnfSF": 270.0, + "TotalBsmtSF": 882.0, + "Heating": "GasA", + "HeatingQC": "TA", + "CentralAir": "Y", + "Electrical": "SBrkr", + "1stFlrSF": 752, + "2ndFlrSF": 0, + "LowQualFinSF": 0, + "GrLivArea": 896, + "BsmtFullBath": 0.0, + "BsmtHalfBath": 0.0, + "FullBath": 1, + "HalfBath": 0, + "BedroomAbvGr": 2, + "KitchenAbvGr": 1, + "KitchenQual": "TA", + "TotRmsAbvGrd": 5, + "Functional": "Typ", + "Fireplaces": 0, + "FireplaceQu": null, + "GarageType": "Attchd", + "GarageYrBlt": 1961.0, + "GarageFinish": "Unf", + "GarageCars": 1.0, + "GarageArea": 730.0, + "GarageQual": "TA", + "GarageCond": "TA", + "PavedDrive": "Y", + "WoodDeckSF": 140, + "OpenPorchSF": 0, + "EnclosedPorch": 0, + "3SsnPorch": 0, + "ScreenPorch": 120, + "PoolArea": 0, + "PoolQC": null, + "Fence": "MnPrv", + "MiscFeature": null, + "MiscVal": 0, + "MoSold": 6, + "YrSold": 2008, + "SaleType": "WD", + "SaleCondition": "Normal" +},{ + "Id": 1461, + "MSSubClass": 20, + "MSZoning": "RH", + "LotFrontage": 80.0, + "LotArea": 25000, + "Street": "Pave", + "Alley": null, + "LotShape": "Reg", + "LandContour": "Lvl", + "Utilities": "AllPub", + "LotConfig": "Inside", + "LandSlope": "Gtl", + "Neighborhood": "NAmes", + "Condition1": "Feedr", + "Condition2": "Norm", + "BldgType": "1Fam", + "HouseStyle": "1Story", + "OverallQual": 5, + "OverallCond": 6, + "YearBuilt": 1969, + 
"YearRemodAdd": 1961, + "RoofStyle": "Gable", + "RoofMatl": "CompShg", + "Exterior1st": "VinylSd", + "Exterior2nd": "VinylSd", + "MasVnrType": "None", + "MasVnrArea": 0.0, + "ExterQual": "TA", + "ExterCond": "TA", + "Foundation": "CBlock", + "BsmtQual": "TA", + "BsmtCond": "TA", + "BsmtExposure": "No", + "BsmtFinType1": "Rec", + "BsmtFinSF1": 468.0, + "BsmtFinType2": "LwQ", + "BsmtFinSF2": 144.0, + "BsmtUnfSF": 270.0, + "TotalBsmtSF": 882.0, + "Heating": "GasA", + "HeatingQC": "TA", + "CentralAir": "Y", + "Electrical": "SBrkr", + "1stFlrSF": 752, + "2ndFlrSF": 0, + "LowQualFinSF": 0, + "GrLivArea": 896, + "BsmtFullBath": 0.0, + "BsmtHalfBath": 0.0, + "FullBath": 1, + "HalfBath": 0, + "BedroomAbvGr": 2, + "KitchenAbvGr": 1, + "KitchenQual": "TA", + "TotRmsAbvGrd": 5, + "Functional": "Typ", + "Fireplaces": 0, + "FireplaceQu": null, + "GarageType": "Attchd", + "GarageYrBlt": 1961.0, + "GarageFinish": "Unf", + "GarageCars": 1.0, + "GarageArea": 730.0, + "GarageQual": "TA", + "GarageCond": "TA", + "PavedDrive": "Y", + "WoodDeckSF": 140, + "OpenPorchSF": 0, + "EnclosedPorch": 0, + "3SsnPorch": 0, + "ScreenPorch": 120, + "PoolArea": 0, + "PoolQC": null, + "Fence": "MnPrv", + "MiscFeature": null, + "MiscVal": 0, + "MoSold": 6, + "YrSold": 2010, + "SaleType": "WD", + "SaleCondition": "Normal" +}] \ No newline at end of file diff --git a/packages/ml_api/docker/Dockerfile b/packages/ml_api/docker/Dockerfile new file mode 100644 index 0000000..9c948fc --- /dev/null +++ b/packages/ml_api/docker/Dockerfile @@ -0,0 +1,20 @@ +FROM python:3.7.5-slim-buster + +RUN mkdir -p /opt/app +COPY requirements /opt/app/requirements +RUN pip install --upgrade pip + +# ensure we can run the make commands +RUN apt-get update -y && \ + apt-get install -y make && \ + apt-get install -y libffi-dev gcc && \ + # for swagger + apt-get install -y curl && \ + # for postgres driver + apt-get install -y libpq-dev + +RUN pip install -r /opt/app/requirements/requirements.txt +ENV PYTHONPATH 
"${PYTHONPATH}:/opt/app/" + +ADD . /opt/app +WORKDIR /opt/app diff --git a/packages/ml_api/docker/Dockerfile.test b/packages/ml_api/docker/Dockerfile.test new file mode 100644 index 0000000..46c29ac --- /dev/null +++ b/packages/ml_api/docker/Dockerfile.test @@ -0,0 +1,18 @@ +FROM python:3.7.5-slim-buster + +RUN mkdir -p /opt/app +COPY requirements /opt/app/requirements +RUN pip install --upgrade pip + +# ensure we can run the make commands +RUN apt-get update -y && \ + apt-get install -y make && \ + apt-get install -y libffi-dev gcc && \ + # for swagger + apt-get install -y curl + +ENV PYTHONPATH "${PYTHONPATH}:/opt/app" +RUN pip install -r /opt/app/requirements/test_requirements.txt + +ADD . /opt/app +WORKDIR /opt/app diff --git a/packages/ml_api/docker/config/grafana/basic_cadvisor_dashboard_ml_api.json b/packages/ml_api/docker/config/grafana/basic_cadvisor_dashboard_ml_api.json new file mode 100644 index 0000000..58b24a1 --- /dev/null +++ b/packages/ml_api/docker/config/grafana/basic_cadvisor_dashboard_ml_api.json @@ -0,0 +1,605 @@ +{ + "annotations": { + "list": [ + { + "builtIn": 1, + "datasource": "-- Grafana --", + "enable": true, + "hide": true, + "iconColor": "rgba(0, 211, 255, 1)", + "name": "Annotations & Alerts", + "type": "dashboard" + } + ] + }, + "description": "Docker monitoring with Prometheus and cAdvisor with node selection", + "editable": true, + "gnetId": 8321, + "graphTooltip": 1, + "id": 1, + "iteration": 1578230538273, + "links": [], + "panels": [ + { + "cacheTimeout": null, + "colorBackground": false, + "colorValue": false, + "colors": [ + "rgba(245, 54, 54, 0.9)", + "rgba(237, 129, 40, 0.89)", + "rgba(50, 172, 45, 0.97)" + ], + "datasource": "Prometheus", + "editable": true, + "error": false, + "format": "none", + "gauge": { + "maxValue": 100, + "minValue": 0, + "show": false, + "thresholdLabels": false, + "thresholdMarkers": true + }, + "gridPos": { + "h": 3, + "w": 8, + "x": 0, + "y": 0 + }, + "height": "20", + "id": 7, + "interval": 
null, + "isNew": true, + "links": [], + "mappingType": 1, + "mappingTypes": [ + { + "name": "value to text", + "value": 1 + }, + { + "name": "range to text", + "value": 2 + } + ], + "maxDataPoints": 100, + "nullPointMode": "connected", + "nullText": null, + "options": {}, + "postfix": "", + "postfixFontSize": "50%", + "prefix": "", + "prefixFontSize": "50%", + "rangeMaps": [ + { + "from": "null", + "text": "N/A", + "to": "null" + } + ], + "sparkline": { + "fillColor": "rgba(31, 118, 189, 0.18)", + "full": false, + "lineColor": "rgb(31, 120, 193)", + "show": false + }, + "tableColumn": "", + "targets": [ + { + "expr": "count(container_last_seen{instance=~\"$node:$port\",job=~\"$job\",image!=\"\"})", + "intervalFactor": 2, + "legendFormat": "", + "metric": "container_last_seen", + "refId": "A", + "step": 240 + } + ], + "thresholds": "", + "title": "Running containers", + "transparent": true, + "type": "singlestat", + "valueFontSize": "80%", + "valueMaps": [ + { + "op": "=", + "text": "N/A", + "value": "null" + } + ], + "valueName": "avg" + }, + { + "cacheTimeout": null, + "colorBackground": false, + "colorValue": false, + "colors": [ + "rgba(245, 54, 54, 0.9)", + "rgba(237, 129, 40, 0.89)", + "rgba(50, 172, 45, 0.97)" + ], + "datasource": "Prometheus", + "editable": true, + "error": false, + "format": "mbytes", + "gauge": { + "maxValue": 100, + "minValue": 0, + "show": false, + "thresholdLabels": false, + "thresholdMarkers": true + }, + "gridPos": { + "h": 3, + "w": 8, + "x": 8, + "y": 0 + }, + "height": "20", + "id": 5, + "interval": null, + "isNew": true, + "links": [], + "mappingType": 1, + "mappingTypes": [ + { + "name": "value to text", + "value": 1 + }, + { + "name": "range to text", + "value": 2 + } + ], + "maxDataPoints": 100, + "nullPointMode": "connected", + "nullText": null, + "options": {}, + "postfix": "", + "postfixFontSize": "50%", + "prefix": "", + "prefixFontSize": "50%", + "rangeMaps": [ + { + "from": "null", + "text": "N/A", + "to": "null" + } + ], 
+ "sparkline": { + "fillColor": "rgba(31, 118, 189, 0.18)", + "full": false, + "lineColor": "rgb(31, 120, 193)", + "show": false + }, + "tableColumn": "", + "targets": [ + { + "expr": "sum(container_memory_usage_bytes{instance=~\"$node:$port\",job=~\"$job\",image!=\"\"})/1024/1024", + "intervalFactor": 2, + "legendFormat": "", + "metric": "container_memory_usage_bytes", + "refId": "A", + "step": 240 + } + ], + "thresholds": "", + "title": "Total Memory Usage", + "transparent": true, + "type": "singlestat", + "valueFontSize": "80%", + "valueMaps": [ + { + "op": "=", + "text": "N/A", + "value": "null" + } + ], + "valueName": "current" + }, + { + "cacheTimeout": null, + "colorBackground": false, + "colorValue": false, + "colors": [ + "rgba(245, 54, 54, 0.9)", + "rgba(237, 129, 40, 0.89)", + "rgba(50, 172, 45, 0.97)" + ], + "datasource": "Prometheus", + "editable": true, + "error": false, + "format": "percent", + "gauge": { + "maxValue": 100, + "minValue": 0, + "show": false, + "thresholdLabels": false, + "thresholdMarkers": true + }, + "gridPos": { + "h": 3, + "w": 8, + "x": 16, + "y": 0 + }, + "height": "20", + "id": 6, + "interval": null, + "isNew": true, + "links": [], + "mappingType": 1, + "mappingTypes": [ + { + "name": "value to text", + "value": 1 + }, + { + "name": "range to text", + "value": 2 + } + ], + "maxDataPoints": 100, + "nullPointMode": "connected", + "nullText": null, + "options": {}, + "postfix": "", + "postfixFontSize": "50%", + "prefix": "", + "prefixFontSize": "50%", + "rangeMaps": [ + { + "from": "null", + "text": "N/A", + "to": "null" + } + ], + "sparkline": { + "fillColor": "rgba(31, 118, 189, 0.18)", + "full": false, + "lineColor": "rgb(31, 120, 193)", + "show": false + }, + "tableColumn": "", + "targets": [ + { + "expr": "sum(rate(container_cpu_user_seconds_total{instance=~\"$node:$port\",job=~\"$job\",image!=\"\"}[5m]) * 100)", + "intervalFactor": 2, + "legendFormat": "", + "metric": "container_memory_usage_bytes", + "refId": "A", + "step": 
240 + } + ], + "thresholds": "", + "title": "Total CPU Usage", + "transparent": true, + "type": "singlestat", + "valueFontSize": "80%", + "valueMaps": [ + { + "op": "=", + "text": "N/A", + "value": "null" + } + ], + "valueName": "current" + }, + { + "aliasColors": {}, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": "Prometheus", + "decimals": 2, + "editable": true, + "error": false, + "fill": 1, + "fillGradient": 0, + "grid": {}, + "gridPos": { + "h": 7, + "w": 24, + "x": 0, + "y": 3 + }, + "hiddenSeries": false, + "id": 2, + "isNew": true, + "legend": { + "alignAsTable": true, + "avg": true, + "current": true, + "max": false, + "min": false, + "rightSide": true, + "show": true, + "total": false, + "values": true + }, + "lines": true, + "linewidth": 2, + "links": [], + "nullPointMode": "connected", + "options": { + "dataLinks": [] + }, + "percentage": false, + "pointradius": 5, + "points": false, + "renderer": "flot", + "seriesOverrides": [], + "spaceLength": 10, + "stack": false, + "steppedLine": false, + "targets": [ + { + "expr": "rate(container_cpu_user_seconds_total{instance=~\"$node:$port\",job=~\"$job\",image!=\"\"}[5m]) * 100", + "intervalFactor": 2, + "legendFormat": "{{name}}", + "metric": "cpu", + "refId": "A", + "step": 10 + } + ], + "thresholds": [], + "timeFrom": null, + "timeRegions": [], + "timeShift": null, + "title": "CPU Usage", + "tooltip": { + "msResolution": false, + "shared": true, + "sort": 0, + "value_type": "cumulative" + }, + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [] + }, + "yaxes": [ + { + "format": "percent", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": true + }, + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": true + } + ], + "yaxis": { + "align": false, + "alignLevel": null + } + }, + { + "aliasColors": {}, + "bars": false, + "dashLength": 10, + "dashes": false, + 
"datasource": "Prometheus", + "decimals": 2, + "editable": true, + "error": false, + "fill": 1, + "fillGradient": 0, + "grid": {}, + "gridPos": { + "h": 7, + "w": 24, + "x": 0, + "y": 10 + }, + "hiddenSeries": false, + "id": 1, + "isNew": true, + "legend": { + "alignAsTable": true, + "avg": true, + "current": true, + "max": false, + "min": false, + "rightSide": true, + "show": true, + "total": false, + "values": true + }, + "lines": true, + "linewidth": 2, + "links": [], + "nullPointMode": "connected", + "options": { + "dataLinks": [] + }, + "percentage": false, + "pointradius": 5, + "points": false, + "renderer": "flot", + "seriesOverrides": [], + "spaceLength": 10, + "stack": false, + "steppedLine": false, + "targets": [ + { + "expr": "container_memory_usage_bytes{instance=~\"$node:$port\",job=~\"$job\",image!=\"\"}", + "hide": false, + "intervalFactor": 2, + "legendFormat": "{{name}}", + "metric": "container_memory_usage_bytes", + "refId": "A", + "step": 10 + } + ], + "thresholds": [], + "timeFrom": null, + "timeRegions": [], + "timeShift": null, + "title": "Memory Usage", + "tooltip": { + "msResolution": false, + "shared": true, + "sort": 0, + "value_type": "cumulative" + }, + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [] + }, + "yaxes": [ + { + "format": "bytes", + "label": "", + "logBase": 1, + "max": null, + "min": null, + "show": true + }, + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": false + } + ], + "yaxis": { + "align": false, + "alignLevel": null + } + } + ], + "refresh": "5s", + "schemaVersion": 21, + "style": "dark", + "tags": [ + "docker" + ], + "templating": { + "list": [ + { + "allValue": null, + "current": { + "text": "cadvisor", + "value": "cadvisor" + }, + "datasource": "Prometheus", + "definition": "", + "hide": 0, + "includeAll": false, + "label": "Job", + "multi": false, + "name": "job", + "options": [], + "query": 
"label_values(container_cpu_user_seconds_total, job)", + "refresh": 1, + "regex": "", + "skipUrlSync": false, + "sort": 1, + "tagValuesQuery": "", + "tags": [], + "tagsQuery": "", + "type": "query", + "useTags": false + }, + { + "allValue": null, + "current": { + "text": "cadvisor", + "value": "cadvisor" + }, + "datasource": "Prometheus", + "definition": "", + "hide": 0, + "includeAll": false, + "label": "Host:", + "multi": false, + "name": "node", + "options": [], + "query": "label_values(container_cpu_user_seconds_total{job=~\"$job\"}, instance)", + "refresh": 1, + "regex": "/([^:]+):.*/", + "skipUrlSync": false, + "sort": 1, + "tagValuesQuery": "", + "tags": [], + "tagsQuery": "", + "type": "query", + "useTags": false + }, + { + "allValue": null, + "current": { + "text": "8080", + "value": "8080" + }, + "datasource": "Prometheus", + "definition": "", + "hide": 0, + "includeAll": false, + "label": "Port", + "multi": false, + "name": "port", + "options": [], + "query": "label_values(container_cpu_user_seconds_total{instance=~\"$node:(.*)\"}, instance)", + "refresh": 1, + "regex": "/[^:]+:(.*)/", + "skipUrlSync": false, + "sort": 3, + "tagValuesQuery": "", + "tags": [], + "tagsQuery": "", + "type": "query", + "useTags": false + } + ] + }, + "time": { + "from": "now-5m", + "to": "now" + }, + "timepicker": { + "refresh_intervals": [ + "5s", + "10s", + "30s", + "1m", + "5m", + "15m", + "30m", + "1h", + "2h", + "1d" + ], + "time_options": [ + "5m", + "15m", + "1h", + "6h", + "12h", + "24h", + "2d", + "7d", + "30d" + ] + }, + "timezone": "browser", + "title": "Docker monitoring with node selection", + "uid": "pHUTSjLZk", + "version": 2 +} \ No newline at end of file diff --git a/packages/ml_api/docker/config/grafana/grafana_flask_basic_dashboard_ml_api.json b/packages/ml_api/docker/config/grafana/grafana_flask_basic_dashboard_ml_api.json new file mode 100644 index 0000000..39224a7 --- /dev/null +++ 
b/packages/ml_api/docker/config/grafana/grafana_flask_basic_dashboard_ml_api.json @@ -0,0 +1,224 @@ +{ + "annotations": { + "list": [ + { + "builtIn": 1, + "datasource": "-- Grafana --", + "enable": true, + "hide": true, + "iconColor": "rgba(0, 211, 255, 1)", + "name": "Annotations & Alerts", + "type": "dashboard" + } + ] + }, + "editable": true, + "gnetId": null, + "graphTooltip": 0, + "id": 3, + "links": [], + "panels": [ + { + "aliasColors": {}, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": "Prometheus", + "fill": 1, + "fillGradient": 0, + "gridPos": { + "h": 9, + "w": 12, + "x": 0, + "y": 0 + }, + "hiddenSeries": false, + "id": 2, + "legend": { + "avg": false, + "current": false, + "max": false, + "min": false, + "show": true, + "total": false, + "values": false + }, + "lines": true, + "linewidth": 1, + "nullPointMode": "null", + "options": { + "dataLinks": [] + }, + "percentage": false, + "pointradius": 2, + "points": false, + "renderer": "flot", + "seriesOverrides": [], + "spaceLength": 10, + "stack": false, + "steppedLine": false, + "targets": [ + { + "expr": "rate(http_request_count_total{job=\"ml_api\"}[5m])", + "legendFormat": "{{app_name}} {{method}} {{endpoint}} {{http_status}}", + "refId": "A" + } + ], + "thresholds": [], + "timeFrom": null, + "timeRegions": [], + "timeShift": null, + "title": "Requests Rate", + "tooltip": { + "shared": true, + "sort": 0, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [] + }, + "yaxes": [ + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": true + }, + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": true + } + ], + "yaxis": { + "align": false, + "alignLevel": null + } + }, + { + "aliasColors": {}, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": "Prometheus", + "fill": 1, + 
"fillGradient": 0, + "gridPos": { + "h": 9, + "w": 12, + "x": 12, + "y": 0 + }, + "hiddenSeries": false, + "id": 3, + "legend": { + "avg": false, + "current": false, + "max": false, + "min": false, + "show": true, + "total": false, + "values": false + }, + "lines": true, + "linewidth": 1, + "nullPointMode": "null", + "options": { + "dataLinks": [] + }, + "percentage": false, + "pointradius": 2, + "points": false, + "renderer": "flot", + "seriesOverrides": [], + "spaceLength": 10, + "stack": false, + "steppedLine": false, + "targets": [ + { + "expr": "sum (rate(http_request_latency_seconds_sum{job=\"ml_api\"}[5m])) / sum (rate(http_request_latency_seconds_count{job=\"ml_api\"}[5m]))", + "legendFormat": "Average (seconds)", + "refId": "A" + } + ], + "thresholds": [], + "timeFrom": null, + "timeRegions": [], + "timeShift": null, + "title": "Latency", + "tooltip": { + "shared": true, + "sort": 0, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [] + }, + "yaxes": [ + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": true + }, + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": true + } + ], + "yaxis": { + "align": false, + "alignLevel": null + } + } + ], + "schemaVersion": 21, + "style": "dark", + "tags": [], + "templating": { + "list": [] + }, + "time": { + "from": "now-1h", + "to": "now" + }, + "timepicker": { + "refresh_intervals": [ + "5s", + "10s", + "30s", + "1m", + "5m", + "15m", + "30m", + "1h", + "2h", + "1d" + ] + }, + "timezone": "", + "title": "Really Simple Flask Dashboard", + "uid": "q8vgEpLZl", + "version": 3 +} \ No newline at end of file diff --git a/packages/ml_api/docker/config/grafana/ml_api_dashboard.json b/packages/ml_api/docker/config/grafana/ml_api_dashboard.json new file mode 100644 index 0000000..b55b2ce --- /dev/null +++ 
b/packages/ml_api/docker/config/grafana/ml_api_dashboard.json @@ -0,0 +1,625 @@ +{ + "__inputs": [ + { + "name": "DS_PROMETHEUS", + "label": "Prometheus", + "description": "", + "type": "datasource", + "pluginId": "prometheus", + "pluginName": "Prometheus" + } + ], + "__requires": [ + { + "type": "panel", + "id": "gauge", + "name": "Gauge", + "version": "" + }, + { + "type": "grafana", + "id": "grafana", + "name": "Grafana", + "version": "6.6.1" + }, + { + "type": "panel", + "id": "graph", + "name": "Graph", + "version": "" + }, + { + "type": "datasource", + "id": "prometheus", + "name": "Prometheus", + "version": "1.0.0" + }, + { + "type": "panel", + "id": "table", + "name": "Table", + "version": "" + } + ], + "annotations": { + "list": [ + { + "builtIn": 1, + "datasource": "-- Grafana --", + "enable": true, + "hide": true, + "iconColor": "rgba(0, 211, 255, 1)", + "name": "Annotations & Alerts", + "type": "dashboard" + } + ] + }, + "description": "Machine learning-specific metrics", + "editable": true, + "gnetId": null, + "graphTooltip": 0, + "id": null, + "links": [], + "panels": [ + { + "cacheTimeout": null, + "columns": [], + "datasource": "${DS_PROMETHEUS}", + "fontSize": "100%", + "gridPos": { + "h": 4, + "w": 24, + "x": 0, + "y": 0 + }, + "id": 7, + "links": [], + "options": {}, + "pageSize": 1, + "pluginVersion": "6.5.2", + "showHeader": true, + "sort": { + "col": null, + "desc": false + }, + "styles": [ + { + "alias": "Time", + "align": "auto", + "dateFormat": "YYYY-MM-DD HH:mm:ss", + "pattern": "Time", + "type": "date" + }, + { + "alias": "", + "align": "auto", + "colorMode": null, + "colors": [ + "rgba(245, 54, 54, 0.9)", + "rgba(237, 129, 40, 0.89)", + "rgba(50, 172, 45, 0.97)" + ], + "decimals": 2, + "pattern": "/.*/", + "thresholds": [], + "type": "number", + "unit": "short" + } + ], + "targets": [ + { + "expr": "count by(live_model, live_version, shadow_model, shadow_version, version)(model_version_details_info\n* on (instance, job) 
group_left(version)\npython_info)", + "format": "table", + "legendFormat": "{{model_version}}", + "refId": "A" + } + ], + "timeFrom": null, + "timeShift": null, + "title": "Model Versions", + "transform": "table", + "transparent": true, + "type": "table" + }, + { + "aliasColors": {}, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": "${DS_PROMETHEUS}", + "fill": 1, + "fillGradient": 0, + "gridPos": { + "h": 5, + "w": 12, + "x": 0, + "y": 4 + }, + "hiddenSeries": false, + "id": 3, + "legend": { + "avg": false, + "current": false, + "max": false, + "min": false, + "show": true, + "total": false, + "values": false + }, + "lines": true, + "linewidth": 1, + "nullPointMode": "null", + "options": { + "dataLinks": [] + }, + "percentage": false, + "pointradius": 2, + "points": false, + "renderer": "flot", + "seriesOverrides": [], + "spaceLength": 10, + "stack": false, + "steppedLine": false, + "targets": [ + { + "expr": "sum (rate(house_price_prediction_dollars_sum{job=\"ml_api\"}[1m])) / sum (rate(house_price_prediction_dollars_count{job=\"ml_api\"}[1m]))", + "legendFormat": "Average Prediction Amount ($)", + "refId": "A" + } + ], + "thresholds": [], + "timeFrom": null, + "timeRegions": [], + "timeShift": null, + "title": "Average House Price Prediction Amount (USD)", + "tooltip": { + "shared": true, + "sort": 0, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [] + }, + "yaxes": [ + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": true + }, + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": true + } + ], + "yaxis": { + "align": false, + "alignLevel": null + } + }, + { + "aliasColors": {}, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": "${DS_PROMETHEUS}", + "fill": 1, + "fillGradient": 0, + "gridPos": { + "h": 5, + "w": 12, + "x": 12, + 
"y": 4 + }, + "hiddenSeries": false, + "id": 9, + "legend": { + "avg": false, + "current": false, + "max": false, + "min": false, + "show": true, + "total": false, + "values": false + }, + "lines": true, + "linewidth": 1, + "nullPointMode": "null", + "options": { + "dataLinks": [] + }, + "percentage": false, + "pointradius": 2, + "points": false, + "renderer": "flot", + "seriesOverrides": [], + "spaceLength": 10, + "stack": false, + "steppedLine": false, + "targets": [ + { + "expr": "rate(house_price_prediction_dollars_count{job=\"ml_api\"}[1m])", + "legendFormat": "Average Prediction Rate", + "refId": "A" + } + ], + "thresholds": [], + "timeFrom": null, + "timeRegions": [], + "timeShift": null, + "title": "Average House Price Prediction Rate (/second)", + "tooltip": { + "shared": true, + "sort": 0, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [] + }, + "yaxes": [ + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": true + }, + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": true + } + ], + "yaxis": { + "align": false, + "alignLevel": null + } + }, + { + "aliasColors": {}, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": "${DS_PROMETHEUS}", + "fill": 1, + "fillGradient": 0, + "gridPos": { + "h": 5, + "w": 12, + "x": 0, + "y": 9 + }, + "hiddenSeries": false, + "id": 6, + "legend": { + "avg": false, + "current": false, + "max": false, + "min": false, + "show": true, + "total": false, + "values": false + }, + "lines": true, + "linewidth": 1, + "nullPointMode": "null", + "options": { + "dataLinks": [] + }, + "percentage": false, + "pointradius": 2, + "points": false, + "renderer": "flot", + "seriesOverrides": [], + "spaceLength": 10, + "stack": false, + "steppedLine": false, + "targets": [ + { + "expr": "avg_over_time(house_price_gauge_dollars[1m])", + 
"legendFormat": "AVG", + "refId": "A" + }, + { + "expr": "stddev_over_time(house_price_gauge_dollars[1m])", + "legendFormat": "STD", + "refId": "B" + }, + { + "expr": "stddev_over_time(house_price_gauge_dollars[1m]) / (sqrt(count_over_time(house_price_prediction_dollars_count[1m])))", + "legendFormat": "SEM", + "refId": "C" + } + ], + "thresholds": [], + "timeFrom": null, + "timeRegions": [], + "timeShift": null, + "title": "Standard Error of the Mean (SEM)", + "tooltip": { + "shared": true, + "sort": 0, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [] + }, + "yaxes": [ + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": true + }, + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": true + } + ], + "yaxis": { + "align": false, + "alignLevel": null + } + }, + { + "aliasColors": {}, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": "${DS_PROMETHEUS}", + "fill": 1, + "fillGradient": 0, + "gridPos": { + "h": 5, + "w": 12, + "x": 12, + "y": 9 + }, + "hiddenSeries": false, + "id": 10, + "legend": { + "avg": false, + "current": false, + "max": false, + "min": false, + "show": true, + "total": false, + "values": false + }, + "lines": true, + "linewidth": 1, + "nullPointMode": "null", + "options": { + "dataLinks": [] + }, + "percentage": false, + "pointradius": 2, + "points": false, + "renderer": "flot", + "seriesOverrides": [], + "spaceLength": 10, + "stack": false, + "steppedLine": false, + "targets": [ + { + "expr": "(avg_over_time(house_price_gauge_dollars[1m]) - avg_over_time(house_price_gauge_dollars[1w])) / (stddev_over_time(house_price_gauge_dollars[1w]))", + "legendFormat": "Z-Score", + "refId": "A" + } + ], + "thresholds": [], + "timeFrom": null, + "timeRegions": [], + "timeShift": null, + "title": "Standard Score (Z-Score)", + "tooltip": { + "shared": 
true, + "sort": 0, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [] + }, + "yaxes": [ + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": true + }, + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": true + } + ], + "yaxis": { + "align": false, + "alignLevel": null + } + }, + { + "datasource": "${DS_PROMETHEUS}", + "gridPos": { + "h": 4, + "w": 12, + "x": 0, + "y": 14 + }, + "id": 5, + "options": { + "fieldOptions": { + "calcs": [ + "logmin" + ], + "defaults": { + "color": { + "mode": "thresholds" + }, + "decimals": 2, + "mappings": [], + "max": 100, + "min": 0, + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + }, + { + "color": "red", + "value": 80 + } + ] + }, + "unit": "currencyUSD" + }, + "overrides": [], + "values": false + }, + "orientation": "auto", + "showThresholdLabels": false, + "showThresholdMarkers": true + }, + "pluginVersion": "6.6.1", + "targets": [ + { + "expr": "house_price_gauge_dollars", + "legendFormat": "Average Prediction Amount ($)", + "refId": "A" + } + ], + "timeFrom": null, + "timeShift": null, + "title": "Min Prediction", + "type": "gauge" + }, + { + "datasource": "${DS_PROMETHEUS}", + "gridPos": { + "h": 4, + "w": 12, + "x": 12, + "y": 14 + }, + "id": 8, + "options": { + "fieldOptions": { + "calcs": [ + "max" + ], + "defaults": { + "color": { + "mode": "thresholds" + }, + "decimals": 2, + "mappings": [], + "max": 100, + "min": 0, + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + }, + { + "color": "red", + "value": 80 + } + ] + }, + "unit": "currencyUSD" + }, + "overrides": [], + "values": false + }, + "orientation": "auto", + "showThresholdLabels": false, + "showThresholdMarkers": true + }, + "pluginVersion": "6.6.1", + "targets": [ + { + "expr": 
"house_price_gauge_dollars", + "legendFormat": "Average Prediction Amount ($)", + "refId": "A" + } + ], + "timeFrom": null, + "timeShift": null, + "title": "Max Prediction", + "type": "gauge" + } + ], + "schemaVersion": 22, + "style": "dark", + "tags": [], + "templating": { + "list": [] + }, + "time": { + "from": "now-30m", + "to": "now" + }, + "timepicker": { + "refresh_intervals": [ + "5s", + "10s", + "30s", + "1m", + "5m", + "15m", + "30m", + "1h", + "2h", + "1d" + ] + }, + "timezone": "", + "title": "ML API Dashboard", + "uid": "q8vgEpLZk", + "version": 2 +} \ No newline at end of file diff --git a/packages/ml_api/docker/config/prometheus/prometheus.yml b/packages/ml_api/docker/config/prometheus/prometheus.yml new file mode 100644 index 0000000..1e9fa32 --- /dev/null +++ b/packages/ml_api/docker/config/prometheus/prometheus.yml @@ -0,0 +1,42 @@ +# my global config +global: + scrape_interval: 15s # By default, scrape targets every 15 seconds. + evaluation_interval: 15s # By default, evaluate rules every 15 seconds. + # scrape_timeout is set to the global default (10s). + + # Attach these labels to any time series or alerts when communicating with + # external systems (federation, remote storage, Alertmanager). + external_labels: + monitor: 'my-project' + +# Scrape configurations, one job per monitored service: +# Prometheus itself, the ML API, and cAdvisor. +scrape_configs: + # The job name is added as a label `job=<job_name>` to any timeseries scraped from this config. + - job_name: 'prometheus' + + # Override the global default and scrape targets from this job every 5 seconds. + scrape_interval: 5s + + # metrics_path defaults to '/metrics' + # scheme defaults to 'http'. + + static_configs: + - targets: ['prometheus:9090'] + - job_name: 'ml_api' + + # Override the global default and scrape targets from this job every 5 seconds. + scrape_interval: 5s + + # metrics_path defaults to '/metrics' + # scheme defaults to 'http'. 
+ static_configs: + - targets: ['ml_api:5000'] + + - job_name: 'cadvisor' + + # Override the global default and scrape targets from this job every 5 seconds. + scrape_interval: 5s + + static_configs: + - targets: ['cadvisor:8080'] diff --git a/packages/ml_api/docker/docker-compose-ci-candidate.yml b/packages/ml_api/docker/docker-compose-ci-candidate.yml new file mode 100644 index 0000000..47bbc16 --- /dev/null +++ b/packages/ml_api/docker/docker-compose-ci-candidate.yml @@ -0,0 +1,20 @@ +version: '3' +services: + + ml_api: + image: christophergs/ml_api:${TARGET} + environment: + SERVER_PORT: ${SERVER_PORT:-5001} + build: + context: ../ + dockerfile: docker/Dockerfile.test + ports: + - "5001:5001" + tty: true + command: bash -c "make run-service-development" + + differential-tests: + image: christophergs/ml_api:${TARGET} + command: ["true"] + depends_on: + - ml_api \ No newline at end of file diff --git a/packages/ml_api/docker/docker-compose-ci-master.yml b/packages/ml_api/docker/docker-compose-ci-master.yml new file mode 100644 index 0000000..c0844e3 --- /dev/null +++ b/packages/ml_api/docker/docker-compose-ci-master.yml @@ -0,0 +1,20 @@ +version: '3' +services: + + ml_api: + image: christophergs/ml_api:${TARGET} + environment: + SERVER_PORT: ${SERVER_PORT:-5000} + build: + context: ../ + dockerfile: docker/Dockerfile.test + ports: + - "5000:5000" + tty: true + command: bash -c "make run-service-development" + + differential-tests: + image: christophergs/ml_api:${TARGET} + command: ["true"] + depends_on: + - ml_api \ No newline at end of file diff --git a/packages/ml_api/docker/docker-compose-elk.yml b/packages/ml_api/docker/docker-compose-elk.yml new file mode 100644 index 0000000..3ed1806 --- /dev/null +++ b/packages/ml_api/docker/docker-compose-elk.yml @@ -0,0 +1,99 @@ +version: '3.2' +services: + ml_api: + build: + context: ../ + dockerfile: docker/Dockerfile + environment: + DB_HOST: database + DB_PORT: 5432 + DB_USER: user + DB_PASSWORD: 
${DB_PASSWORD:-password} + DB_NAME: ml_api_dev + networks: + - elk + depends_on: + - database + - logstash + ports: + - "5000:5000" # expose webserver to localhost host:container + command: bash -c "make db-migrations && make run-service-wsgi" + + database: + image: postgres:latest + environment: + POSTGRES_USER: user + POSTGRES_PASSWORD: password + POSTGRES_DB: ml_api_dev + ports: + # expose postgres container on different host port to default (host:container) + - "6609:5432" + volumes: + - my_dbdata:/var/lib/postgresql/data + networks: + - elk + + elasticsearch: + image: docker.elastic.co/elasticsearch/elasticsearch:${ELK_VERSION} + volumes: + - type: bind + source: ./elasticsearch/config/elasticsearch.yml + target: /usr/share/elasticsearch/config/elasticsearch.yml + read_only: true + - type: volume + source: elasticsearch + target: /usr/share/elasticsearch/data + ports: + - "9200:9200" + - "9300:9300" + environment: + ES_JAVA_OPTS: "-Xmx256m -Xms256m" + ELASTIC_PASSWORD: changeme + # Use single node discovery in order to disable production mode and avoid bootstrap checks + # see https://www.elastic.co/guide/en/elasticsearch/reference/current/bootstrap-checks.html + discovery.type: single-node + networks: + - elk + + logstash: + image: docker.elastic.co/logstash/logstash:${ELK_VERSION} + volumes: + - type: bind + source: ./logstash/config/logstash.yml + target: /usr/share/logstash/config/logstash.yml + read_only: true + - type: bind + source: ./logstash/pipeline + target: /usr/share/logstash/pipeline + read_only: true + ports: + - "5001:5001" + - "9600:9600" + environment: + LS_JAVA_OPTS: "-Xmx256m -Xms256m" + networks: + - elk + depends_on: + - elasticsearch + + kibana: + image: docker.elastic.co/kibana/kibana:${ELK_VERSION} + volumes: + - type: bind + source: ./kibana/config/kibana.yml + target: /usr/share/kibana/config/kibana.yml + read_only: true + ports: + - "5601:5601" + networks: + - elk + depends_on: + - elasticsearch + +networks: + elk: + driver: bridge 
+ +volumes: + my_dbdata: + elasticsearch: \ No newline at end of file diff --git a/packages/ml_api/docker/docker-compose.test.yml b/packages/ml_api/docker/docker-compose.test.yml new file mode 100644 index 0000000..44109b6 --- /dev/null +++ b/packages/ml_api/docker/docker-compose.test.yml @@ -0,0 +1,33 @@ +version: '3' +services: + ml_api_test: + image: christophergs/ml_api:master + build: + context: ../ + dockerfile: docker/Dockerfile.test + environment: + DB_HOST: test_database + DB_PORT: 5432 + DB_USER: test_user + DB_PASSWORD: ${DB_PASSWORD:-password} + DB_NAME: ml_api_test + depends_on: + - test_database + ports: + - "5000:5000" # expose webserver to localhost host:container + command: bash -c "make db-migrations && make run-service-development" + + test_database: + image: postgres:latest + environment: + POSTGRES_USER: test_user + POSTGRES_PASSWORD: password + POSTGRES_DB: ml_api_test + ports: + # expose postgres container on different host port to default (host:container) + - "6608:5432" + volumes: + - my_dbdata_test:/var/lib/postgresql/test_data + +volumes: + my_dbdata_test: diff --git a/packages/ml_api/docker/docker-compose.yml b/packages/ml_api/docker/docker-compose.yml new file mode 100644 index 0000000..92cc691 --- /dev/null +++ b/packages/ml_api/docker/docker-compose.yml @@ -0,0 +1,72 @@ +version: '3' +services: + ml_api: + build: + context: ../ + dockerfile: docker/Dockerfile + environment: + DB_HOST: database + DB_PORT: 5432 + DB_USER: user + DB_PASSWORD: ${DB_PASSWORD:-password} + DB_NAME: ml_api_dev + depends_on: + - database + - cadvisor + ports: + - "5000:5000" # expose webserver to localhost host:container + command: bash -c "make db-migrations && make run-service-wsgi" + + database: + image: postgres:latest + environment: + POSTGRES_USER: user + POSTGRES_PASSWORD: password + POSTGRES_DB: ml_api_dev + ports: + # expose postgres container on different host port to default (host:container) + - "6609:5432" + volumes: + - 
my_dbdata:/var/lib/postgresql/data + + prometheus: + image: prom/prometheus + container_name: prometheus + volumes: + - ./config/prometheus/:/etc/prometheus/ + - prometheus_data:/prometheus + command: + - '--config.file=/etc/prometheus/prometheus.yml' + expose: + - 9090 + ports: + - 9090:9090 + depends_on: + - cadvisor + + grafana: + image: grafana/grafana + depends_on: + - prometheus + ports: + - 3000:3000 + volumes: + - grafana_data:/var/lib/grafana + environment: + - GF_SECURITY_ADMIN_PASSWORD=foobar + - GF_USERS_ALLOW_SIGN_UP=false + + cadvisor: + image: google/cadvisor + volumes: + - /:/rootfs:ro + - /var/run:/var/run:rw + - /sys:/sys:ro + - /var/lib/docker/:/var/lib/docker:ro + ports: + - 8080:8080 + +volumes: + my_dbdata: {} + prometheus_data: {} + grafana_data: {} diff --git a/packages/ml_api/docker/elasticsearch/config/elasticsearch.yml b/packages/ml_api/docker/elasticsearch/config/elasticsearch.yml new file mode 100644 index 0000000..b831729 --- /dev/null +++ b/packages/ml_api/docker/elasticsearch/config/elasticsearch.yml @@ -0,0 +1,10 @@ +## Default Elasticsearch configuration from Elasticsearch base image. +## https://github.com/elastic/elasticsearch/blob/master/distribution/docker/src/docker/config/elasticsearch.yml +cluster.name: "docker-cluster" +network.host: 0.0.0.0 + +## X-Pack settings +## see https://www.elastic.co/guide/en/elasticsearch/reference/current/setup-xpack.html +xpack.license.self_generated.type: basic +xpack.security.enabled: true +xpack.monitoring.collection.enabled: true diff --git a/packages/ml_api/docker/kibana/config/kibana.yml b/packages/ml_api/docker/kibana/config/kibana.yml new file mode 100644 index 0000000..6f0a087 --- /dev/null +++ b/packages/ml_api/docker/kibana/config/kibana.yml @@ -0,0 +1,12 @@ +--- +## Default Kibana configuration from Kibana base image. 
+## https://github.com/elastic/kibana/blob/master/src/dev/build/tasks/os_packages/docker_generator/templates/kibana_yml.template.js + +server.name: kibana +server.host: "0" +elasticsearch.hosts: [ "http://elasticsearch:9200" ] +xpack.monitoring.ui.container.elasticsearch.enabled: true + +## X-Pack security credentials +elasticsearch.username: elastic +elasticsearch.password: changeme diff --git a/packages/ml_api/docker/kibana/config/kibana_example_inputs_dashboard.ndjson b/packages/ml_api/docker/kibana/config/kibana_example_inputs_dashboard.ndjson new file mode 100644 index 0000000..266941d --- /dev/null +++ b/packages/ml_api/docker/kibana/config/kibana_example_inputs_dashboard.ndjson @@ -0,0 +1,5 @@ +{"attributes":{"fields":"[{\"name\":\"1stFlrSF\",\"type\":\"number\",\"esTypes\":[\"long\"],\"count\":0,\"scripted\":false,\"searchable\":true,\"aggregatable\":true,\"readFromDocValues\":true},{\"name\":\"2ndFlrSF\",\"type\":\"number\",\"esTypes\":[\"long\"],\"count\":0,\"scripted\":false,\"searchable\":true,\"aggregatable\":true,\"readFromDocValues\":true},{\"name\":\"3SsnPorch\",\"type\":\"number\",\"esTypes\":[\"long\"],\"count\":0,\"scripted\":false,\"searchable\":true,\"aggregatable\":true,\"readFromDocValues\":true},{\"name\":\"@timestamp\",\"type\":\"date\",\"esTypes\":[\"date\"],\"count\":0,\"scripted\":false,\"searchable\":true,\"aggregatable\":true,\"readFromDocValues\":true},{\"name\":\"@version\",\"type\":\"string\",\"esTypes\":[\"text\"],\"count\":0,\"scripted\":false,\"searchable\":true,\"aggregatable\":false,\"readFromDocValues\":false},{\"name\":\"@version.keyword\",\"type\":\"string\",\"esTypes\":[\"keyword\"],\"count\":0,\"scripted\":false,\"searchable\":true,\"aggregatable\":true,\"readFromDocValues\":true,\"parent\":\"@version\",\"subType\":\"multi\"},{\"name\":\"Alley\",\"type\":\"string\",\"esTypes\":[\"text\"],\"count\":0,\"scripted\":false,\"searchable\":true,\"aggregatable\":false,\"readFromDocValues\":false},{\"name\":\"Alley.keyword\",\"type\
":\"string\",\"esTypes\":[\"keyword\"],\"count\":0,\"scripted\":false,\"searchable\":true,\"aggregatable\":true,\"readFromDocValues\":true,\"parent\":\"Alley\",\"subType\":\"multi\"},{\"name\":\"BedroomAbvGr\",\"type\":\"number\",\"esTypes\":[\"long\"],\"count\":0,\"scripted\":false,\"searchable\":true,\"aggregatable\":true,\"readFromDocValues\":true},{\"name\":\"BldgType\",\"type\":\"string\",\"esTypes\":[\"text\"],\"count\":0,\"scripted\":false,\"searchable\":true,\"aggregatable\":false,\"readFromDocValues\":false},{\"name\":\"BldgType.keyword\",\"type\":\"string\",\"esTypes\":[\"keyword\"],\"count\":0,\"scripted\":false,\"searchable\":true,\"aggregatable\":true,\"readFromDocValues\":true,\"parent\":\"BldgType\",\"subType\":\"multi\"},{\"name\":\"BsmtCond\",\"type\":\"string\",\"esTypes\":[\"text\"],\"count\":0,\"scripted\":false,\"searchable\":true,\"aggregatable\":false,\"readFromDocValues\":false},{\"name\":\"BsmtCond.keyword\",\"type\":\"string\",\"esTypes\":[\"keyword\"],\"count\":0,\"scripted\":false,\"searchable\":true,\"aggregatable\":true,\"readFromDocValues\":true,\"parent\":\"BsmtCond\",\"subType\":\"multi\"},{\"name\":\"BsmtExposure\",\"type\":\"string\",\"esTypes\":[\"text\"],\"count\":0,\"scripted\":false,\"searchable\":true,\"aggregatable\":false,\"readFromDocValues\":false},{\"name\":\"BsmtExposure.keyword\",\"type\":\"string\",\"esTypes\":[\"keyword\"],\"count\":0,\"scripted\":false,\"searchable\":true,\"aggregatable\":true,\"readFromDocValues\":true,\"parent\":\"BsmtExposure\",\"subType\":\"multi\"},{\"name\":\"BsmtFinSF1\",\"type\":\"number\",\"esTypes\":[\"long\"],\"count\":0,\"scripted\":false,\"searchable\":true,\"aggregatable\":true,\"readFromDocValues\":true},{\"name\":\"BsmtFinSF2\",\"type\":\"number\",\"esTypes\":[\"long\"],\"count\":0,\"scripted\":false,\"searchable\":true,\"aggregatable\":true,\"readFromDocValues\":true},{\"name\":\"BsmtFinType1\",\"type\":\"string\",\"esTypes\":[\"text\"],\"count\":0,\"scripted\":false,\"searchable\":t
rue,\"aggregatable\":false,\"readFromDocValues\":false},{\"name\":\"BsmtFinType1.keyword\",\"type\":\"string\",\"esTypes\":[\"keyword\"],\"count\":0,\"scripted\":false,\"searchable\":true,\"aggregatable\":true,\"readFromDocValues\":true,\"parent\":\"BsmtFinType1\",\"subType\":\"multi\"},{\"name\":\"BsmtFinType2\",\"type\":\"string\",\"esTypes\":[\"text\"],\"count\":0,\"scripted\":false,\"searchable\":true,\"aggregatable\":false,\"readFromDocValues\":false},{\"name\":\"BsmtFinType2.keyword\",\"type\":\"string\",\"esTypes\":[\"keyword\"],\"count\":0,\"scripted\":false,\"searchable\":true,\"aggregatable\":true,\"readFromDocValues\":true,\"parent\":\"BsmtFinType2\",\"subType\":\"multi\"},{\"name\":\"BsmtFullBath\",\"type\":\"number\",\"esTypes\":[\"long\"],\"count\":0,\"scripted\":false,\"searchable\":true,\"aggregatable\":true,\"readFromDocValues\":true},{\"name\":\"BsmtHalfBath\",\"type\":\"number\",\"esTypes\":[\"long\"],\"count\":0,\"scripted\":false,\"searchable\":true,\"aggregatable\":true,\"readFromDocValues\":true},{\"name\":\"BsmtQual\",\"type\":\"string\",\"esTypes\":[\"text\"],\"count\":0,\"scripted\":false,\"searchable\":true,\"aggregatable\":false,\"readFromDocValues\":false},{\"name\":\"BsmtQual.keyword\",\"type\":\"string\",\"esTypes\":[\"keyword\"],\"count\":0,\"scripted\":false,\"searchable\":true,\"aggregatable\":true,\"readFromDocValues\":true,\"parent\":\"BsmtQual\",\"subType\":\"multi\"},{\"name\":\"BsmtUnfSF\",\"type\":\"number\",\"esTypes\":[\"long\"],\"count\":0,\"scripted\":false,\"searchable\":true,\"aggregatable\":true,\"readFromDocValues\":true},{\"name\":\"CentralAir\",\"type\":\"string\",\"esTypes\":[\"text\"],\"count\":0,\"scripted\":false,\"searchable\":true,\"aggregatable\":false,\"readFromDocValues\":false},{\"name\":\"CentralAir.keyword\",\"type\":\"string\",\"esTypes\":[\"keyword\"],\"count\":0,\"scripted\":false,\"searchable\":true,\"aggregatable\":true,\"readFromDocValues\":true,\"parent\":\"CentralAir\",\"subType\":\"multi\"},{\"na
me\":\"Condition1\",\"type\":\"string\",\"esTypes\":[\"text\"],\"count\":0,\"scripted\":false,\"searchable\":true,\"aggregatable\":false,\"readFromDocValues\":false},{\"name\":\"Condition1.keyword\",\"type\":\"string\",\"esTypes\":[\"keyword\"],\"count\":0,\"scripted\":false,\"searchable\":true,\"aggregatable\":true,\"readFromDocValues\":true,\"parent\":\"Condition1\",\"subType\":\"multi\"},{\"name\":\"Condition2\",\"type\":\"string\",\"esTypes\":[\"text\"],\"count\":0,\"scripted\":false,\"searchable\":true,\"aggregatable\":false,\"readFromDocValues\":false},{\"name\":\"Condition2.keyword\",\"type\":\"string\",\"esTypes\":[\"keyword\"],\"count\":0,\"scripted\":false,\"searchable\":true,\"aggregatable\":true,\"readFromDocValues\":true,\"parent\":\"Condition2\",\"subType\":\"multi\"},{\"name\":\"Electrical\",\"type\":\"string\",\"esTypes\":[\"text\"],\"count\":0,\"scripted\":false,\"searchable\":true,\"aggregatable\":false,\"readFromDocValues\":false},{\"name\":\"Electrical.keyword\",\"type\":\"string\",\"esTypes\":[\"keyword\"],\"count\":0,\"scripted\":false,\"searchable\":true,\"aggregatable\":true,\"readFromDocValues\":true,\"parent\":\"Electrical\",\"subType\":\"multi\"},{\"name\":\"EnclosedPorch\",\"type\":\"number\",\"esTypes\":[\"long\"],\"count\":0,\"scripted\":false,\"searchable\":true,\"aggregatable\":true,\"readFromDocValues\":true},{\"name\":\"ExterCond\",\"type\":\"string\",\"esTypes\":[\"text\"],\"count\":0,\"scripted\":false,\"searchable\":true,\"aggregatable\":false,\"readFromDocValues\":false},{\"name\":\"ExterCond.keyword\",\"type\":\"string\",\"esTypes\":[\"keyword\"],\"count\":0,\"scripted\":false,\"searchable\":true,\"aggregatable\":true,\"readFromDocValues\":true,\"parent\":\"ExterCond\",\"subType\":\"multi\"},{\"name\":\"ExterQual\",\"type\":\"string\",\"esTypes\":[\"text\"],\"count\":0,\"scripted\":false,\"searchable\":true,\"aggregatable\":false,\"readFromDocValues\":false},{\"name\":\"ExterQual.keyword\",\"type\":\"string\",\"esTypes\":[\"key
word\"],\"count\":0,\"scripted\":false,\"searchable\":true,\"aggregatable\":true,\"readFromDocValues\":true,\"parent\":\"ExterQual\",\"subType\":\"multi\"},{\"name\":\"Exterior1st\",\"type\":\"string\",\"esTypes\":[\"text\"],\"count\":0,\"scripted\":false,\"searchable\":true,\"aggregatable\":false,\"readFromDocValues\":false},{\"name\":\"Exterior1st.keyword\",\"type\":\"string\",\"esTypes\":[\"keyword\"],\"count\":0,\"scripted\":false,\"searchable\":true,\"aggregatable\":true,\"readFromDocValues\":true,\"parent\":\"Exterior1st\",\"subType\":\"multi\"},{\"name\":\"Exterior2nd\",\"type\":\"string\",\"esTypes\":[\"text\"],\"count\":0,\"scripted\":false,\"searchable\":true,\"aggregatable\":false,\"readFromDocValues\":false},{\"name\":\"Exterior2nd.keyword\",\"type\":\"string\",\"esTypes\":[\"keyword\"],\"count\":0,\"scripted\":false,\"searchable\":true,\"aggregatable\":true,\"readFromDocValues\":true,\"parent\":\"Exterior2nd\",\"subType\":\"multi\"},{\"name\":\"Fence\",\"type\":\"string\",\"esTypes\":[\"text\"],\"count\":0,\"scripted\":false,\"searchable\":true,\"aggregatable\":false,\"readFromDocValues\":false},{\"name\":\"Fence.keyword\",\"type\":\"string\",\"esTypes\":[\"keyword\"],\"count\":0,\"scripted\":false,\"searchable\":true,\"aggregatable\":true,\"readFromDocValues\":true,\"parent\":\"Fence\",\"subType\":\"multi\"},{\"name\":\"FireplaceQu\",\"type\":\"string\",\"esTypes\":[\"text\"],\"count\":0,\"scripted\":false,\"searchable\":true,\"aggregatable\":false,\"readFromDocValues\":false},{\"name\":\"FireplaceQu.keyword\",\"type\":\"string\",\"esTypes\":[\"keyword\"],\"count\":0,\"scripted\":false,\"searchable\":true,\"aggregatable\":true,\"readFromDocValues\":true,\"parent\":\"FireplaceQu\",\"subType\":\"multi\"},{\"name\":\"Fireplaces\",\"type\":\"number\",\"esTypes\":[\"long\"],\"count\":0,\"scripted\":false,\"searchable\":true,\"aggregatable\":true,\"readFromDocValues\":true},{\"name\":\"FirstFlrSF\",\"type\":\"number\",\"esTypes\":[\"long\"],\"count\":0,\"scr
ipted\":false,\"searchable\":true,\"aggregatable\":true,\"readFromDocValues\":true},{\"name\":\"Foundation\",\"type\":\"string\",\"esTypes\":[\"text\"],\"count\":0,\"scripted\":false,\"searchable\":true,\"aggregatable\":false,\"readFromDocValues\":false},{\"name\":\"Foundation.keyword\",\"type\":\"string\",\"esTypes\":[\"keyword\"],\"count\":0,\"scripted\":false,\"searchable\":true,\"aggregatable\":true,\"readFromDocValues\":true,\"parent\":\"Foundation\",\"subType\":\"multi\"},{\"name\":\"FullBath\",\"type\":\"number\",\"esTypes\":[\"long\"],\"count\":0,\"scripted\":false,\"searchable\":true,\"aggregatable\":true,\"readFromDocValues\":true},{\"name\":\"Functional\",\"type\":\"string\",\"esTypes\":[\"text\"],\"count\":0,\"scripted\":false,\"searchable\":true,\"aggregatable\":false,\"readFromDocValues\":false},{\"name\":\"Functional.keyword\",\"type\":\"string\",\"esTypes\":[\"keyword\"],\"count\":0,\"scripted\":false,\"searchable\":true,\"aggregatable\":true,\"readFromDocValues\":true,\"parent\":\"Functional\",\"subType\":\"multi\"},{\"name\":\"GarageArea\",\"type\":\"number\",\"esTypes\":[\"long\"],\"count\":0,\"scripted\":false,\"searchable\":true,\"aggregatable\":true,\"readFromDocValues\":true},{\"name\":\"GarageCars\",\"type\":\"number\",\"esTypes\":[\"long\"],\"count\":0,\"scripted\":false,\"searchable\":true,\"aggregatable\":true,\"readFromDocValues\":true},{\"name\":\"GarageCond\",\"type\":\"string\",\"esTypes\":[\"text\"],\"count\":0,\"scripted\":false,\"searchable\":true,\"aggregatable\":false,\"readFromDocValues\":false},{\"name\":\"GarageCond.keyword\",\"type\":\"string\",\"esTypes\":[\"keyword\"],\"count\":0,\"scripted\":false,\"searchable\":true,\"aggregatable\":true,\"readFromDocValues\":true,\"parent\":\"GarageCond\",\"subType\":\"multi\"},{\"name\":\"GarageFinish\",\"type\":\"string\",\"esTypes\":[\"text\"],\"count\":0,\"scripted\":false,\"searchable\":true,\"aggregatable\":false,\"readFromDocValues\":false},{\"name\":\"GarageFinish.keyword\",\"type
\":\"string\",\"esTypes\":[\"keyword\"],\"count\":0,\"scripted\":false,\"searchable\":true,\"aggregatable\":true,\"readFromDocValues\":true,\"parent\":\"GarageFinish\",\"subType\":\"multi\"},{\"name\":\"GarageQual\",\"type\":\"string\",\"esTypes\":[\"text\"],\"count\":0,\"scripted\":false,\"searchable\":true,\"aggregatable\":false,\"readFromDocValues\":false},{\"name\":\"GarageQual.keyword\",\"type\":\"string\",\"esTypes\":[\"keyword\"],\"count\":0,\"scripted\":false,\"searchable\":true,\"aggregatable\":true,\"readFromDocValues\":true,\"parent\":\"GarageQual\",\"subType\":\"multi\"},{\"name\":\"GarageType\",\"type\":\"string\",\"esTypes\":[\"text\"],\"count\":0,\"scripted\":false,\"searchable\":true,\"aggregatable\":false,\"readFromDocValues\":false},{\"name\":\"GarageType.keyword\",\"type\":\"string\",\"esTypes\":[\"keyword\"],\"count\":0,\"scripted\":false,\"searchable\":true,\"aggregatable\":true,\"readFromDocValues\":true,\"parent\":\"GarageType\",\"subType\":\"multi\"},{\"name\":\"GarageYrBlt\",\"type\":\"number\",\"esTypes\":[\"long\"],\"count\":0,\"scripted\":false,\"searchable\":true,\"aggregatable\":true,\"readFromDocValues\":true},{\"name\":\"GrLivArea\",\"type\":\"number\",\"esTypes\":[\"long\"],\"count\":0,\"scripted\":false,\"searchable\":true,\"aggregatable\":true,\"readFromDocValues\":true},{\"name\":\"HalfBath\",\"type\":\"number\",\"esTypes\":[\"long\"],\"count\":0,\"scripted\":false,\"searchable\":true,\"aggregatable\":true,\"readFromDocValues\":true},{\"name\":\"Heating\",\"type\":\"string\",\"esTypes\":[\"text\"],\"count\":0,\"scripted\":false,\"searchable\":true,\"aggregatable\":false,\"readFromDocValues\":false},{\"name\":\"Heating.keyword\",\"type\":\"string\",\"esTypes\":[\"keyword\"],\"count\":0,\"scripted\":false,\"searchable\":true,\"aggregatable\":true,\"readFromDocValues\":true,\"parent\":\"Heating\",\"subType\":\"multi\"},{\"name\":\"HeatingQC\",\"type\":\"string\",\"esTypes\":[\"text\"],\"count\":0,\"scripted\":false,\"searchable\":tru
e,\"aggregatable\":false,\"readFromDocValues\":false},{\"name\":\"HeatingQC.keyword\",\"type\":\"string\",\"esTypes\":[\"keyword\"],\"count\":0,\"scripted\":false,\"searchable\":true,\"aggregatable\":true,\"readFromDocValues\":true,\"parent\":\"HeatingQC\",\"subType\":\"multi\"},{\"name\":\"HouseStyle\",\"type\":\"string\",\"esTypes\":[\"text\"],\"count\":0,\"scripted\":false,\"searchable\":true,\"aggregatable\":false,\"readFromDocValues\":false},{\"name\":\"HouseStyle.keyword\",\"type\":\"string\",\"esTypes\":[\"keyword\"],\"count\":0,\"scripted\":false,\"searchable\":true,\"aggregatable\":true,\"readFromDocValues\":true,\"parent\":\"HouseStyle\",\"subType\":\"multi\"},{\"name\":\"Id\",\"type\":\"number\",\"esTypes\":[\"long\"],\"count\":0,\"scripted\":false,\"searchable\":true,\"aggregatable\":true,\"readFromDocValues\":true},{\"name\":\"KitchenAbvGr\",\"type\":\"number\",\"esTypes\":[\"long\"],\"count\":0,\"scripted\":false,\"searchable\":true,\"aggregatable\":true,\"readFromDocValues\":true},{\"name\":\"KitchenQual\",\"type\":\"string\",\"esTypes\":[\"text\"],\"count\":0,\"scripted\":false,\"searchable\":true,\"aggregatable\":false,\"readFromDocValues\":false},{\"name\":\"KitchenQual.keyword\",\"type\":\"string\",\"esTypes\":[\"keyword\"],\"count\":0,\"scripted\":false,\"searchable\":true,\"aggregatable\":true,\"readFromDocValues\":true,\"parent\":\"KitchenQual\",\"subType\":\"multi\"},{\"name\":\"LandContour\",\"type\":\"string\",\"esTypes\":[\"text\"],\"count\":0,\"scripted\":false,\"searchable\":true,\"aggregatable\":false,\"readFromDocValues\":false},{\"name\":\"LandContour.keyword\",\"type\":\"string\",\"esTypes\":[\"keyword\"],\"count\":0,\"scripted\":false,\"searchable\":true,\"aggregatable\":true,\"readFromDocValues\":true,\"parent\":\"LandContour\",\"subType\":\"multi\"},{\"name\":\"LandSlope\",\"type\":\"string\",\"esTypes\":[\"text\"],\"count\":0,\"scripted\":false,\"searchable\":true,\"aggregatable\":false,\"readFromDocValues\":false},{\"name\":\"Lan
dSlope.keyword\",\"type\":\"string\",\"esTypes\":[\"keyword\"],\"count\":0,\"scripted\":false,\"searchable\":true,\"aggregatable\":true,\"readFromDocValues\":true,\"parent\":\"LandSlope\",\"subType\":\"multi\"},{\"name\":\"LotArea\",\"type\":\"number\",\"esTypes\":[\"long\"],\"count\":0,\"scripted\":false,\"searchable\":true,\"aggregatable\":true,\"readFromDocValues\":true},{\"name\":\"LotConfig\",\"type\":\"string\",\"esTypes\":[\"text\"],\"count\":0,\"scripted\":false,\"searchable\":true,\"aggregatable\":false,\"readFromDocValues\":false},{\"name\":\"LotConfig.keyword\",\"type\":\"string\",\"esTypes\":[\"keyword\"],\"count\":0,\"scripted\":false,\"searchable\":true,\"aggregatable\":true,\"readFromDocValues\":true,\"parent\":\"LotConfig\",\"subType\":\"multi\"},{\"name\":\"LotFrontage\",\"type\":\"number\",\"esTypes\":[\"long\"],\"count\":0,\"scripted\":false,\"searchable\":true,\"aggregatable\":true,\"readFromDocValues\":true},{\"name\":\"LotShape\",\"type\":\"string\",\"esTypes\":[\"text\"],\"count\":0,\"scripted\":false,\"searchable\":true,\"aggregatable\":false,\"readFromDocValues\":false},{\"name\":\"LotShape.keyword\",\"type\":\"string\",\"esTypes\":[\"keyword\"],\"count\":0,\"scripted\":false,\"searchable\":true,\"aggregatable\":true,\"readFromDocValues\":true,\"parent\":\"LotShape\",\"subType\":\"multi\"},{\"name\":\"LowQualFinSF\",\"type\":\"number\",\"esTypes\":[\"long\"],\"count\":0,\"scripted\":false,\"searchable\":true,\"aggregatable\":true,\"readFromDocValues\":true},{\"name\":\"MSSubClass\",\"type\":\"number\",\"esTypes\":[\"long\"],\"count\":0,\"scripted\":false,\"searchable\":true,\"aggregatable\":true,\"readFromDocValues\":true},{\"name\":\"MSZoning\",\"type\":\"string\",\"esTypes\":[\"text\"],\"count\":0,\"scripted\":false,\"searchable\":true,\"aggregatable\":false,\"readFromDocValues\":false},{\"name\":\"MSZoning.keyword\",\"type\":\"string\",\"esTypes\":[\"keyword\"],\"count\":0,\"scripted\":false,\"searchable\":true,\"aggregatable\":true,\"rea
dFromDocValues\":true,\"parent\":\"MSZoning\",\"subType\":\"multi\"},{\"name\":\"MasVnrArea\",\"type\":\"number\",\"esTypes\":[\"long\"],\"count\":0,\"scripted\":false,\"searchable\":true,\"aggregatable\":true,\"readFromDocValues\":true},{\"name\":\"MasVnrType\",\"type\":\"string\",\"esTypes\":[\"text\"],\"count\":0,\"scripted\":false,\"searchable\":true,\"aggregatable\":false,\"readFromDocValues\":false},{\"name\":\"MasVnrType.keyword\",\"type\":\"string\",\"esTypes\":[\"keyword\"],\"count\":0,\"scripted\":false,\"searchable\":true,\"aggregatable\":true,\"readFromDocValues\":true,\"parent\":\"MasVnrType\",\"subType\":\"multi\"},{\"name\":\"MiscFeature\",\"type\":\"string\",\"esTypes\":[\"text\"],\"count\":0,\"scripted\":false,\"searchable\":true,\"aggregatable\":false,\"readFromDocValues\":false},{\"name\":\"MiscFeature.keyword\",\"type\":\"string\",\"esTypes\":[\"keyword\"],\"count\":0,\"scripted\":false,\"searchable\":true,\"aggregatable\":true,\"readFromDocValues\":true,\"parent\":\"MiscFeature\",\"subType\":\"multi\"},{\"name\":\"MiscVal\",\"type\":\"number\",\"esTypes\":[\"long\"],\"count\":0,\"scripted\":false,\"searchable\":true,\"aggregatable\":true,\"readFromDocValues\":true},{\"name\":\"MoSold\",\"type\":\"number\",\"esTypes\":[\"long\"],\"count\":0,\"scripted\":false,\"searchable\":true,\"aggregatable\":true,\"readFromDocValues\":true},{\"name\":\"Neighborhood\",\"type\":\"string\",\"esTypes\":[\"text\"],\"count\":0,\"scripted\":false,\"searchable\":true,\"aggregatable\":false,\"readFromDocValues\":false},{\"name\":\"Neighborhood.keyword\",\"type\":\"string\",\"esTypes\":[\"keyword\"],\"count\":0,\"scripted\":false,\"searchable\":true,\"aggregatable\":true,\"readFromDocValues\":true,\"parent\":\"Neighborhood\",\"subType\":\"multi\"},{\"name\":\"OpenPorchSF\",\"type\":\"number\",\"esTypes\":[\"long\"],\"count\":0,\"scripted\":false,\"searchable\":true,\"aggregatable\":true,\"readFromDocValues\":true},{\"name\":\"OverallCond\",\"type\":\"number\",\"esTypes
\":[\"long\"],\"count\":0,\"scripted\":false,\"searchable\":true,\"aggregatable\":true,\"readFromDocValues\":true},{\"name\":\"OverallQual\",\"type\":\"number\",\"esTypes\":[\"long\"],\"count\":0,\"scripted\":false,\"searchable\":true,\"aggregatable\":true,\"readFromDocValues\":true},{\"name\":\"PavedDrive\",\"type\":\"string\",\"esTypes\":[\"text\"],\"count\":0,\"scripted\":false,\"searchable\":true,\"aggregatable\":false,\"readFromDocValues\":false},{\"name\":\"PavedDrive.keyword\",\"type\":\"string\",\"esTypes\":[\"keyword\"],\"count\":0,\"scripted\":false,\"searchable\":true,\"aggregatable\":true,\"readFromDocValues\":true,\"parent\":\"PavedDrive\",\"subType\":\"multi\"},{\"name\":\"PoolArea\",\"type\":\"number\",\"esTypes\":[\"long\"],\"count\":0,\"scripted\":false,\"searchable\":true,\"aggregatable\":true,\"readFromDocValues\":true},{\"name\":\"RoofMatl\",\"type\":\"string\",\"esTypes\":[\"text\"],\"count\":0,\"scripted\":false,\"searchable\":true,\"aggregatable\":false,\"readFromDocValues\":false},{\"name\":\"RoofMatl.keyword\",\"type\":\"string\",\"esTypes\":[\"keyword\"],\"count\":0,\"scripted\":false,\"searchable\":true,\"aggregatable\":true,\"readFromDocValues\":true,\"parent\":\"RoofMatl\",\"subType\":\"multi\"},{\"name\":\"RoofStyle\",\"type\":\"string\",\"esTypes\":[\"text\"],\"count\":0,\"scripted\":false,\"searchable\":true,\"aggregatable\":false,\"readFromDocValues\":false},{\"name\":\"RoofStyle.keyword\",\"type\":\"string\",\"esTypes\":[\"keyword\"],\"count\":0,\"scripted\":false,\"searchable\":true,\"aggregatable\":true,\"readFromDocValues\":true,\"parent\":\"RoofStyle\",\"subType\":\"multi\"},{\"name\":\"SaleCondition\",\"type\":\"string\",\"esTypes\":[\"text\"],\"count\":0,\"scripted\":false,\"searchable\":true,\"aggregatable\":false,\"readFromDocValues\":false},{\"name\":\"SaleCondition.keyword\",\"type\":\"string\",\"esTypes\":[\"keyword\"],\"count\":0,\"scripted\":false,\"searchable\":true,\"aggregatable\":true,\"readFromDocValues\":true,\"pa
rent\":\"SaleCondition\",\"subType\":\"multi\"},{\"name\":\"SaleType\",\"type\":\"string\",\"esTypes\":[\"text\"],\"count\":0,\"scripted\":false,\"searchable\":true,\"aggregatable\":false,\"readFromDocValues\":false},{\"name\":\"SaleType.keyword\",\"type\":\"string\",\"esTypes\":[\"keyword\"],\"count\":0,\"scripted\":false,\"searchable\":true,\"aggregatable\":true,\"readFromDocValues\":true,\"parent\":\"SaleType\",\"subType\":\"multi\"},{\"name\":\"ScreenPorch\",\"type\":\"number\",\"esTypes\":[\"long\"],\"count\":0,\"scripted\":false,\"searchable\":true,\"aggregatable\":true,\"readFromDocValues\":true},{\"name\":\"SecondFlrSF\",\"type\":\"number\",\"esTypes\":[\"long\"],\"count\":0,\"scripted\":false,\"searchable\":true,\"aggregatable\":true,\"readFromDocValues\":true},{\"name\":\"Street\",\"type\":\"string\",\"esTypes\":[\"text\"],\"count\":0,\"scripted\":false,\"searchable\":true,\"aggregatable\":false,\"readFromDocValues\":false},{\"name\":\"Street.keyword\",\"type\":\"string\",\"esTypes\":[\"keyword\"],\"count\":0,\"scripted\":false,\"searchable\":true,\"aggregatable\":true,\"readFromDocValues\":true,\"parent\":\"Street\",\"subType\":\"multi\"},{\"name\":\"ThreeSsnPortch\",\"type\":\"number\",\"esTypes\":[\"long\"],\"count\":0,\"scripted\":false,\"searchable\":true,\"aggregatable\":true,\"readFromDocValues\":true},{\"name\":\"TotRmsAbvGrd\",\"type\":\"number\",\"esTypes\":[\"long\"],\"count\":0,\"scripted\":false,\"searchable\":true,\"aggregatable\":true,\"readFromDocValues\":true},{\"name\":\"TotalBsmtSF\",\"type\":\"number\",\"esTypes\":[\"long\"],\"count\":0,\"scripted\":false,\"searchable\":true,\"aggregatable\":true,\"readFromDocValues\":true},{\"name\":\"Utilities\",\"type\":\"string\",\"esTypes\":[\"text\"],\"count\":0,\"scripted\":false,\"searchable\":true,\"aggregatable\":false,\"readFromDocValues\":false},{\"name\":\"Utilities.keyword\",\"type\":\"string\",\"esTypes\":[\"keyword\"],\"count\":0,\"scripted\":false,\"searchable\":true,\"aggregatable\":tr
ue,\"readFromDocValues\":true,\"parent\":\"Utilities\",\"subType\":\"multi\"},{\"name\":\"WoodDeckSF\",\"type\":\"number\",\"esTypes\":[\"long\"],\"count\":0,\"scripted\":false,\"searchable\":true,\"aggregatable\":true,\"readFromDocValues\":true},{\"name\":\"YearBuilt\",\"type\":\"number\",\"esTypes\":[\"long\"],\"count\":0,\"scripted\":false,\"searchable\":true,\"aggregatable\":true,\"readFromDocValues\":true},{\"name\":\"YearRemodAdd\",\"type\":\"number\",\"esTypes\":[\"long\"],\"count\":0,\"scripted\":false,\"searchable\":true,\"aggregatable\":true,\"readFromDocValues\":true},{\"name\":\"YrSold\",\"type\":\"number\",\"esTypes\":[\"long\"],\"count\":0,\"scripted\":false,\"searchable\":true,\"aggregatable\":true,\"readFromDocValues\":true},{\"name\":\"_id\",\"type\":\"string\",\"esTypes\":[\"_id\"],\"count\":0,\"scripted\":false,\"searchable\":true,\"aggregatable\":true,\"readFromDocValues\":false},{\"name\":\"_index\",\"type\":\"string\",\"esTypes\":[\"_index\"],\"count\":0,\"scripted\":false,\"searchable\":true,\"aggregatable\":true,\"readFromDocValues\":false},{\"name\":\"_score\",\"type\":\"number\",\"count\":0,\"scripted\":false,\"searchable\":false,\"aggregatable\":false,\"readFromDocValues\":false},{\"name\":\"_source\",\"type\":\"_source\",\"esTypes\":[\"_source\"],\"count\":0,\"scripted\":false,\"searchable\":false,\"aggregatable\":false,\"readFromDocValues\":false},{\"name\":\"_type\",\"type\":\"string\",\"esTypes\":[\"_type\"],\"count\":0,\"scripted\":false,\"searchable\":true,\"aggregatable\":true,\"readFromDocValues\":false},{\"name\":\"host\",\"type\":\"string\",\"esTypes\":[\"text\"],\"count\":0,\"scripted\":false,\"searchable\":true,\"aggregatable\":false,\"readFromDocValues\":false},{\"name\":\"host.keyword\",\"type\":\"string\",\"esTypes\":[\"keyword\"],\"count\":0,\"scripted\":false,\"searchable\":true,\"aggregatable\":true,\"readFromDocValues\":true,\"parent\":\"host\",\"subType\":\"multi\"},{\"name\":\"message\",\"type\":\"string\",\"esTypes\":
[\"text\"],\"count\":0,\"scripted\":false,\"searchable\":true,\"aggregatable\":false,\"readFromDocValues\":false},{\"name\":\"message.keyword\",\"type\":\"string\",\"esTypes\":[\"keyword\"],\"count\":0,\"scripted\":false,\"searchable\":true,\"aggregatable\":true,\"readFromDocValues\":true,\"parent\":\"message\",\"subType\":\"multi\"},{\"name\":\"port\",\"type\":\"number\",\"esTypes\":[\"long\"],\"count\":0,\"scripted\":false,\"searchable\":true,\"aggregatable\":true,\"readFromDocValues\":true},{\"name\":\"tags\",\"type\":\"string\",\"esTypes\":[\"text\"],\"count\":0,\"scripted\":false,\"searchable\":true,\"aggregatable\":false,\"readFromDocValues\":false},{\"name\":\"tags.keyword\",\"type\":\"string\",\"esTypes\":[\"keyword\"],\"count\":0,\"scripted\":false,\"searchable\":true,\"aggregatable\":true,\"readFromDocValues\":true,\"parent\":\"tags\",\"subType\":\"multi\"},{\"name\":\"type\",\"type\":\"string\",\"esTypes\":[\"text\"],\"count\":0,\"scripted\":false,\"searchable\":true,\"aggregatable\":false,\"readFromDocValues\":false},{\"name\":\"type.keyword\",\"type\":\"string\",\"esTypes\":[\"keyword\"],\"count\":0,\"scripted\":false,\"searchable\":true,\"aggregatable\":true,\"readFromDocValues\":true,\"parent\":\"type\",\"subType\":\"multi\"}]","timeFieldName":"@timestamp","title":"input*"},"id":"61d12f10-4b74-11ea-a505-e57bbdfb6038","migrationVersion":{"index-pattern":"6.5.0"},"references":[],"type":"index-pattern","updated_at":"2020-02-15T08:38:19.009Z","version":"WzExLDJd"} +{"attributes":{"description":"","kibanaSavedObjectMeta":{"searchSourceJSON":"{\"query\":{\"query\":\"\",\"language\":\"kuery\"},\"filter\":[],\"indexRefName\":\"kibanaSavedObjectMeta.searchSourceJSON.index\"}"},"title":"BsmtQual Pie Chart","uiStateJSON":"{\"vis\":{\"legendOpen\":true}}","version":1,"visState":"{\"title\":\"BsmtQual Pie 
Chart\",\"type\":\"pie\",\"params\":{\"type\":\"pie\",\"addTooltip\":true,\"addLegend\":true,\"legendPosition\":\"right\",\"isDonut\":true,\"labels\":{\"show\":false,\"values\":true,\"last_level\":true,\"truncate\":100},\"dimensions\":{\"metric\":{\"accessor\":0,\"format\":{\"id\":\"number\"},\"params\":{},\"aggType\":\"count\"}}},\"aggs\":[{\"id\":\"1\",\"enabled\":true,\"type\":\"count\",\"schema\":\"metric\",\"params\":{}},{\"id\":\"2\",\"enabled\":true,\"type\":\"terms\",\"schema\":\"segment\",\"params\":{\"field\":\"BsmtQual.keyword\",\"orderBy\":\"1\",\"order\":\"desc\",\"size\":5,\"otherBucket\":false,\"otherBucketLabel\":\"Other\",\"missingBucket\":false,\"missingBucketLabel\":\"Missing\"}}]}"},"id":"1d3afa50-4b76-11ea-a505-e57bbdfb6038","migrationVersion":{"visualization":"7.4.2"},"references":[{"id":"61d12f10-4b74-11ea-a505-e57bbdfb6038","name":"kibanaSavedObjectMeta.searchSourceJSON.index","type":"index-pattern"}],"type":"visualization","updated_at":"2020-02-15T08:38:19.009Z","version":"WzEyLDJd"} +{"attributes":{"description":"","kibanaSavedObjectMeta":{"searchSourceJSON":"{\"query\":{\"query\":\"\",\"language\":\"kuery\"},\"filter\":[],\"indexRefName\":\"kibanaSavedObjectMeta.searchSourceJSON.index\"}"},"title":"LotArea Line Graph","uiStateJSON":"{}","version":1,"visState":"{\"title\":\"LotArea Line 
Graph\",\"type\":\"histogram\",\"params\":{\"type\":\"histogram\",\"grid\":{\"categoryLines\":false},\"categoryAxes\":[{\"id\":\"CategoryAxis-1\",\"type\":\"category\",\"position\":\"bottom\",\"show\":true,\"style\":{},\"scale\":{\"type\":\"linear\"},\"labels\":{\"show\":true,\"filter\":true,\"truncate\":100},\"title\":{}}],\"valueAxes\":[{\"id\":\"ValueAxis-1\",\"name\":\"LeftAxis-1\",\"type\":\"value\",\"position\":\"left\",\"show\":true,\"style\":{},\"scale\":{\"type\":\"linear\",\"mode\":\"normal\"},\"labels\":{\"show\":true,\"rotate\":0,\"filter\":false,\"truncate\":100},\"title\":{\"text\":\"Count\"}}],\"seriesParams\":[{\"show\":true,\"type\":\"histogram\",\"mode\":\"stacked\",\"data\":{\"label\":\"Count\",\"id\":\"1\"},\"valueAxis\":\"ValueAxis-1\",\"drawLinesBetweenPoints\":true,\"lineWidth\":2,\"showCircles\":true}],\"addTooltip\":true,\"addLegend\":true,\"legendPosition\":\"right\",\"times\":[],\"addTimeMarker\":false,\"labels\":{\"show\":false},\"thresholdLine\":{\"show\":false,\"value\":10,\"width\":1,\"style\":\"full\",\"color\":\"#34130C\"},\"dimensions\":{\"x\":null,\"y\":[{\"accessor\":0,\"format\":{\"id\":\"number\"},\"params\":{},\"aggType\":\"count\"}]}},\"aggs\":[{\"id\":\"1\",\"enabled\":true,\"type\":\"count\",\"schema\":\"metric\",\"params\":{}},{\"id\":\"2\",\"enabled\":true,\"type\":\"range\",\"schema\":\"segment\",\"params\":{\"field\":\"LotArea\",\"ranges\":[{\"from\":0,\"to\":10000},{\"from\":10000,\"to\":20000},{\"from\":20000,\"to\":30000},{\"from\":40000,\"to\":50000},{\"from\":50000}]}}]}"},"id":"49eceef0-4b76-11ea-a505-e57bbdfb6038","migrationVersion":{"visualization":"7.4.2"},"references":[{"id":"61d12f10-4b74-11ea-a505-e57bbdfb6038","name":"kibanaSavedObjectMeta.searchSourceJSON.index","type":"index-pattern"}],"type":"visualization","updated_at":"2020-02-15T08:38:19.009Z","version":"WzEzLDJd"} 
+{"attributes":{"description":"","hits":0,"kibanaSavedObjectMeta":{"searchSourceJSON":"{\"query\":{\"language\":\"kuery\",\"query\":\"\"},\"filter\":[]}"},"optionsJSON":"{\"hidePanelTitles\":false,\"useMargins\":true}","panelsJSON":"[{\"embeddableConfig\":{},\"gridData\":{\"h\":15,\"i\":\"76c9e383-bdbb-4658-8795-118493af4ec3\",\"w\":24,\"x\":24,\"y\":0},\"panelIndex\":\"76c9e383-bdbb-4658-8795-118493af4ec3\",\"version\":\"7.5.1\",\"panelRefName\":\"panel_0\"},{\"embeddableConfig\":{},\"gridData\":{\"h\":15,\"i\":\"17e4efa7-9495-4df4-b7cf-1c3900042d8d\",\"w\":24,\"x\":0,\"y\":0},\"panelIndex\":\"17e4efa7-9495-4df4-b7cf-1c3900042d8d\",\"version\":\"7.5.1\",\"panelRefName\":\"panel_1\"}]","timeRestore":false,"title":"Example Inputs Dashboard","version":1},"id":"2daf01d0-4fcf-11ea-bad8-6dbf60384395","migrationVersion":{"dashboard":"7.3.0"},"references":[{"id":"1d3afa50-4b76-11ea-a505-e57bbdfb6038","name":"panel_0","type":"visualization"},{"id":"49eceef0-4b76-11ea-a505-e57bbdfb6038","name":"panel_1","type":"visualization"}],"type":"dashboard","updated_at":"2020-02-15T08:43:03.149Z","version":"WzIyLDJd"} +{"exportedCount":4,"missingRefCount":0,"missingReferences":[]} \ No newline at end of file diff --git a/packages/ml_api/docker/logstash/config/logstash.yml b/packages/ml_api/docker/logstash/config/logstash.yml new file mode 100644 index 0000000..9f69eac --- /dev/null +++ b/packages/ml_api/docker/logstash/config/logstash.yml @@ -0,0 +1,11 @@ +--- +## Default Logstash configuration from Logstash base image. 
+## https://github.com/elastic/logstash/blob/master/docker/data/logstash/config/logstash-full.yml +# +http.host: "0.0.0.0" +xpack.monitoring.elasticsearch.hosts: [ "http://elasticsearch:9200" ] + +## X-Pack security credentials +xpack.monitoring.enabled: true +xpack.monitoring.elasticsearch.username: elastic +xpack.monitoring.elasticsearch.password: changeme \ No newline at end of file diff --git a/packages/ml_api/docker/logstash/pipeline/logstash.conf b/packages/ml_api/docker/logstash/pipeline/logstash.conf new file mode 100644 index 0000000..8cb6770 --- /dev/null +++ b/packages/ml_api/docker/logstash/pipeline/logstash.conf @@ -0,0 +1,26 @@ +input { + tcp { + port => 5001 + tags => ["webapp_logs"] + type => "webapp_logs" + codec => json + } +} + +output { + if [LotArea] { + elasticsearch { + hosts => "elasticsearch:9200" + user => "elastic" + password => "changeme" + index => "input_logs-%{+YYYY.MM.dd}" + } + } else { + elasticsearch { + hosts => "elasticsearch:9200" + user => "elastic" + password => "changeme" + index => "webapp_logs-%{+YYYY.MM.dd}" + } + } +} \ No newline at end of file diff --git a/packages/ml_api/docker/workaround_32_os/Dockerfile.workaround b/packages/ml_api/docker/workaround_32_os/Dockerfile.workaround new file mode 100644 index 0000000..ca8c7e9 --- /dev/null +++ b/packages/ml_api/docker/workaround_32_os/Dockerfile.workaround @@ -0,0 +1,20 @@ +FROM python:3.7.5-slim-buster + +RUN mkdir -p /opt/app +COPY requirements /opt/app/requirements +RUN pip install --upgrade pip +RUN pip install tox + +# ensure we can run the make commands +RUN apt-get update -y && \ + apt-get install -y make && \ + apt-get install -y libffi-dev gcc && \ + # for swagger + apt-get install -y curl + +RUN pip install -r /opt/app/requirements/test_requirements.txt +COPY tests /opt/app/tests +COPY tox.ini /opt/app/tox.ini +COPY api /opt/app/api +COPY run.py /opt/app/run.py +WORKDIR /opt/app diff --git a/packages/ml_api/docker/workaround_32_os/docker-compose-workaround.yml 
b/packages/ml_api/docker/workaround_32_os/docker-compose-workaround.yml new file mode 100644 index 0000000..171d4e2 --- /dev/null +++ b/packages/ml_api/docker/workaround_32_os/docker-compose-workaround.yml @@ -0,0 +1,13 @@ +# This is only to be used as a workaround for students who +# are unable to install the gradient_boosting_model package +# because they are on a 32 bit operating system + +version: '3' +services: + ml_api: + build: + context: ../../ + dockerfile: docker/workaround_32_os/Dockerfile.workaround + ports: + - "5000:5000" + command: bash -c "tox -e integration_tests" diff --git a/packages/ml_api/gunicorn_logging.conf b/packages/ml_api/gunicorn_logging.conf new file mode 100644 index 0000000..f4a2bdf --- /dev/null +++ b/packages/ml_api/gunicorn_logging.conf @@ -0,0 +1,49 @@ +[loggers] +keys=root, mlapi, logstash.error, logstash.access + +[handlers] +keys=console, logstash + +[formatters] +keys=generic, json + +[logger_root] +level=INFO +handlers=console +propagate=1 + +[logger_mlapi] +level=INFO +handlers=console,logstash +propagate=0 +qualname=mlapi + +[logger_logstash.error] +level=INFO +handlers=logstash +propagate=1 +qualname=gunicorn.error + +[logger_logstash.access] +level=INFO +handlers=logstash +propagate=0 +qualname=gunicorn.access + +[handler_console] +class=StreamHandler +formatter=generic +args=(sys.stdout, ) + +[handler_logstash] +class=logstash.TCPLogstashHandler +formatter=json +args=('logstash', 5001) + +[formatter_generic] +format=%(asctime)s [%(process)d] [%(levelname)s] %(message)s +datefmt=%Y-%m-%d %H:%M:%S +class=logging.Formatter + +[formatter_json] +class=pythonjsonlogger.jsonlogger.JsonFormatter diff --git a/packages/ml_api/mypy.ini b/packages/ml_api/mypy.ini new file mode 100644 index 0000000..97e52a5 --- /dev/null +++ b/packages/ml_api/mypy.ini @@ -0,0 +1,11 @@ +[mypy] +warn_unused_ignores = True +follow_imports = skip +show_error_context = True +warn_incomplete_stub = True +ignore_missing_imports = True +check_untyped_defs = 
True +cache_dir = /dev/null +warn_redundant_casts = True +warn_unused_configs = True +strict_optional = True diff --git a/packages/ml_api/requirements/requirements.txt b/packages/ml_api/requirements/requirements.txt new file mode 100644 index 0000000..2cab5cd --- /dev/null +++ b/packages/ml_api/requirements/requirements.txt @@ -0,0 +1,30 @@ +# ML Model +tid-gradient-boosting-model>=0.1.18,<0.2.0 + +# Old model +tid-regression-model>=2.0.20,<2.1.0 + +# Web microframework for the API +flask>=1.1.1,<1.2.0 +connexion[swagger-ui]>=2.5.1,<2.6.0 + +# repo maintenance tooling +black>=19.10b0,<20.0 +flake8>=3.7.9,<4.0 +mypy>=0.740 + +# Persistence +sqlalchemy>=1.3.11,<1.4.0 # ORM +psycopg2>=2.8.4,<2.9.0 # DB Driver +alembic>=1.3.1,<1.4.0 # DB Migrations +sqlalchemy_utils>=0.36.0,<0.37.0 # DB Utils + +# Metrics +prometheus_client>=0.7.1,<0.8.0 + +# Logging +python3-logstash>=0.4.80,<0.5.0 +python-json-logger>=0.1.11,<0.2.0 + +# Deployment +gunicorn>=20.0.4,<20.1.0 diff --git a/packages/ml_api/requirements/test_requirements.txt b/packages/ml_api/requirements/test_requirements.txt new file mode 100644 index 0000000..c909f64 --- /dev/null +++ b/packages/ml_api/requirements/test_requirements.txt @@ -0,0 +1,14 @@ +-r requirements.txt + +# testing requirements +pytest>=5.3.2,<6.0.0 +requests>=2.22.0,<2.23.0 + +# repo maintenance tooling +black>=19.10b0,<20.0 +flake8>=3.7.9,<4.0 +mypy>=0.740 + +# diff test tooling +termcolor==1.1.0 +yarl==1.3.0 \ No newline at end of file diff --git a/packages/ml_api/run.py b/packages/ml_api/run.py new file mode 100644 index 0000000..5bb5e0a --- /dev/null +++ b/packages/ml_api/run.py @@ -0,0 +1,19 @@ +import prometheus_client +from werkzeug.middleware.dispatcher import DispatcherMiddleware + +from api.app import create_app +from api.config import DevelopmentConfig, setup_app_logging + +_config = DevelopmentConfig() + +# setup logging as early as possible +setup_app_logging(config=_config) +main_app = create_app(config_object=_config).app 
+application = DispatcherMiddleware( + app=main_app.wsgi_app, + mounts={'/metrics': prometheus_client.make_wsgi_app()} + ) + + +if __name__ == "__main__": + main_app.run(port=_config.SERVER_PORT, host=_config.SERVER_HOST) diff --git a/packages/ml_api/scripts/differential_tests.sh b/packages/ml_api/scripts/differential_tests.sh new file mode 100755 index 0000000..98c8a91 --- /dev/null +++ b/packages/ml_api/scripts/differential_tests.sh @@ -0,0 +1,58 @@ +#!/bin/bash + +set -euox pipefail + +MODEL_VERSION="master" +MODEL_VARIANT="candidate" +NUMBER_OF_TESTS="50" + +CANDIDATE_MODEL_SHA="$(git rev-parse HEAD)" + +# required once only (or whenever you make local changes): +# comment these two lines out otherwise as they can take some time. +make tag-push-local + +# should only be run once a model version has been finalized +# best practice is to run as part of a CI pipeline on merge to master branch. +make tag-push-master + +## Pull latest published image +env TARGET=master docker-compose --file docker/docker-compose.yml pull + +# start latest (master) image and local image +env TARGET=master SERVER_PORT=5000 docker-compose --project-name master --file docker/docker-compose-ci-master.yml up --no-recreate -d ml_api +env TARGET=$CANDIDATE_MODEL_SHA SERVER_PORT=5001 docker-compose --project-name head --file docker/docker-compose-ci-candidate.yml up --no-recreate -d ml_api + +## Start the test runner containers +env TARGET=master docker-compose --project-name master --file docker/docker-compose-ci-master.yml run -d --name differential-tests-expected differential-tests sleep infinity +env TARGET=$CANDIDATE_MODEL_SHA docker-compose --project-name head --file docker/docker-compose-ci-candidate.yml run -d --name differential-tests-actual differential-tests sleep infinity + +docker ps --all + +echo "===== Running $CANDIDATE_MODEL_SHA ... =====" + +## Compute the actual predictions (i.e. 
candidate model) +docker exec --user root differential-tests-actual \ + python3 differential_tests compute sample_payloads differential_tests/actual_results --base-url http://head_ml_api_1:5001 + +## Copy the actual predictions +docker cp differential-tests-actual:/opt/app/differential_tests/actual_results/. differential_tests/actual_results + +echo "===== Running master ... =====" +## Compute the expected marginals (i.e. existing model) +docker exec --user root differential-tests-expected \ + python3 differential_tests compute sample_payloads differential_tests/expected_results --base-url http://master_ml_api_1:5000 + +## Copy the expected marginals +docker cp differential-tests-expected:/opt/app/differential_tests/expected_results/. differential_tests/expected_results + +# then copy all results into the differential-tests-actual container for comparison +docker cp differential_tests/expected_results/. differential-tests-actual:/opt/app/differential_tests/expected_results + +echo "===== Comparing $CANDIDATE_MODEL_SHA vs. master ... 
=====" +## Compare the expected and actual marginals +docker exec differential-tests-actual \ + python3 -m differential_tests compare differential_tests/expected_results differential_tests/actual_results + +# clear any docker containers (will stop the script if no containers found) +docker rm $(docker ps -a -q) -f diff --git a/packages/ml_api/scripts/populate_database.py b/packages/ml_api/scripts/populate_database.py new file mode 100644 index 0000000..1a40323 --- /dev/null +++ b/packages/ml_api/scripts/populate_database.py @@ -0,0 +1,121 @@ +import argparse +import os +import time +import typing as t +from random import randint, choice + +import pandas as pd +import requests +from gradient_boosting_model.config.core import config +from gradient_boosting_model.processing.data_management import load_dataset + +LOCAL_URL = f'http://{os.getenv("DB_HOST", "localhost")}:5000' + +HEADERS = {"Accept": "application/json", "Content-Type": "application/json"} + +LOT_AREA_MAP = {"min": 1470, "max": 56600} + +FIRST_FLR_SF_MAP = {"min": 407, "max": 5095} + +SECOND_FLR_SF_MAP = {"min": 0, "max": 1862} + +BSMT_QUAL_VALUES = ('Gd', 'TA', 'Ex', 'Fa') + + +def _generate_random_int(value: int, value_ranges: t.Mapping) -> int: + """Generate random integer within a min and max range.""" + random_value = randint(value_ranges["min"], value_ranges["max"]) + return int(random_value) + + +def _select_random_category(value: str, value_options: t.Sequence) -> str: + """Select random category given a sequence of categories.""" + random_category = choice(value_options) + return random_category + + +def _prepare_inputs(dataframe: pd.DataFrame) -> pd.DataFrame: + """Prepare input data by removing key rows with NA values.""" + clean_inputs_df = dataframe.dropna( + subset=config.model_config.features + ["KitchenQual", "LotFrontage"] + ).copy() + + clean_inputs_df.loc[:, "FirstFlrSF"] = clean_inputs_df["FirstFlrSF"].apply( + _generate_random_int, value_ranges=FIRST_FLR_SF_MAP + ) + 
def populate_database(n_predictions: int = 500, anomaly: bool = False) -> None:
    """
    Manipulate the test data to generate random
    predictions and save them to the database.
    Before running this script, ensure that the
    API and Database docker containers are running.

    :param n_predictions: maximum number of rows to POST, one request per
        row. Fewer are sent when the cleaned dataset is smaller.
    :param anomaly: when true, overwrite several numeric features with 1
        and send a single outlier request instead of ``n_predictions``.
    :raises requests.HTTPError: if the API answers any request with an
        error status (via ``raise_for_status``).
    """

    print(f"Preparing to generate: {n_predictions} predictions.")

    # Load the gradient boosting test dataset which
    # is included in the model package
    test_inputs_df = load_dataset(file_name="test.csv")
    clean_inputs_df = _prepare_inputs(dataframe=test_inputs_df)
    if len(clean_inputs_df) < n_predictions:
        print(
            f"If you want {n_predictions} predictions, you need to "
            "extend the script to handle more predictions."
        )

    if anomaly:
        # set extremely low values to generate an outlier
        n_predictions = 1
        clean_inputs_df.loc[:, "FirstFlrSF"] = 1
        clean_inputs_df.loc[:, "LotArea"] = 1
        clean_inputs_df.loc[:, "OverallQual"] = 1
        clean_inputs_df.loc[:, "GrLivArea"] = 1

    clean_inputs_df = clean_inputs_df.where(pd.notnull(clean_inputs_df), None)

    # Limit by row position, not by index label: after dropna() the labels
    # have gaps, so comparing a label against n_predictions miscounts and
    # (with ``>``) lets one extra request through.
    rows_to_send = clean_inputs_df.head(max(n_predictions, 0))

    sent = 0
    for _, data in rows_to_send.iterrows():
        response = requests.post(
            f"{LOCAL_URL}/v1/predictions/regression",
            headers=HEADERS,
            json=[data.to_dict()],
        )
        response.raise_for_status()
        sent += 1

        if sent % 50 == 0:
            print(f"{sent} predictions complete")

        # prevent overloading the server
        time.sleep(0.5)

    if anomaly and sent:
        print('Created 1 anomaly')

    print("Prediction generation complete.")


if __name__ == "__main__":
    parser = argparse.ArgumentParser(
        description='Send random requests to House Price API.')
    # nargs='?' keeps the historical ``--anomaly <value>`` form working and
    # also accepts the bare ``--anomaly`` flag; explicit "false"-like values
    # no longer switch anomaly mode on.
    parser.add_argument(
        '--anomaly',
        nargs='?',
        const=True,
        default=False,
        type=lambda raw: raw.strip().lower() not in ("", "0", "false", "no"),
        help="generate unusual inputs",
    )
    args = parser.parse_args()
    anomaly = bool(args.anomaly)
    if anomaly:
        print("Generating unusual inputs")

    populate_database(n_predictions=500, anomaly=anomaly)
@pytest.mark.integration
@pytest.mark.parametrize(
    "api_endpoint, expected_no_predictions",
    (
        (
            "v1/predictions/regression",
            # test csv contains 1459 rows
            # we expect 8 rows to be filtered
            1451,
        ),
        (
            "v1/predictions/gradient",
            # test csv contains 1459 rows
            # we expect 2 rows to be filtered
            1457,
        ),
    ),
)
def test_prediction_endpoint(
    api_endpoint, expected_no_predictions, client, test_inputs_df
):
    """POST the full test dataset to each prediction endpoint.

    Checks that the endpoint answers 200 with no errors and returns the
    expected number of predictions once validation has dropped rows.
    """
    # Given
    # Load the test dataset which is included in the model package
    # NOTE(review): the `test_inputs_df` fixture value is replaced by this
    # fresh load, so one of the two loads is redundant - confirm which to keep.
    test_inputs_df = load_dataset(file_name="test.csv")  # dataframe
    if api_endpoint == "v1/predictions/regression":
        # adjust column names to those expected by the secondary model
        test_inputs_df = test_inputs_df.rename(columns=SECONDARY_VARIABLES_TO_RENAME)

    # When
    response = client.post(api_endpoint, json=test_inputs_df.to_dict(orient="records"))

    # Then
    assert response.status_code == 200
    data = json.loads(response.data)
    assert data["errors"] is None
    assert len(data["predictions"]) == expected_no_predictions
@pytest.mark.integration
def test_prediction_data_saved(client, app, test_inputs_df):
    """A regression request persists one row per model table.

    The lasso row is checked immediately after the response. The gradient
    boosting row is written asynchronously, so the test polls for it
    instead of sleeping for a fixed time.
    """
    # Given
    initial_gradient_count = app.db_session.query(
        GradientBoostingModelPredictions
    ).count()
    initial_lasso_count = app.db_session.query(LassoModelPredictions).count()

    # When
    response = client.post(
        "/v1/predictions/regression", json=test_inputs_df.to_dict(orient="records")
    )

    # Then
    assert response.status_code == 200
    assert (
        app.db_session.query(LassoModelPredictions).count() == initial_lasso_count + 1
    )

    # The gradient prediction save occurs on a separate async thread which can take
    # time to complete. Poll until the row shows up (or a deadline passes) so the
    # test returns as soon as the save lands and tolerates a slow machine.
    expected_gradient_count = initial_gradient_count + 1
    deadline = time.monotonic() + 5
    while (
        app.db_session.query(GradientBoostingModelPredictions).count()
        < expected_gradient_count
        and time.monotonic() < deadline
    ):
        time.sleep(0.1)

    assert (
        app.db_session.query(GradientBoostingModelPredictions).count()
        == expected_gradient_count
    )
# parameterization allows us to try many combinations of data
# within the same test, see the pytest docs for details:
# https://docs.pytest.org/en/latest/parametrize.html
@pytest.mark.parametrize(
    "model_type, model",
    (
        (ModelType.GRADIENT_BOOSTING, GradientBoostingModelPredictions),
        (ModelType.LASSO, LassoModelPredictions),
    ),
)
def test_data_access(model_type, model, test_inputs_df):
    """Saving predictions adds exactly one row of the matching model class.

    The database session is a mock, so only the calls made on it are
    checked: one ``add`` with an instance of ``model`` and one ``commit``.
    """
    # Given
    # We mock the database session
    mock_session = mock.MagicMock()
    _persistence = PredictionPersistence(db_session=mock_session)

    # When
    _persistence.make_save_predictions(
        db_model=model_type, input_data=test_inputs_df.to_dict(orient="records")
    )

    # Then
    assert mock_session.commit.call_count == 1
    assert mock_session.add.call_count == 1
    assert isinstance(mock_session.add.call_args[0][0], model)
+ DB_USER={env:DB_USER:test_user} + DB_PASSWORD={env:DB_PASSWORD:password} + DB_HOST={env:DB_HOST:localhost} + DB_PORT={env:DB_PORT:6608} + DB_NAME={env:DB_NAME:ml_api_test} + SHADOW_MODE_ACTIVE={env:SHADOW_MODE_ACTIVE:true} + +commands = + pytest \ + -s \ + -vv \ + -m differential \ + {posargs:tests/} + + +[testenv:generate_predictions] +envdir = {toxworkdir}/generate_predictions +deps = + {[testenv]deps} + +passenv = + {[testenv]passenv} + +setenv = + PYTHONPATH=. + DB_HOST=localhost + +commands = python scripts/populate_database.py {posargs} + + +[testenv:typechecks] +envdir = {toxworkdir}/integration_tests + +deps = + {[testenv:integration_tests]deps} + +commands = {posargs:mypy api} + + +[testenv:stylechecks] +envdir = {toxworkdir}/integration_tests + +deps = + {[testenv:integration_tests]deps} + +commands = {posargs:flake8 api tests} + + +[flake8] +exclude = .git,env +max-line-length = 90 + + +[pytest] +markers = + integration: mark a test as an integration test. + differential: mark a test as a differential test. + +filterwarnings = + ignore::DeprecationWarning + ignore::RuntimeWarning + ignore::UserWarning + ignore::FutureWarning diff --git a/test.txt b/test.txt new file mode 100644 index 0000000..e69de29