Skip to content

Commit

Permalink
Merge pull request #24 from artefactory/hotfix/iam_bindings_on_SA_dep…
Browse files Browse the repository at this point in the history
…endency

QoL changes
  • Loading branch information
griseau authored Oct 13, 2020
2 parents 0472f14 + a3eb1f2 commit 0c59383
Show file tree
Hide file tree
Showing 11 changed files with 160 additions and 11 deletions.
6 changes: 6 additions & 0 deletions IaC/main.tf
Original file line number Diff line number Diff line change
Expand Up @@ -35,3 +35,9 @@ module "mlflow" {
network_self_link = module.network.network_self_link
network_short_name = module.network.network_short_name
}

# Service account and IAM roles for pushing logs/artifacts; applied only
# after the whole mlflow module has been created.
module "log_pusher" {
  depends_on = [module.mlflow]

  source     = "./modules/mlflow/log_pusher"
  project_id = var.project_id
}
1 change: 1 addition & 0 deletions IaC/modules/mlflow/artifacts/main.tf
Original file line number Diff line number Diff line change
Expand Up @@ -13,4 +13,5 @@ resource "google_storage_bucket" "this" {
type = "Delete"
}
}
uniform_bucket_level_access = var.storage_uniform
}
5 changes: 5 additions & 0 deletions IaC/modules/mlflow/artifacts/variables.tf
Original file line number Diff line number Diff line change
Expand Up @@ -26,3 +26,8 @@ variable "module_depends_on" {
type = any
default = null
}
# Forwarded to the bucket's `uniform_bucket_level_access` argument.
variable "storage_uniform" {
  type        = bool
  description = "Whether or not uniform bucket-level access is to be activated for the buckets"
  default     = true
}
16 changes: 16 additions & 0 deletions IaC/modules/mlflow/log_pusher/main.tf
Original file line number Diff line number Diff line change
@@ -0,0 +1,16 @@
# Dedicated service account for pushing logs and artifacts.
resource "google_service_account" "log_pusher" {
  # Pin the project explicitly so the account is created in the same project
  # the IAM bindings below target, rather than the provider's default project.
  project      = var.project_id
  account_id   = "mlflow-log-pusher"
  display_name = "mlflow log pusher"
}

# Grants the service account access to IAP-protected HTTPS resources.
resource "google_project_iam_member" "log_pusher_iap" {
  project = var.project_id
  role    = "roles/iap.httpsResourceAccessor"
  member  = "serviceAccount:${google_service_account.log_pusher.email}"
}

# Grants the service account permission to create storage objects
# (project-wide binding).
resource "google_project_iam_member" "log_pusher_storage" {
  project = var.project_id
  role    = "roles/storage.objectCreator"
  member  = "serviceAccount:${google_service_account.log_pusher.email}"
}
3 changes: 3 additions & 0 deletions IaC/modules/mlflow/log_pusher/variables.tf
Original file line number Diff line number Diff line change
@@ -0,0 +1,3 @@
variable "project_id" {
  type        = string
  description = "ID of the GCP project on which the log pusher IAM roles are granted"
}
23 changes: 17 additions & 6 deletions IaC/modules/mlflow/server/main.tf
Original file line number Diff line number Diff line change
Expand Up @@ -23,32 +23,43 @@ resource "google_app_engine_application" "app" {
}

# Lets the App Engine default service account act as a Cloud SQL client.
resource "google_project_iam_member" "cloudsql" {
  depends_on = [google_app_engine_application.app]
  project    = data.google_project.project.project_id
  role       = "roles/cloudsql.client"
  # The default service account is named after the project ID, which can
  # differ from the project's display name.
  member = format("serviceAccount:%s@appspot.gserviceaccount.com", data.google_project.project.project_id)
}

# Lets the App Engine default service account read Secret Manager secrets.
resource "google_project_iam_member" "secret" {
  depends_on = [google_app_engine_application.app]
  project    = data.google_project.project.project_id
  role       = "roles/secretmanager.secretAccessor"
  # The default service account is named after the project ID, which can
  # differ from the project's display name.
  member = format("serviceAccount:%s@appspot.gserviceaccount.com", data.google_project.project.project_id)
}

# Gives the App Engine flexible service agent object-admin rights on storage.
resource "google_project_iam_member" "gcs" {
  depends_on = [google_app_engine_application.app]
  project    = data.google_project.project.project_id
  role       = "roles/storage.objectAdmin"
  # The service agent address is keyed on the project *number*.
  member = format("serviceAccount:service-%s@gae-api-prod.google.com.iam.gserviceaccount.com", data.google_project.project.number)
}

# Lets the App Engine default service account read storage objects.
resource "google_project_iam_member" "gae_gcs" {
  depends_on = [google_app_engine_application.app]
  project    = data.google_project.project.project_id
  role       = "roles/storage.objectViewer"
  # The default service account is named after the project ID, which can
  # differ from the project's display name.
  member = format("serviceAccount:%s@appspot.gserviceaccount.com", data.google_project.project.project_id)
}

# Grants network-user rights so the App Engine identity can use the VPC.
resource "google_project_iam_member" "gae_api" {
  depends_on = [google_app_engine_application.app]
  project    = data.google_project.project.project_id
  role       = "roles/compute.networkUser"
  # NOTE(review): assumes the grantee is the App Engine default service
  # account (named after the project ID) — confirm against the deployment.
  member = format("serviceAccount:%s@appspot.gserviceaccount.com", data.google_project.project.project_id)
}

resource "google_app_engine_flexible_app_version" "myapp_v1" {
service = var.service
version_id = "v1"
version_id = "v0"
runtime = "custom"

deployment {
Expand All @@ -69,12 +80,13 @@ resource "google_app_engine_flexible_app_version" "myapp_v1" {

automatic_scaling {
cool_down_period = "120s"
max_total_instances = 1
min_total_instances = 1
max_total_instances = var.max_appengine_instances
min_total_instances = var.min_appengine_instances
cpu_utilization {
target_utilization = 0.5
}
}

resources {
cpu = 1
memory_gb = 2
Expand All @@ -88,7 +100,7 @@ resource "google_app_engine_flexible_app_version" "myapp_v1" {
}

noop_on_destroy = true
depends_on = [google_project_iam_member.gcs, google_project_iam_member.cloudsql, google_project_iam_member.secret, google_project_iam_member.gae_api]
depends_on = [google_project_iam_member.gcs, google_project_iam_member.gae_gcs, google_project_iam_member.cloudsql, google_project_iam_member.secret, google_project_iam_member.gae_api]
}

resource "google_iap_brand" "project_brand" {
Expand All @@ -106,5 +118,4 @@ resource "google_iap_app_engine_service_iam_binding" "member" {
service = google_app_engine_flexible_app_version.myapp_v1.service
role = "roles/iap.httpsResourceAccessor"
members = var.web_app_users
depends_on = [google_app_engine_flexible_app_version.myapp_v1]
}
}
14 changes: 13 additions & 1 deletion IaC/modules/mlflow/server/variables.tf
Original file line number Diff line number Diff line change
Expand Up @@ -58,4 +58,16 @@ variable "web_app_users" {
variable "service" {
  type        = string
  description = "Name of the App Engine service the application version is deployed to"
  default     = "default"
}
variable "network_short_name" {}
variable "network_short_name" {
type = string
}
# Upper bound fed to the app version's automatic scaling settings.
variable "max_appengine_instances" {
  type        = number
  default     = 1
  description = "The maximum number of app engine instances to scale up to"
}
# Lower bound fed to the app version's automatic scaling settings.
variable "min_appengine_instances" {
  type        = number
  default     = 1
  description = "The minimum number of app engine instances to scale down to"
}
3 changes: 3 additions & 0 deletions IaC/prerequesites/main.tf
Original file line number Diff line number Diff line change
Expand Up @@ -13,6 +13,7 @@ module "services" {
source = "./../modules/services"
project_id = var.project_id
services = [
"cloudresourcemanager.googleapis.com",
"container.googleapis.com",
"servicenetworking.googleapis.com",
"stackdriver.googleapis.com",
Expand All @@ -30,4 +31,6 @@ module "bucket_backend" {
bucket_location = var.backend_bucket_location
number_of_version = var.backend_bucket_number_of_version
storage_class = var.backend_bucket_storage_class
storage_uniform = var.storage_uniform
versioning_enabled = var.tfstate_versionning
}
10 changes: 10 additions & 0 deletions IaC/prerequesites/variables.tf
Original file line number Diff line number Diff line change
Expand Up @@ -25,4 +25,14 @@ variable "backend_bucket_storage_class" {
description = "Storage class of your bucket"
type = string
default ="STANDARD"
}
# Forwarded to the backend bucket module as `storage_uniform`.
variable "storage_uniform" {
  type        = bool
  description = "Whether or not uniform bucket-level access is to be activated for the buckets"
  default     = true
}
# Forwarded to the backend bucket module as `versioning_enabled`.
# The variable name keeps its original spelling so existing callers still work.
variable "tfstate_versionning" {
  type        = bool
  description = "Whether or not the remote TFstate should be versioned"
  default     = true
}
5 changes: 5 additions & 0 deletions IaC/variables.tf
Original file line number Diff line number Diff line change
Expand Up @@ -26,4 +26,9 @@ variable "web_app_users" {
# No default is declared, so a value must always be supplied.
variable "network_name" {
  description = "Name of the network to attach to. If empty, a new network will be created"
  type        = string
}
variable "storage_uniform" {
  type        = bool
  description = "Whether or not uniform bucket-level access is to be activated for the buckets"
  default     = true
}
85 changes: 81 additions & 4 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -4,25 +4,102 @@ A tool to deploy a mostly serverless MLflow on a GCP project with one command
## How to use

### Prerequisites
- A GCP project
- A GCP project on which you are owner
- Initialized gcloud SDK
- Docker engine running
- No app engine application running

### Deploying
Fill out the `vars` file.

|Variable name|Description|
|---|---|
|`TF_VAR_project_id`|Name of the GCP project|
|`TF_VAR_backend_bucket`|Name of the terraform backend bucket. Should be unique. No `gs://` prefix|
|`TF_VAR_backend_bucket`|Name of the terraform backend bucket. Should be globally unique. No `gs://` prefix|
|`TF_VAR_consent_screen_support_email`|Contact email address displayed by the SSO screen when the user trying to log in is not authorized. The address should be that of the user deploying mlflow (you) or a Cloud Identity group managed by this user|
|`TF_VAR_web_app_users`|List of authorized users/groups/domains. Should be a single-quoted list of strings such as '["user:jane@example.com", "group:people@example.com", "domain:example.com"]'|
|`TF_VAR_network_name`|The network the application and backend should attach to. If blank, a new network will be created.|
|`TF_VAR_network_name`|The network the application and backend should attach to. If left blank, a new network will be created.|

Run `make one-clic-mlflow` and follow the prompts.
**Run `make one-click-mlflow` and follow the prompts.**

### What it does
- Enables the necessary services
- Builds and pushes the MLFlow docker image
- Creates a private IP CloudSQL (MySQL) database for the tracking server
- Creates an AppEngine Flex service for the web UI, secured by IAP
- Manages all the network magic
- Creates the `mlflow-log-pusher` service account

### Other available make commands
- `make deploy`: builds and pushes the application image and (re)deploys the infrastructure
- `make docker`: builds and pushes the application image
- `make apply`: (re)deploys the infrastructure
- `make destroy`: destroys the infrastructure. **Will not delete the OAuth consent screen, and the app engine application**


### Pushing logs and artifacts

You will need to specify the project id hosting the tracking server and the name of your MLFlow experiment:
- `export PROJECT_ID=<my_mlflow_gcp_project>`
- `export EXPERIMENT_NAME=<my_experiment>`

You may also need to get a service account key for `mlflow-log-pusher` if you lack the necessary permissions:
- `export GOOGLE_APPLICATION_CREDENTIALS=secrets/<my_key.json>`

To be able to push logs and artifacts to the tracking server, you will need to authenticate your request.
Simply paste the following snippet in your `config.py` or `__init__.py`.

````python
import os

import six
from mlflow import set_tracking_uri, set_experiment
from google.auth.transport.requests import Request
from google.oauth2 import id_token
import requests


def _get_client_id(service_uri):
    """Return the OAuth client ID advertised by the redirect of ``service_uri``.

    A GET request is sent without following redirects. The endpoint is
    expected to answer with a 302 whose ``location`` header carries a
    ``client_id`` query parameter.

    Args:
        service_uri: URL of the endpoint to probe.

    Returns:
        The first ``client_id`` value found in the redirect location, or
        ``None`` (after printing a diagnostic) when the response is not a
        302, has no ``location`` header, or the location has no
        ``client_id`` parameter.
    """
    # Standard-library URL parsing; imported locally so the block does not
    # rely on ``six`` for this.
    from urllib.parse import parse_qs, urlparse

    # A timeout keeps an unreachable endpoint from blocking forever.
    redirect_response = requests.get(
        service_uri, allow_redirects=False, timeout=30
    )
    if redirect_response.status_code != 302:
        print(f"The URI {service_uri} does not seem to be a valid AppEngine endpoint.")
        return None

    redirect_location = redirect_response.headers.get("location")
    if not redirect_location:
        print(f"No redirect location for request to {service_uri}")
        return None

    # parse_qs maps each parameter to a list of values; a missing key is
    # reported like the other failure modes instead of raising KeyError.
    client_ids = parse_qs(urlparse(redirect_location).query).get("client_id")
    if not client_ids:
        print(f"No client_id in redirect location for request to {service_uri}")
        return None
    return client_ids[0]


# Both variables are mandatory: a missing one raises KeyError here.
PROJECT_ID = os.environ["PROJECT_ID"]
EXPERIMENT_NAME = os.environ["EXPERIMENT_NAME"]

# NOTE(review): the `ew.r` host segment is hard-coded — presumably a region
# code; adjust it if the app is hosted elsewhere.
tracking_uri = f"https://{PROJECT_ID}.ew.r.appspot.com/"
# NOTE(review): _get_client_id returns None on failure and that value is
# passed straight to fetch_id_token — confirm how that case should fail.
client_id = _get_client_id(tracking_uri)
open_id_connect_token = id_token.fetch_id_token(Request(), client_id)
# Publish the token via the environment — presumably read by the MLflow
# client to authenticate its requests; verify against the MLflow docs.
os.environ["MLFLOW_TRACKING_TOKEN"] = open_id_connect_token

set_tracking_uri(tracking_uri)
set_experiment(EXPERIMENT_NAME)
````

You should then be able to push logs and artifacts with:
```python
import os
import mlflow

# Log a parameter (key-value pair)
mlflow.log_param("param1", 42)

# Log a metric; metrics can be updated throughout the run
mlflow.log_metric("foo", 42 + 1)
mlflow.log_metric("foo", 42 + 2)
mlflow.log_metric("foo", 42 + 3)

# Log an artifact (output file)
# `log_artifact` uploads a single file; `log_artifacts` expects a directory.
mlflow.log_artifact("artifact_file_path")
```

0 comments on commit 0c59383

Please sign in to comment.