From 90562854f37a24e199118d7b9dc667cf7b12ce81 Mon Sep 17 00:00:00 2001 From: My Date: Thu, 18 Jun 2020 09:48:01 -0700 Subject: [PATCH] update arm template to make workspace sku configurable (#283) --- ...diabetes_regression-variables-template.yml | 17 +++--- docs/getting_started.md | 55 ++++++++++--------- .../arm-templates/cloud-environment.json | 20 ++++++- .../iac-create-environment-pipeline-arm.yml | 39 +++++++------ 4 files changed, 72 insertions(+), 59 deletions(-) diff --git a/.pipelines/diabetes_regression-variables-template.yml b/.pipelines/diabetes_regression-variables-template.yml index def14549..159f76af 100644 --- a/.pipelines/diabetes_regression-variables-template.yml +++ b/.pipelines/diabetes_regression-variables-template.yml @@ -1,8 +1,7 @@ # Pipeline template that defines common runtime environment variables. variables: - # Source Config - # The directory containing the scripts for training, evaluating, and registering the model + # The directory containing the scripts for training, evaluating, and registering the model - name: SOURCES_DIR_TRAIN value: diabetes_regression # The path to the model training script under SOURCES_DIR_TRAIN @@ -23,7 +22,7 @@ variables: value: mlopspython - name: DATASET_NAME value: diabetes_ds - # Uncomment DATASTORE_NAME if you have configured non default datastore to point to your data + # Uncomment DATASTORE_NAME if you have configured non default datastore to point to your data # - name: DATASTORE_NAME # value: datablobstore - name: DATASET_VERSION @@ -50,25 +49,23 @@ variables: # The name for the (docker/webapp) scoring image - name: IMAGE_NAME value: "diabetestrained" - + # Optional. Used by a training pipeline with R on Databricks - name: DB_CLUSTER_ID value: "" # These are the default values set in ml_service\util\env_variables.py. Uncomment and override if desired. - # Set to false to disable the evaluation step in the ML pipeline and register the newly trained model unconditionally. + # Set to false to disable the evaluation step in the ML pipeline and register the newly trained model unconditionally. # - name: RUN_EVALUATION # value: "true" - # Set to false to register the model regardless of the outcome of the evaluation step in the ML pipeline. + # Set to false to register the model regardless of the outcome of the evaluation step in the ML pipeline. # - name: ALLOW_RUN_CANCEL # value: "true" - # For debugging deployment issues. Specify a build id with the MODEL_BUILD_ID pipeline variable at queue time - # to skip training and deploy a model registered by a previous build. + # For debugging deployment issues. Specify a build id with the MODEL_BUILD_ID pipeline variable at queue time + # to skip training and deploy a model registered by a previous build. - name: modelbuildid value: $[coalesce(variables['MODEL_BUILD_ID'], variables['Build.BuildId'])] - - # Flag to allow rebuilding the AML Environment after it was built for the first time. This enables dependency updates from conda_dependencies.yaml. # - name: AML_REBUILD_ENVIRONMENT # value: "false" diff --git a/docs/getting_started.md b/docs/getting_started.md index 3dc67ab3..89981fa3 100644 --- a/docs/getting_started.md +++ b/docs/getting_started.md @@ -1,9 +1,8 @@ - # Getting Started with MLOpsPython -This guide shows how to get MLOpsPython working with a sample ML project ***diabetes_regression***. The project creates a linear regression model to predict diabetes and has CI/CD DevOps practices enabled for model training and serving when these steps are completed in this getting started guide. +This guide shows how to get MLOpsPython working with a sample ML project **_diabetes_regression_**. The project creates a linear regression model to predict diabetes and has CI/CD DevOps practices enabled for model training and serving when these steps are completed in this getting started guide. -If you would like to bring your own model code to use this template structure, follow the [custom model](custom_model.md) guide. We recommend completing this getting started guide with the diabetes model through ACI deployment first to ensure everything is working in your environment before converting the template to use your own model code. +If you would like to bring your own model code to use this template structure, follow the [custom model](custom_model.md) guide. We recommend completing this getting started guide with the diabetes model through ACI deployment first to ensure everything is working in your environment before converting the template to use your own model code. - [Setting up Azure DevOps](#setting-up-azure-devops) - [Install the Azure Machine Learning extension](#install-the-azure-machine-learning-extension) @@ -46,13 +45,13 @@ We recommend using the [repository template](https://github.com/microsoft/MLOpsP ## Create a Variable Group for your Pipeline -MLOpsPython requires some variables to be set before you can run any pipelines. You'll need to create a *variable group* in Azure DevOps to store values that are reused across multiple pipelines or pipeline stages. Either store the values directly in [Azure DevOps](https://docs.microsoft.com/en-us/azure/devops/pipelines/library/variable-groups?view=azure-devops&tabs=designer#create-a-variable-group) or connect to an Azure Key Vault in your subscription. Check out the [Add & use variable groups](https://docs.microsoft.com/en-us/azure/devops/pipelines/library/variable-groups?view=azure-devops&tabs=yaml#use-a-variable-group) documentation to learn more about how to create a variable group and link it to your pipeline. +MLOpsPython requires some variables to be set before you can run any pipelines. You'll need to create a _variable group_ in Azure DevOps to store values that are reused across multiple pipelines or pipeline stages. Either store the values directly in [Azure DevOps](https://docs.microsoft.com/en-us/azure/devops/pipelines/library/variable-groups?view=azure-devops&tabs=designer#create-a-variable-group) or connect to an Azure Key Vault in your subscription. Check out the [Add & use variable groups](https://docs.microsoft.com/en-us/azure/devops/pipelines/library/variable-groups?view=azure-devops&tabs=yaml#use-a-variable-group) documentation to learn more about how to create a variable group and link it to your pipeline. Navigate to **Library** in the **Pipelines** section as indicated below: ![Library Variable Groups](./images/library_variable_groups.png) -Create a variable group named **``devopsforai-aml-vg``**. The YAML pipeline definitions in this repository refer to this variable group by name. +Create a variable group named **`devopsforai-aml-vg`**. The YAML pipeline definitions in this repository refer to this variable group by name. The variable group should contain the following required variables. **Azure resources that don't exist yet will be created in the [Provisioning resources using Azure Pipelines](#provisioning-resources-using-azure-pipelines) step below.** @@ -80,7 +79,7 @@ More variables are available for further tweaking, but the above variables are a **WORKSPACE_NAME** is used for creating the Azure Machine Learning Workspace. You can provide an existing Azure ML Workspace here if you've got one. -**AZURE_RM_SVC_CONNECTION** is used by the [Azure Pipeline]((../environment_setup/iac-create-environment-pipeline.yml)) in Azure DevOps that creates the Azure ML workspace and associated resources through Azure Resource Manager. You'll create the connection in a [step below](#create-an-azure-devops-service-connection-for-the-azure-resource-manager). +**AZURE_RM_SVC_CONNECTION** is used by the [Azure Pipeline](../environment_setup/iac-create-environment-pipeline.yml) in Azure DevOps that creates the Azure ML workspace and associated resources through Azure Resource Manager. You'll create the connection in a [step below](#create-an-azure-devops-service-connection-for-the-azure-resource-manager). **WORKSPACE_SVC_CONNECTION** is used to reference a [service connection for the Azure ML workspace](#create-an-azure-devops-azure-ml-workspace-service-connection). You'll create the connection after [provisioning the workspace](#provisioning-resources-using-azure-pipelines) in the [Create an Azure DevOps Service Connection for the Azure ML Workspace](#create-an-azure-devops-service-connection-for-the-azure-ml-workspace) section below. @@ -92,11 +91,11 @@ The easiest way to create all required Azure resources (Resource Group, Azure ML ### Create an Azure DevOps Service Connection for the Azure Resource Manager -The [IaC provisioning pipeline]((../environment_setup/iac-create-environment-pipeline.yml)) requires an **Azure Resource Manager** [service connection](https://docs.microsoft.com/en-us/azure/devops/pipelines/library/service-endpoints?view=azure-devops&tabs=yaml#create-a-service-connection). +The [IaC provisioning pipeline](../environment_setup/iac-create-environment-pipeline.yml) requires an **Azure Resource Manager** [service connection](https://docs.microsoft.com/en-us/azure/devops/pipelines/library/service-endpoints?view=azure-devops&tabs=yaml#create-a-service-connection). ![Create service connection](./images/create-rm-service-connection.png) -Leave the **``Resource Group``** field empty. +Leave the **`Resource Group`** field empty. **Note:** Creating the Azure Resource Manager service connection scope requires 'Owner' or 'User Access Administrator' permissions on the subscription. You'll also need sufficient permissions to register an application with your Azure AD tenant, or you can get the ID and secret of a service principal from your Azure AD Administrator. That principal must have 'Contributor' permissions on the subscription. @@ -143,7 +142,7 @@ Now that you've provisioned all the required Azure resources and service connect ### Set up the Pipeline -In your Azure DevOps project, create and run a new build pipeline based on the [diabetes_regression-ci.yml](../.pipelines/diabetes_regression-ci.yml) +In your Azure DevOps project, create and run a new build pipeline based on the [diabetes_regression-ci.yml](../.pipelines/diabetes_regression-ci.yml) pipeline definition in your forked repository. ![Configure CI build pipeline](./images/ci-build-pipeline-configure.png) @@ -164,15 +163,15 @@ The pipeline stages are summarized below: - Linting (code quality analysis) - Unit tests and code coverage analysis -- Build and publish *ML Training Pipeline* in an *ML Workspace* +- Build and publish _ML Training Pipeline_ in an _ML Workspace_ #### Train model -- Determine the ID of the *ML Training Pipeline* published in the previous stage. -- Trigger the *ML Training Pipeline* and waits for it to complete. +- Determine the ID of the _ML Training Pipeline_ published in the previous stage. +- Trigger the _ML Training Pipeline_ and waits for it to complete. - This is an **agentless** job. The CI pipeline can wait for ML pipeline completion for hours or even days without using agent resources. -- Determine if a new model was registered by the *ML Training Pipeline*. - - If the model evaluation determines that the new model doesn't perform any better than the previous one, the new model won't register and the *ML Training Pipeline* will be **canceled**. In this case, you'll see a message in the 'Train Model' job under the 'Determine if evaluation succeeded and new model is registered' step saying '**Model was not registered for this run.**' +- Determine if a new model was registered by the _ML Training Pipeline_. + - If the model evaluation determines that the new model doesn't perform any better than the previous one, the new model won't register and the _ML Training Pipeline_ will be **canceled**. In this case, you'll see a message in the 'Train Model' job under the 'Determine if evaluation succeeded and new model is registered' step saying '**Model was not registered for this run.**' - See [evaluate_model.py](../diabetes_regression/evaluate/evaluate_model.py#L118) for the evaluation logic and [diabetes_regression_verify_train_pipeline.py](../ml_service/pipelines/diabetes_regression_verify_train_pipeline.py#L54) for the ML pipeline reporting logic. - [Additional Variables and Configuration](#additional-variables-and-configuration) for configuring this and other behavior. @@ -182,13 +181,13 @@ The pipeline stages are summarized below: - Smoke test - The test sends a sample query to the scoring web service and verifies that it returns the expected response. Have a look at the [smoke test code](../ml_service/util/smoke_test_scoring_service.py) for an example. -The pipeline uses a Docker container on the Azure Pipelines agents to accomplish the pipeline steps. The container image ***mcr.microsoft.com/mlops/python:latest*** is built with [this Dockerfile](../environment_setup/Dockerfile) and has all the necessary dependencies installed for MLOpsPython and ***diabetes_regression***. This image is an example of a custom Docker image with a pre-baked environment. The environment is guaranteed to be the same on any building agent, VM, or local machine. In your project, you'll want to build your own Docker image that only contains the dependencies and tools required for your use case. Your image will probably be smaller and faster, and it will be maintained by your team. +The pipeline uses a Docker container on the Azure Pipelines agents to accomplish the pipeline steps. The container image **_mcr.microsoft.com/mlops/python:latest_** is built with [this Dockerfile](../environment_setup/Dockerfile) and has all the necessary dependencies installed for MLOpsPython and **_diabetes_regression_**. This image is an example of a custom Docker image with a pre-baked environment. The environment is guaranteed to be the same on any building agent, VM, or local machine. In your project, you'll want to build your own Docker image that only contains the dependencies and tools required for your use case. Your image will probably be smaller and faster, and it will be maintained by your team. After the pipeline is finished, you'll see a new model in the **ML Workspace**: ![Trained model](./images/trained-model.png) -To disable the automatic trigger of the training pipeline, change the `auto-trigger-training` variable as listed in the `.pipelines\diabetes_regression-ci.yml` pipeline to `false`. You can also override the variable at runtime execution of the pipeline. +To disable the automatic trigger of the training pipeline, change the `auto-trigger-training` variable as listed in the `.pipelines\diabetes_regression-ci.yml` pipeline to `false`. You can also override the variable at runtime execution of the pipeline. To skip model training and registration, and deploy a model successfully registered by a previous build (for testing changes to the score file or inference configuration), add the variable `MODEL_BUILD_ID` when the pipeline is queued, and set the value to the ID of the previous build. @@ -202,7 +201,9 @@ MLOpsPython also can deploy to [Azure Kubernetes Service](https://azure.microsof Creating a cluster on Azure Kubernetes Service is out of scope of this tutorial, but you can find set up information on the [Quickstart: Deploy an Azure Kubernetes Service (AKS) cluster using the Azure portal](https://docs.microsoft.com/en-us/azure/aks/kubernetes-walkthrough-portal#create-an-aks-cluster) page. -**Note:** If your target deployment environment is a Kubernetes cluster and you want to implement Canary and/or A/B testing deployment strategies, check out this [tutorial](./canary_ab_deployment.md). +> **_Note_** +> +> If your target deployment environment is a Kubernetes cluster and you want to implement Canary and/or A/B testing deployment strategies, check out this [tutorial](./canary_ab_deployment.md). Keep the Azure Container Instances deployment active because it's a lightweight way to validate changes before deploying to Azure Kubernetes Service. @@ -213,7 +214,7 @@ In the Variables tab, edit your variable group (`devopsforai-aml-vg`). In the va | AKS_COMPUTE_NAME | aks | | AKS_DEPLOYMENT_NAME | mlops-aks | -Set **AKS_COMPUTE_NAME** to the *Compute name* of the Inference Cluster that references the Azure Kubernetes Service cluster in your Azure ML Workspace. +Set **AKS_COMPUTE_NAME** to the _Compute name_ of the Inference Cluster that references the Azure Kubernetes Service cluster in your Azure ML Workspace. After successfully deploying to Azure Container Instances, the next stage will deploy the model to Kubernetes and run a smoke test. @@ -249,8 +250,8 @@ Make sure your webapp has the credentials to pull the image from the Azure Conta The build pipeline also supports building and publishing Azure ML pipelines using R to train a model. You can enable it by changing the `build-train-script` pipeline variable to either of the following values: -* `diabetes_regression_build_train_pipeline_with_r.py` to train a model with R on Azure ML Compute. You'll also need to uncomment (include) the `r-essentials` Conda packages in the environment definition YAML `diabetes_regression/conda_dependencies.yml`. -* `diabetes_regression_build_train_pipeline_with_r_on_dbricks.py` to train a model with R on Databricks. You'll need to manually create a Databricks cluster and attach it to the Azure ML Workspace as a compute resource. Set the DB_CLUSTER_ID and DATABRICKS_COMPUTE_NAME variables in your variable group. +- `diabetes_regression_build_train_pipeline_with_r.py` to train a model with R on Azure ML Compute. You'll also need to uncomment (include) the `r-essentials` Conda packages in the environment definition YAML `diabetes_regression/conda_dependencies.yml`. +- `diabetes_regression_build_train_pipeline_with_r_on_dbricks.py` to train a model with R on Databricks. You'll need to manually create a Databricks cluster and attach it to the Azure ML Workspace as a compute resource. Set the DB_CLUSTER_ID and DATABRICKS_COMPUTE_NAME variables in your variable group. Example ML pipelines using R have a single step to train a model. They don't demonstrate how to evaluate and register a model. The evaluation and registering techniques are shown only in the Python implementation. @@ -258,9 +259,9 @@ Example ML pipelines using R have a single step to train a model. They don't dem You can explore aspects of model observability in the solution, such as: -* **Logging**: Navigate to the Application Insights instance linked to the Azure ML Portal, then go to the Logs (Analytics) pane. The following sample query correlates HTTP requests with custom logs generated in `score.py`. This can be used, for example, to analyze query duration vs. scoring batch size: +- **Logging**: Navigate to the Application Insights instance linked to the Azure ML Portal, then go to the Logs (Analytics) pane. The following sample query correlates HTTP requests with custom logs generated in `score.py`. This can be used, for example, to analyze query duration vs. scoring batch size: - ``` + ```sql let Traceinfo=traces | extend d=parse_json(tostring(customDimensions.Content)) | project workspace=customDimensions.["Workspace Name"], @@ -274,8 +275,8 @@ You can explore aspects of model observability in the solution, such as: | project-away id1 ``` -* **Distributed tracing**: The smoke test client code sets an HTTP `traceparent` header (per the [W3C Trace Context proposed specification](https://www.w3.org/TR/trace-context-1)), and the `score.py` code logs the header. The query above shows how to surface this value. You can adapt it to your tracing framework. -* **Monitoring**: You can use [Azure Monitor for containers](https://docs.microsoft.com/en-us/azure/azure-monitor/insights/container-insights-overview) to monitor the Azure ML scoring containers' performance. +- **Distributed tracing**: The smoke test client code sets an HTTP `traceparent` header (per the [W3C Trace Context proposed specification](https://www.w3.org/TR/trace-context-1)), and the `score.py` code logs the header. The query above shows how to surface this value. You can adapt it to your tracing framework. +- **Monitoring**: You can use [Azure Monitor for containers](https://docs.microsoft.com/en-us/azure/azure-monitor/insights/container-insights-overview) to monitor the Azure ML scoring containers' performance. ### Clean up the example resources @@ -283,8 +284,8 @@ To remove the resources created for this project, use the [/environment_setup/ia ## Next Steps: Integrating your project -* The [custom model](custom_model.md) guide includes information on bringing your own code to this repository template. -* Consider using [Azure Pipelines self-hosted agents](https://docs.microsoft.com/en-us/azure/devops/pipelines/agents/agents?view=azure-devops&tabs=browser#install) to speed up your Azure ML pipeline execution. The Docker container image for the Azure ML pipeline is sizable, and having it cached on the agent between runs can trim several minutes from your runs. +- The [custom model](custom_model.md) guide includes information on bringing your own code to this repository template. +- Consider using [Azure Pipelines self-hosted agents](https://docs.microsoft.com/en-us/azure/devops/pipelines/agents/agents?view=azure-devops&tabs=browser#install) to speed up your Azure ML pipeline execution. The Docker container image for the Azure ML pipeline is sizable, and having it cached on the agent between runs can trim several minutes from your runs. ### Additional Variables and Configuration @@ -294,7 +295,7 @@ There are more variables used in the project. They're defined in two places: one For using Azure Pipelines, all other variables are stored in the file `.pipelines/diabetes_regression-variables-template.yml`. Using the default values as a starting point, adjust the variables to suit your requirements. -In that folder, you'll also find the `parameters.json` file that we recommend using to provide parameters for training, evaluation, and scoring scripts. The sample parameter that `diabetes_regression` uses is the ridge regression [*alpha* hyperparameter](https://scikit-learn.org/stable/modules/generated/sklearn.linear_model.Ridge.html). We don't provide any serializers for this config file. +In that folder, you'll also find the `parameters.json` file that we recommend using to provide parameters for training, evaluation, and scoring scripts. The sample parameter that `diabetes_regression` uses is the ridge regression [_alpha_ hyperparameter](https://scikit-learn.org/stable/modules/generated/sklearn.linear_model.Ridge.html). We don't provide any serializers for this config file. #### Local configuration diff --git a/environment_setup/arm-templates/cloud-environment.json b/environment_setup/arm-templates/cloud-environment.json index f2b2ac2f..6d1114f6 100644 --- a/environment_setup/arm-templates/cloud-environment.json +++ b/environment_setup/arm-templates/cloud-environment.json @@ -45,6 +45,17 @@ "acr": { "type": "string", "defaultValue": "[concat(toLower(parameters('baseName')),'amlcr')]" + }, + "sku": { + "type": "string", + "defaultValue": "basic", + "allowedValues": [ + "basic", + "enterprise" + ], + "metadata": { + "description": "Specifies the sku, also referred as 'edition' of the Azure Machine Learning workspace." + } } }, "variables": { @@ -92,7 +103,8 @@ "name": "standard", "family": "A" }, - "accessPolicies": [] + "accessPolicies": [ + ] } }, { @@ -131,6 +143,10 @@ "identity": { "type": "systemAssigned" }, + "sku": { + "tier": "[parameters('sku')]", + "name": "[parameters('sku')]" + }, "properties": { "friendlyName": "[variables('amlWorkspaceName')]", "keyVault": "[resourceId('Microsoft.KeyVault/vaults',variables('keyVaultName'))]", @@ -138,6 +154,6 @@ "containerRegistry": "[resourceId('Microsoft.ContainerRegistry/registries',variables('containerRegistryName'))]", "storageAccount": "[resourceId('Microsoft.Storage/storageAccounts/',variables('storageAccountName'))]" } - } + } ] } \ No newline at end of file diff --git a/environment_setup/iac-create-environment-pipeline-arm.yml b/environment_setup/iac-create-environment-pipeline-arm.yml index f21c6eab..0b9f474c 100644 --- a/environment_setup/iac-create-environment-pipeline-arm.yml +++ b/environment_setup/iac-create-environment-pipeline-arm.yml @@ -2,36 +2,35 @@ trigger: branches: include: - - master + - master paths: include: - - environment_setup/arm-templates/* + - environment_setup/arm-templates/* pr: branches: include: - - master + - master paths: include: - - environment_setup/arm-templates/* + - environment_setup/arm-templates/* pool: - vmImage: 'ubuntu-latest' + vmImage: "ubuntu-latest" variables: -- group: devopsforai-aml-vg - + - group: devopsforai-aml-vg + - name: WORKSPACE_SKU # https://docs.microsoft.com/en-us/azure/machine-learning/overview-what-is-azure-ml#sku + value: basic steps: -- task: AzureResourceGroupDeployment@2 - inputs: - azureSubscription: '$(AZURE_RM_SVC_CONNECTION)' - action: 'Create Or Update Resource Group' - resourceGroupName: '$(RESOURCE_GROUP)' - location: $(LOCATION) - templateLocation: 'Linked artifact' - csmFile: '$(Build.SourcesDirectory)/environment_setup/arm-templates/cloud-environment.json' - overrideParameters: '-baseName $(BASE_NAME) -location $(LOCATION) -workspace $(WORKSPACE_NAME)' - deploymentMode: 'Incremental' - displayName: 'Deploy MLOps resources to Azure' - - \ No newline at end of file + - task: AzureResourceGroupDeployment@2 + inputs: + azureSubscription: "$(AZURE_RM_SVC_CONNECTION)" + action: "Create Or Update Resource Group" + resourceGroupName: "$(RESOURCE_GROUP)" + location: $(LOCATION) + templateLocation: "Linked artifact" + csmFile: "$(Build.SourcesDirectory)/environment_setup/arm-templates/cloud-environment.json" + overrideParameters: "-baseName $(BASE_NAME) -location $(LOCATION) -workspace $(WORKSPACE_NAME) -sku $(WORKSPACE_SKU)" + deploymentMode: "Incremental" + displayName: "Deploy MLOps resources to Azure"