diff --git a/.pipelines/azdo-ci-build-train.yml b/.pipelines/azdo-ci-build-train.yml index d1d95843..e1ed0dc6 100644 --- a/.pipelines/azdo-ci-build-train.yml +++ b/.pipelines/azdo-ci-build-train.yml @@ -8,6 +8,7 @@ trigger: - docs/ - environment_setup/ - ml_service/util/create_scoring_image.* + - ml_service/util/smoke_test_scoring_service.py variables: - template: azdo-variables.yml @@ -36,8 +37,9 @@ stages: # Invoke the Python building and publishing a training pipeline python $(Build.SourcesDirectory)/ml_service/pipelines/${{ variables.BUILD_TRAIN_SCRIPT }} displayName: 'Publish Azure Machine Learning Pipeline' + - stage: 'Trigger_AML_Pipeline' - displayName: 'Train, evaluate, register model via previously published AML pipeline' + displayName: 'Train model' jobs: - job: "Get_Pipeline_ID" condition: and(succeeded(), eq(coalesce(variables['auto-trigger-training'], 'true'), 'true')) @@ -87,32 +89,121 @@ stages: - job: "Training_Run_Report" dependsOn: "Run_ML_Pipeline" displayName: "Determine if evaluation succeeded and new model is registered" + pool: + vmImage: 'ubuntu-latest' + container: mcr.microsoft.com/mlops/python:latest + timeoutInMinutes: 0 + steps: + - template: azdo-template-get-model-version.yml +- stage: 'Deploy_ACI' + displayName: 'Deploy to ACI' + dependsOn: Trigger_AML_Pipeline + condition: and(succeeded(), variables['ACI_DEPLOYMENT_NAME']) + jobs: + - job: "Deploy_ACI" + displayName: "Deploy to ACI" + pool: + vmImage: 'ubuntu-latest' + container: mcr.microsoft.com/mlops/python:latest + timeoutInMinutes: 0 + steps: + - template: azdo-template-get-model-version.yml + - task: ms-air-aiagility.vss-services-azureml.azureml-model-deploy-task.AMLModelDeploy@0 + displayName: 'Azure ML Model Deploy' + inputs: + azureSubscription: $(WORKSPACE_SVC_CONNECTION) + modelSourceType: manualSpec + modelName: '$(MODEL_NAME)' + modelVersion: $(MODEL_VERSION) + inferencePath: '$(Build.SourcesDirectory)/code/scoring/inference_config.yml' + deploymentTarget: ACI + 
deploymentName: $(ACI_DEPLOYMENT_NAME) + deployConfig: '$(Build.SourcesDirectory)/code/scoring/deployment_config_aci.yml' + overwriteExistingDeployment: true + - task: AzureCLI@1 + displayName: 'Smoke test' + inputs: + azureSubscription: '$(WORKSPACE_SVC_CONNECTION)' + scriptLocation: inlineScript + inlineScript: | + set -e # fail on error + export SUBSCRIPTION_ID=$(az account show --query id -o tsv) + python ml_service/util/smoke_test_scoring_service.py --type ACI --service "$(ACI_DEPLOYMENT_NAME)" + +- stage: 'Deploy_AKS' + displayName: 'Deploy to AKS' + dependsOn: Deploy_ACI + condition: and(succeeded(), variables['AKS_DEPLOYMENT_NAME']) + jobs: + - job: "Deploy_AKS" + displayName: "Deploy to AKS" pool: vmImage: 'ubuntu-latest' container: mcr.microsoft.com/mlops/python:latest timeoutInMinutes: 0 steps: + - template: azdo-template-get-model-version.yml + - task: ms-air-aiagility.vss-services-azureml.azureml-model-deploy-task.AMLModelDeploy@0 + displayName: 'Azure ML Model Deploy' + inputs: + azureSubscription: $(WORKSPACE_SVC_CONNECTION) + modelSourceType: manualSpec + modelName: '$(MODEL_NAME)' + modelVersion: $(MODEL_VERSION) + inferencePath: '$(Build.SourcesDirectory)/code/scoring/inference_config.yml' + deploymentTarget: AKS + aksCluster: $(AKS_COMPUTE_NAME) + deploymentName: $(AKS_DEPLOYMENT_NAME) + deployConfig: '$(Build.SourcesDirectory)/code/scoring/deployment_config_aks.yml' + overwriteExistingDeployment: true - task: AzureCLI@1 + displayName: 'Smoke test' inputs: azureSubscription: '$(WORKSPACE_SVC_CONNECTION)' scriptLocation: inlineScript inlineScript: | set -e # fail on error export SUBSCRIPTION_ID=$(az account show --query id -o tsv) - python $(Build.SourcesDirectory)/ml_service/pipelines/verify_train_pipeline.py --build_id $(Build.BuildId) - displayName: "Determine if evaluation succeeded and new model is registered" - - task: CopyFiles@2 - displayName: 'Copy Files to: $(Build.ArtifactStagingDirectory)' + python 
ml_service/util/smoke_test_scoring_service.py --type AKS --service "$(AKS_DEPLOYMENT_NAME)" + +- stage: 'Deploy_Webapp' + displayName: 'Deploy to Webapp' + dependsOn: Trigger_AML_Pipeline + condition: and(succeeded(), variables['WEBAPP_DEPLOYMENT_NAME']) + jobs: + - job: "Deploy_Webapp" + displayName: "Deploy to Webapp" + pool: + vmImage: 'ubuntu-latest' + container: mcr.microsoft.com/mlops/python:latest + timeoutInMinutes: 0 + steps: + - template: azdo-template-get-model-version.yml + - task: AzureCLI@1 + displayName: 'Create scoring image and set IMAGE_LOCATION variable' inputs: - SourceFolder: '$(Build.SourcesDirectory)' - TargetFolder: '$(Build.ArtifactStagingDirectory)' - Contents: | - code/scoring/** - ml_service/util/** - - task: PublishBuildArtifacts@1 - displayName: 'Publish Artifact' + azureSubscription: '$(WORKSPACE_SVC_CONNECTION)' + scriptLocation: inlineScript + inlineScript: | + set -e # fail on error + export SUBSCRIPTION_ID=$(az account show --query id -o tsv) + python ml_service/util/create_scoring_image.py --output_image_location_file image_location.txt + # Output image location to Azure DevOps job + IMAGE_LOCATION="$(cat image_location.txt)" + echo "##vso[task.setvariable variable=IMAGE_LOCATION]$IMAGE_LOCATION" + - task: AzureWebAppContainer@1 + name: WebAppDeploy + displayName: 'Azure Web App on Container Deploy' + inputs: + azureSubscription: 'AzureResourceConnection' + appName: '$(WEBAPP_DEPLOYMENT_NAME)' + containers: '$(IMAGE_LOCATION)' + - task: AzureCLI@1 + displayName: 'Smoke test' inputs: - ArtifactName: 'mlops-pipelines' - publishLocation: 'container' - pathtoPublish: '$(Build.ArtifactStagingDirectory)' - TargetPath: '$(Build.ArtifactStagingDirectory)' + azureSubscription: '$(WORKSPACE_SVC_CONNECTION)' + scriptLocation: inlineScript + inlineScript: | + set -e # fail on error + export SUBSCRIPTION_ID=$(az account show --query id -o tsv) + python ml_service/util/smoke_test_scoring_service.py --type Webapp --service 
"$(WebAppDeploy.AppServiceApplicationUrl)/score" diff --git a/.pipelines/azdo-template-get-model-version.yml b/.pipelines/azdo-template-get-model-version.yml new file mode 100644 index 00000000..f69f3366 --- /dev/null +++ b/.pipelines/azdo-template-get-model-version.yml @@ -0,0 +1,14 @@ +steps: +- task: AzureCLI@1 + inputs: + azureSubscription: '$(WORKSPACE_SVC_CONNECTION)' + scriptLocation: inlineScript + inlineScript: | + set -e # fail on error + export SUBSCRIPTION_ID=$(az account show --query id -o tsv) + python $(Build.SourcesDirectory)/ml_service/pipelines/verify_train_pipeline.py --build_id $(Build.BuildId) --output_model_version_file "model_version.txt" + # Output model version to Azure DevOps job + MODEL_VERSION="$(cat model_version.txt)" + echo "##vso[task.setvariable variable=MODEL_VERSION]$MODEL_VERSION" + name: 'getversion' + displayName: "Determine if evaluation succeeded and new model is registered" diff --git a/code/scoring/score.py b/code/scoring/score.py index 716cd0e4..b78a435c 100644 --- a/code/scoring/score.py +++ b/code/scoring/score.py @@ -38,10 +38,27 @@ def init(): model = joblib.load(model_path) -def run(raw_data): +def run(raw_data, request_headers): data = json.loads(raw_data)["data"] data = numpy.array(data) result = model.predict(data) + + # Demonstrate how we can log custom data into the Application Insights + # traces collection. + # The 'X-Ms-Request-id' value is generated internally and can be used to + # correlate a log entry with the Application Insights requests collection. + # The HTTP 'traceparent' header may be set by the caller to implement + # distributed tracing (per the W3C Trace Context proposed specification) + # and can be used to correlate the request to external systems. 
+ print(('{{"RequestId":"{0}", ' + '"TraceParent":"{1}", ' + '"NumberOfPredictions":{2}}}' + ).format( + request_headers.get("X-Ms-Request-Id", ""), + request_headers.get("Traceparent", ""), + len(result) + )) + return {"result": result.tolist()} @@ -49,5 +66,5 @@ def run(raw_data): # Test scoring init() test_row = '{"data":[[1,2,3,4,5,6,7,8,9,10],[10,9,8,7,6,5,4,3,2,1]]}' - prediction = run(test_row) + prediction = run(test_row, {}) print("Test result: ", prediction) diff --git a/docs/getting_started.md b/docs/getting_started.md index 8cff7ec1..b5e92902 100644 --- a/docs/getting_started.md +++ b/docs/getting_started.md @@ -1,328 +1,264 @@ -# Getting Started with this Repo - -## Clone or fork this repository - -## Create an Azure DevOps account - -We use Azure DevOps for running our build(CI), retraining trigger and release -(CD) pipelines. If you don't already have an Azure DevOps account, create one by -following the instructions [here](https://docs.microsoft.com/en-us/azure/devops/organizations/accounts/create-organization?view=azure-devops). - -If you already have Azure DevOps account, create a [new project](https://docs.microsoft.com/en-us/azure/devops/organizations/projects/create-project?view=azure-devops). - -## Create an ARM Service Connection to deploy resources - -The repository includes a DevOps pipeline to deploy the Azure ML workspace and associated resources through Azure Resource Manager. - -The pipeline requires an **Azure Resource Manager** -[service connection](https://docs.microsoft.com/en-us/azure/devops/pipelines/library/service-endpoints?view=azure-devops&tabs=yaml#create-a-service-connection). -Given this service connection, you will be able to run the IaC pipeline -and have the required permissions to generate resources. - -![create service connection](./images/create-rm-service-connection.png) - -Use **``AzureResourceConnection``** as the connection name, since it is used -in the IaC pipeline definition. 
Leave the **``Resource Group``** field empty. - -**Note:** Creating the ARM service connection scope requires 'Owner' or 'User Access Administrator' permissions on the subscription. -You must also have sufficient permissions to register an application with -your Azure AD tenant, or receive the ID and secret of a service principal -from your Azure AD Administrator. That principal must have 'Contributor' -permissions on the subscription. - -## Create a Variable Group for your Pipelines - -We make use of variable group inside Azure DevOps to store variables and their -values that we want to make available across multiple pipelines. You can either -store the values directly in [Azure DevOps](https://docs.microsoft.com/en-us/azure/devops/pipelines/library/variable-groups?view=azure-devops&tabs=designer#create-a-variable-group) -or connect to an Azure Key Vault in your subscription. Please refer to the -documentation [here](https://docs.microsoft.com/en-us/azure/devops/pipelines/library/variable-groups?view=azure-devops&tabs=designer#create-a-variable-group) to -learn more about how to create a variable group and -[link](https://docs.microsoft.com/en-us/azure/devops/pipelines/library/variable-groups?view=azure-devops&tabs=designer#use-a-variable-group) it to your pipeline. -Click on **Library** in the **Pipelines** section as indicated below: - -![library_variable groups](./images/library_variable_groups.png) - -Please name your variable group **``devopsforai-aml-vg``** as we are using this -name within our build yaml file. 
- -The variable group should contain the following required variables: - -| Variable Name | Suggested Value | -| --------------------------- | -----------------------------------| -| BASE_NAME | [unique base name] | -| LOCATION | centralus | -| RESOURCE_GROUP | | -| WORKSPACE_NAME | mlops-AML-WS | -| WORKSPACE_SVC_CONNECTION | aml-workspace-connection | - -**Note:** - -The **WORKSPACE_NAME** parameter is used for the Azure Machine Learning Workspace creation. You can provide here an existing AML Workspace if you have one. - -The **BASE_NAME** parameter is used throughout the solution for naming -Azure resources. When the solution is used in a shared subscription, there can -be naming collisions with resources that require unique names like azure blob -storage and registry DNS naming. Make sure to give a unique value to the -BASE_NAME variable (e.g. MyUniqueML), so that the created resources will have -unique names (e.g. MyUniqueML-AML-RG, MyUniqueML-AML-KV, etc.). The length of -the BASE_NAME value should not exceed 10 characters. - -Make sure to select the **Allow access to all pipelines** checkbox in the -variable group configuration. - -## More variable options - -There are more variables used in the project. They're defined in two places one for local execution one for using Azure DevOps Pipelines - -### Local configuration - -In order to configure the project locally you have to create a copy from `.env.example` to the root and name it `.env`. Fill out all missing values and adjust the existing ones to your needs. - -For local development, you will also need to [install the Azure CLI](https://docs.microsoft.com/en-us/cli/azure/install-azure-cli). Azure CLI will be used to log you in interactively. -Please be aware that the local environment also needs access to the Azure subscription so you have to have Contributor access on the Azure ML Workspace. 
- -### Azure DevOps configuration - -For using Azure DevOps Pipelines all other variables are stored in the file `.pipelines/azdo-variables.yml`. Adjust as needed the variables, also the defaults will give you an easy jump start. - -Up until now you should have: - -* Forked (or cloned) the repo -* Created a devops account or use an existing one -* A variable group with all configuration values - -## Create Resources with Azure Pipelines - -The easiest way to create all required resources (Resource Group, ML Workspace, -Container Registry, Storage Account, etc.) is to leverage an -"Infrastructure as Code" [pipeline in this repository](../environment_setup/iac-create-environment.yml). This **IaC** pipeline takes care of setting up -all required resources based on these [ARM templates](../environment_setup/arm-templates/cloud-environment.json). - -To set up this pipeline, you will need to do the following steps: - -1. Create an Azure Resource Manager Service Connection -1. Create a Build IaC Pipeline - -### Create a Build IaC Pipeline - -In your DevOps project, create a build pipeline from your forked **GitHub** -repository: - -![build connnect step](./images/build-connect.png) - -Then, refer to an **Existing Azure Pipelines YAML file**: - -![configure step](./images/select-iac-pipeline.png) - -Having done that, run the pipeline: - -![iac run](./images/run-iac-pipeline.png) - -Check out created resources in the [Azure Portal](portal.azure.com): - -![created resources](./images/created-resources.png) - -Alternatively, you can also use a [cleaning pipeline](../environment_setup/iac-remove-environment.yml) that removes resources created for this project or -you can just delete a resource group in the [Azure Portal](portal.azure.com). 
- -## Create an Azure DevOps Azure ML Workspace Service Connection -Install the **Azure Machine Learning** extension to your organization from the -[marketplace](https://marketplace.visualstudio.com/items?itemName=ms-air-aiagility.vss-services-azureml), -so that you can set up a service connection to your AML workspace. - -Create a service connection to your ML workspace via the [Azure DevOps Azure ML task instructions](https://marketplace.visualstudio.com/items?itemName=ms-air-aiagility.vss-services-azureml) to be able to execute the Azure ML training pipeline. The connection name specified here needs to be used for the value of the `WORKSPACE_SVC_CONNECTION` set in the variable group below. - -**Note:** Creating service connection with Azure Machine Learning workspace scope requires 'Owner' or 'User Access Administrator' permissions on the Workspace. -You must also have sufficient permissions to register an application with -your Azure AD tenant, or receive the ID and secret of a service principal -from your Azure AD Administrator. That principal must have Contributor -permissions on the Azure ML Workspace. - -## Set up Build, Release Trigger, and Release Deployment Pipelines - -Now that you have all the required resources created from the IaC pipeline, -you can set up the rest of the pipelines necessary for deploying your ML model -to production. These are the pipelines that you will be setting up: - -1. **Build pipeline:** triggered on code change to master branch on GitHub, -performs linting, unit testing, publishing a training pipeline, and runs the published training pipeline to train, evaluate, and register a model. -1. **Release Deployment pipeline:** deploys a model to QA (ACI) and Prod (AKS) -environments. 
- -### Set up a Build Training Pipeline - -In your [Azure DevOps](https://dev.azure.com) project create and run a new build -pipeline referring to the [azdo-ci-build-train.yml](../.pipelines/azdo-ci-build-train.yml) -pipeline in your forked **GitHub** repository: - -![configure ci build pipeline](./images/ci-build-pipeline-configure.png) - -Name the pipeline **ci-build**. Once the pipline is finished, explore the -execution logs: - -![ci build logs](./images/ci-build-logs.png) - -and checkout a published training pipeline in the **mlops-AML-WS** workspace in -[Azure Portal](https://ms.portal.azure.com/): - -![training pipeline](./images/training-pipeline.png) - -Great, you now have the build pipeline set up which automatically triggers every time there's a change in the master -branch. The pipeline performs linting, unit testing, builds and publishes and executes a -**ML Training Pipeline** in a **ML Workspace**. - -**Note:** The build pipeline contains disabled steps to build and publish ML -pipelines using R to train a model. Enable these steps if you want to play with -this approach by changing the `build-train-script` pipeline variable to either `build_train_pipeline_with_r.py`, or `build_train_pipeline_with_r_on_dbricks.py`. For the pipeline training a model with R on Databricks you have -to manually create a Databricks cluster and attach it to the ML Workspace as a -compute (Values DB_CLUSTER_ID and DATABRICKS_COMPUTE_NAME variables shoud be -specified). - -![running training pipeline](./images/running-training-pipeline.png) - -The training pipeline will train, evaluate, and register a new model. Wait until -it is finished and make sure there is a new model in the **ML Workspace**: - -![trained model](./images/trained-model.png) - -To disable the automatic trigger of the training pipeline, change the `auto-trigger-training` variable as listed in the `.pipelines\azdo-ci-build-train.yml` pipeline to `false`. 
This can also be overridden at runtime execution of the pipeline. - -### Set up a Release Deployment Pipeline to Deploy the Model - -The final step is to deploy the model across environments with a release -pipeline. There will be a **``QA``** environment running on -[Azure Container Instances](https://azure.microsoft.com/en-us/services/container-instances/) -and a **``Prod``** environment running on -[Azure Kubernetes Service](https://azure.microsoft.com/en-us/services/kubernetes-service). -This is the final picture of what your release pipeline should look like: - -![deploy model](./images/deploy-model.png) - -The pipeline consumes two artifacts: - -1. the result of the **Build Pipeline** as it contains configuration files -1. the **model** trained and registered by the ML training pipeline - -Add an artifact to the pipeline and select **AzureML Model Artifact** source -type. Select the **Service Endpoint** and **Model Names** from the drop down -lists. **Service Endpoint** refers to the **Service connection** created in -the previous step: - -![model artifact](./images/model-artifact.png) - -Go to the new **Releases Pipelines** section, and click new to create a new -release pipeline. A first stage is automatically created and choose -**start with an Empty job**. Name the stage **QA (ACI)** and add a single task -to the job **Azure ML Model Deploy**. Make sure that the Agent Specification -is ubuntu-16.04 under the Agent Job: - -![deploy aci](./images/deploy-aci.png) - -Specify task parameters as it is shown in the table below: - -| Parameter | Value | -| ----------------------------- | ---------------------------------------------------------------------------------------------------- | -| Display Name | Azure ML Model Deploy | -| Azure ML Workspace | mlops-AML-WS | -| Inference config Path | `$(System.DefaultWorkingDirectory)/_ci-build/mlops-pipelines/code/scoring/inference_config.yml`
_(The `_ci-build` part of the path is the source alias of your CI artifact)_ | -| Model Deployment Target | Azure Container Instance | -| Deployment Name | mlopspython-aci | -| Deployment Configuration file | `$(System.DefaultWorkingDirectory)/_ci-build/mlops-pipelines/code/scoring/deployment_config_aci.yml`
_(The `_ci-build` part of the path is the source alias of your CI artifact)_ | -| Overwrite existing deployment | X | - -In a similar way, create a stage **Prod (AKS)** and add a single task to the job -**Azure ML Model Deploy**. Make sure that the Agent Specification is -ubuntu-16.04 under the Agent Job: - -![deploy aks](./images/deploy-aks.png) - -Specify task parameters as it is shown in the table below: - -| Parameter | Value | -| --------------------------------- | ---------------------------------------------------------------------------------------------------- | -| Display Name | Azure ML Model Deploy | -| Azure ML Workspace | mlops-AML-WS | -| Inference config Path | `$(System.DefaultWorkingDirectory)/_ci-build/mlops-pipelines/code/scoring/inference_config.yml`
_(The `_ci-build` part of the path is the source alias of your CI artifact)_ | -| Model Deployment Target | Azure Kubernetes Service | -| Select AKS Cluster for Deployment | YOUR_DEPLOYMENT_K8S_CLUSTER | -| Deployment Name | mlopspython-aks | -| Deployment Configuration file | `$(System.DefaultWorkingDirectory)/_ci-build/mlops-pipelines/code/scoring/deployment_config_aks.yml`
_(The `_ci-build` part of the path is the source alias of your CI artifact)_ | -| Overwrite existing deployment | X | - -**Note:** Creating of a Kubernetes cluster on AKS is out of scope of this -tutorial, but you can find set up information in the docs -[here](https://docs.microsoft.com/en-us/azure/aks/kubernetes-walkthrough-portal#create-an-aks-cluster). - -Similarly to the **Invoke Training Pipeline** release pipeline, previously -created, in order to trigger a coutinuous integration, click on the lightning -bolt icon, make sure the **Continuous deployment trigger** is checked and -save the trigger: - -![Automate Deploy Model Pipeline](./images/automate_deploy_model_pipeline.png) - -Congratulations! You have three pipelines set up end to end: - -* **Build pipeline:** triggered on code change to master branch on GitHub, -performs linting, unit testing and publishing a training pipeline. -* **Release Trigger pipeline:** runs a published training pipeline to train, -evaluate and register a model. -* **Release Deployment pipeline:** deploys a model to QA (ACI) and Prod (AKS) -environments. - -## Deploy the trained model to Azure Web App for containers - -Note: This is an optional step and can be used only if you are deploying your -scoring service on Azure Web Apps. - -[Create Image Script](../ml_service/util/create_scoring_image.py) -can be used to create a scoring image from the release pipeline. The image -created by this script will be registered under Azure Container Registry (ACR) -instance that belongs to Azure Machine Learning Service. Any dependencies that -scoring file depends on can also be packaged with the container with Image -config. To learn more on how to create a container with AML SDK click -[here](https://docs.microsoft.com/en-us/python/api/azureml-core/azureml.core.image.image.image?view=azure-ml-py#create-workspace--name--models--image-config-). 
- -Below is release pipeline with two tasks one to create an image using the above -script and second is the deploy the image to Web App for containers. - -![release_webapp](./images/release-webapp-pipeline.PNG) - -In the Variables tab, link the pipeline to your variable group (`devopsforai-aml-vg`). In the variable group definition, add the following variables: - -| Variable Name | Suggested Value | -| --------------------------- | -----------------------------------| -| MODEL_NAME | sklearn_regression_model.pkl | -| IMAGE_NAME | diabetes | - -Add as an artifact to the pipeline the result of the **Build Pipeline** as it contains the necessary scripts. - -Use an Agent of type `ubuntu-16.04`. - -For the Azure CLI task to invoke the [Create Image Script](../ml_service/util/create_scoring_image.py), specify the following task parameters: - -| Parameter | Value | -| ------------------ | --------------------------------------------------------------------------------------------------- | -| Display name | Create Scoring Image | -| Azure subscription | aml-workspace-connection | -| Script Path | `$(System.DefaultWorkingDirectory)/_ci-build/mlops-pipeline/ml_service/util/create_scoring_image.sh`
_(The `_ci-build` part of the path is the source alias of your CI artifact)_ | -| Working directory | `$(System.DefaultWorkingDirectory)/_ci-build/mlops-pipelines`
_(The `_ci-build` part of the path is the source alias of your CI artifact)_ | - -![release_createimage](./images/release-task-createimage.PNG) - -Finally, for the Azure Web App for Containers Task, specify the following task -parameters as it is shown in the table below: - -| Parameter | Value | -| ------------------ | --------------------------------------------------------------------------------------------------- | -| Azure subscription | Subscription used to deploy Web App | -| App name | Web App for Containers name | -| Image name | Specify the fully qualified container image name. For example, 'myregistry.azurecr.io/nginx:latest' | - -![release_webapp](./images/release-task-webappdeploy.PNG) - -Save the pipeline and create a release to trigger it manually. To create the -trigger, click on the "Create release" button on the top right of your screen, -leave the fields blank and click on **Create** at the bottom of the screen. -Once the pipeline execution is finished, check out deployments in the -**mlops-AML-WS** workspace. +# Getting Started with this Repo + +## Clone or fork this repository + +## Create an Azure DevOps account + +We use Azure DevOps for running our multi-stage pipeline with build(CI), ML training and scoring service release +(CD) stages. If you don't already have an Azure DevOps account, create one by +following the instructions [here](https://docs.microsoft.com/en-us/azure/devops/organizations/accounts/create-organization?view=azure-devops). + +If you already have Azure DevOps account, create a [new project](https://docs.microsoft.com/en-us/azure/devops/organizations/projects/create-project?view=azure-devops). + +## Create an ARM Service Connection to deploy resources + +The repository includes a DevOps pipeline to deploy the Azure ML workspace and associated resources through Azure Resource Manager. 
+ +The pipeline requires an **Azure Resource Manager** +[service connection](https://docs.microsoft.com/en-us/azure/devops/pipelines/library/service-endpoints?view=azure-devops&tabs=yaml#create-a-service-connection). +Given this service connection, you will be able to run the IaC pipeline +and have the required permissions to generate resources. + +![create service connection](./images/create-rm-service-connection.png) + +Use **``AzureResourceConnection``** as the connection name, since it is used +in the IaC pipeline definition. Leave the **``Resource Group``** field empty. + +**Note:** Creating the ARM service connection scope requires 'Owner' or 'User Access Administrator' permissions on the subscription. +You must also have sufficient permissions to register an application with +your Azure AD tenant, or receive the ID and secret of a service principal +from your Azure AD Administrator. That principal must have 'Contributor' +permissions on the subscription. + +## Create a Variable Group for your Pipeline + +We make use of variable group inside Azure DevOps to store variables and their +values that we want to make available across multiple pipelines or pipeline stages. You can either +store the values directly in [Azure DevOps](https://docs.microsoft.com/en-us/azure/devops/pipelines/library/variable-groups?view=azure-devops&tabs=designer#create-a-variable-group) +or connect to an Azure Key Vault in your subscription. Please refer to the +documentation [here](https://docs.microsoft.com/en-us/azure/devops/pipelines/library/variable-groups?view=azure-devops&tabs=designer#create-a-variable-group) to +learn more about how to create a variable group and +[link](https://docs.microsoft.com/en-us/azure/devops/pipelines/library/variable-groups?view=azure-devops&tabs=designer#use-a-variable-group) it to your pipeline. 
+Click on **Library** in the **Pipelines** section as indicated below: + +![library_variable groups](./images/library_variable_groups.png) + +Please name your variable group **``devopsforai-aml-vg``** as we are using this +name within our build yaml file. + +The variable group should contain the following required variables: + +| Variable Name | Suggested Value | +| --------------------------- | -----------------------------------| +| BASE_NAME | [unique base name] | +| LOCATION | centralus | +| RESOURCE_GROUP | | +| WORKSPACE_NAME | mlops-AML-WS | +| WORKSPACE_SVC_CONNECTION | aml-workspace-connection | +| ACI_DEPLOYMENT_NAME | diabetes-aci | + +**Note:** + +The **WORKSPACE_NAME** parameter is used for the Azure Machine Learning Workspace creation. You can provide here an existing AML Workspace if you have one. + +The **BASE_NAME** parameter is used throughout the solution for naming +Azure resources. When the solution is used in a shared subscription, there can +be naming collisions with resources that require unique names like azure blob +storage and registry DNS naming. Make sure to give a unique value to the +BASE_NAME variable (e.g. MyUniqueML), so that the created resources will have +unique names (e.g. MyUniqueML-AML-RG, MyUniqueML-AML-KV, etc.). The length of +the BASE_NAME value should not exceed 10 characters. + +Make sure to select the **Allow access to all pipelines** checkbox in the +variable group configuration. + +## More variable options + +There are more variables used in the project. They're defined in two places one for local execution one for using Azure DevOps Pipelines + +### Local configuration + +In order to configure the project locally you have to create a copy from `.env.example` to the root and name it `.env`. Fill out all missing values and adjust the existing ones to your needs. + +For local development, you will also need to [install the Azure CLI](https://docs.microsoft.com/en-us/cli/azure/install-azure-cli). 
Azure CLI will be used to log you in interactively. +Please be aware that the local environment also needs access to the Azure subscription so you have to have Contributor access on the Azure ML Workspace. + +### Azure DevOps configuration + +For using Azure DevOps Pipelines all other variables are stored in the file `.pipelines/azdo-variables.yml`. Adjust the variables as needed; the defaults will give you an easy jump start. + +Up until now you should have: + +* Forked (or cloned) the repo +* Created a devops account or use an existing one +* A variable group with all configuration values + +## Create Resources with Azure Pipelines + +The easiest way to create all required resources (Resource Group, ML Workspace, +Container Registry, Storage Account, etc.) is to leverage an +"Infrastructure as Code" [pipeline in this repository](../environment_setup/iac-create-environment.yml). This **IaC** pipeline takes care of setting up +all required resources based on these [ARM templates](../environment_setup/arm-templates/cloud-environment.json). + +To set up this pipeline, you will need to do the following steps: + +1. Create an Azure Resource Manager Service Connection +1. Create a Build IaC Pipeline + +### Create a Build IaC Pipeline + +In your DevOps project, create a build pipeline from your forked **GitHub** +repository: + +![build connect step](./images/build-connect.png) + +Then, refer to an **Existing Azure Pipelines YAML file**: + +![configure step](./images/select-iac-pipeline.png) + +Having done that, run the pipeline: + +![iac run](./images/run-iac-pipeline.png) + +Check out created resources in the [Azure Portal](https://portal.azure.com): + +![created resources](./images/created-resources.png) + +Alternatively, you can also use a [cleaning pipeline](../environment_setup/iac-remove-environment.yml) that removes resources created for this project or +you can just delete a resource group in the [Azure Portal](https://portal.azure.com).
+ +## Create an Azure DevOps Azure ML Workspace Service Connection +Install the **Azure Machine Learning** extension to your organization from the +[marketplace](https://marketplace.visualstudio.com/items?itemName=ms-air-aiagility.vss-services-azureml), +so that you can set up a service connection to your AML workspace. + +Create a service connection to your ML workspace via the [Azure DevOps Azure ML task instructions](https://marketplace.visualstudio.com/items?itemName=ms-air-aiagility.vss-services-azureml) to be able to execute the Azure ML training pipeline. The connection name specified here needs to be used for the value of the `WORKSPACE_SVC_CONNECTION` set in the variable group above. + +**Note:** Creating a service connection with Azure Machine Learning workspace scope requires 'Owner' or 'User Access Administrator' permissions on the Workspace. +You must also have sufficient permissions to register an application with +your Azure AD tenant, or receive the ID and secret of a service principal +from your Azure AD Administrator. That principal must have Contributor +permissions on the Azure ML Workspace. + +## Set up Build, Release Trigger, and Release Multi-Stage Pipeline + +Now that you have all the required resources created from the IaC pipeline, +you can set up the pipeline necessary for deploying your ML model +to production. The pipeline has a sequence of stages for: + +1. **Model Code Continuous Integration:** triggered on code changes to the master branch on GitHub, +performs linting and unit testing, publishes a training pipeline, and runs the published training pipeline to train, evaluate, and register a model. +1. **Train Model**: invokes the Azure ML service to trigger model training. +1. **Release Deployment:** deploys a model to QA (ACI) and Prod (AKS) +environments, or alternatively to Azure App Service. 
+ +### Set up the Pipeline + +In your [Azure DevOps](https://dev.azure.com) project create and run a new build +pipeline referring to the [azdo-ci-build-train.yml](../.pipelines/azdo-ci-build-train.yml) +pipeline in your forked **GitHub** repository: + +![configure ci build pipeline](./images/ci-build-pipeline-configure.png) + +Once the pipeline is finished, explore the execution result: + +![build](./images/multi-stage-aci.png) + +and check out a published training pipeline in the **mlops-AML-WS** workspace in +[Azure Portal](https://ms.portal.azure.com/): + +![training pipeline](./images/training-pipeline.png) + +Great, you now have the build pipeline set up which automatically triggers every time there's a change in the master branch. + +* The first stage of the pipeline, **Model CI**, performs linting and unit testing, then builds and publishes an **ML Training Pipeline** in an **ML Workspace**. + + **Note:** The build pipeline also supports building and publishing ML +pipelines using R to train a model. This is enabled +by changing the `build-train-script` pipeline variable to either `build_train_pipeline_with_r.py`, or `build_train_pipeline_with_r_on_dbricks.py`. For the pipeline training a model with R on Databricks you have +to manually create a Databricks cluster and attach it to the ML Workspace as a +compute (values for the DB_CLUSTER_ID and DATABRICKS_COMPUTE_NAME variables should be +specified). + +* The second stage of the pipeline, **Train model**, triggers the run of the ML Training Pipeline. The training pipeline will train, evaluate, and register a new model. The actual computation is performed in an [Azure Machine Learning Compute cluster](https://docs.microsoft.com/en-us/azure/machine-learning/service/how-to-set-up-training-targets#amlcompute). In Azure DevOps, this stage runs an agentless job that waits for the completion of the Azure ML job, so it can wait for training completion for hours or even days without using agent resources. 
+ +* The third stage of the pipeline, **Deploy to ACI**, deploys the model to the QA environment in [Azure Container Instances](https://azure.microsoft.com/en-us/services/container-instances/). It then runs a *smoke test* to validate the deployment, i.e. sends a sample query to the scoring web service and verifies that it returns a response in the expected format. + +Wait until the pipeline has finished and make sure there is a new model in the **ML Workspace**: + +![trained model](./images/trained-model.png) + +To disable the automatic trigger of the training pipeline, change the `auto-trigger-training` variable as listed in the `.pipelines/azdo-ci-build-train.yml` pipeline to `false`. This can also be overridden at runtime execution of the pipeline. + +### Deploy the Model to Azure Kubernetes Service + +The final stage is to deploy the model to the production environment running on +[Azure Kubernetes Service](https://azure.microsoft.com/en-us/services/kubernetes-service). + +**Note:** Creating a Kubernetes cluster on AKS is out of scope of this +tutorial, but you can find setup information in the docs +[here](https://docs.microsoft.com/en-us/azure/aks/kubernetes-walkthrough-portal#create-an-aks-cluster). + +In the Variables tab, edit your variable group (`devopsforai-aml-vg`). In the variable group definition, add the following variables: + +| Variable Name | Suggested Value | +| --------------------------- | -----------------------------------| +| AKS_COMPUTE_NAME | aks | +| AKS_DEPLOYMENT_NAME | diabetes-aks | + +Set **AKS_COMPUTE_NAME** to the *Compute name* of the Inference Cluster referencing your AKS cluster in your Azure ML Workspace. + +After successfully deploying to Azure Container Instances, the next stage will deploy the model to Kubernetes and run a smoke test. 
+ +![build](./images/multi-stage-aci-aks.png) + +## Deploy the Model to Azure App Service (Azure Web App for containers) + +Note: This is an optional step and can be used only if you are [deploying your +scoring service on Azure App Service](https://docs.microsoft.com/en-us/azure/machine-learning/service/how-to-deploy-app-service). + +In the Variables tab, edit your variable group (`devopsforai-aml-vg`). In the variable group definition, add the following variable: + +| Variable Name | Suggested Value | +| --------------------------- | -----------------------------------| +| WEBAPP_DEPLOYMENT_NAME | mlopswebapp | + +Set **WEBAPP_DEPLOYMENT_NAME** to the name of your Azure Web App. Delete the **ACI_DEPLOYMENT_NAME** variable. + +The pipeline uses the [Create Image Script](../ml_service/util/create_scoring_image.py) +to create a scoring image. The image +created by this script will be registered under the Azure Container Registry (ACR) +instance that belongs to the Azure Machine Learning Service. Any dependencies that +the scoring file depends on can also be packaged with the container via the image +config. +[Learn more on how to create a container with AML SDK](https://docs.microsoft.com/en-us/python/api/azureml-core/azureml.core.image.image.image?view=azure-ml-py#create-workspace--name--models--image-config-). + +Make sure your webapp has the credentials to pull the image from the Azure Container Registry created by the Infrastructure as Code pipeline. You could do this by following the instructions in the section [Configure registry credentials in web app](https://docs.microsoft.com/en-us/azure/devops/pipelines/targets/webapp-on-container-linux?view=azure-devops&tabs=dotnet-core%2Cyaml#configure-registry-credentials-in-web-app). 
Note that you must have run the pipeline once (including the Deploy to Webapp stage up to the `Create scoring image` step) so that an image is present in the registry, before you can connect the Webapp to the Azure Container Registry in the Azure Portal. + +![build](./images/multi-stage-webapp.png) + +# Next steps + +* The provided pipeline definition YAML file is a sample starting point, which you should tailor to your processes and environment. +* You should edit the pipeline definition to remove unused stages. For example, if you are deploying to ACI and AKS, you should delete the unused `Deploy_Webapp` stage. +* The sample pipeline generates a random value for a model hyperparameter (ridge regression [*alpha*](https://scikit-learn.org/stable/modules/generated/sklearn.linear_model.Ridge.html)) to generate 'interesting' charts when testing the sample. In a real application you should use fixed hyperparameter values. You can [tune hyperparameter values using Azure ML](https://docs.microsoft.com/en-us/azure/machine-learning/service/how-to-tune-hyperparameters), and manage their values in Azure DevOps Variable Groups. +* You may wish to enable [manual approvals](https://docs.microsoft.com/en-us/azure/devops/pipelines/process/approvals) before the deployment stages. +* You can explore aspects of model observability in the solution, such as: + * **Logging**: navigate to the Application Insights instance linked to the Azure ML Portal, + then to the Logs (Analytics) pane. The following sample query correlates HTTP requests with custom logs + generated in `score.py`, and can be used for example to analyze query duration vs. 
scoring batch size: + + let Traceinfo=traces + | extend d=parse_json(tostring(customDimensions.Content)) + | project workspace=customDimensions.["Workspace Name"], + service=customDimensions.["Service Name"], + NumberOfPredictions=tostring(d.NumberOfPredictions), + id=tostring(d.RequestId), + TraceParent=tostring(d.TraceParent); + requests + | project timestamp, id, success, resultCode, duration + | join kind=fullouter Traceinfo on id + | project-away id1 + + * **Distributed tracing**: The smoke test client code sets an HTTP `traceparent` header (per the [W3C Trace Context proposed specification](https://www.w3.org/TR/trace-context-1)), and the `score.py` code logs this header. The query above shows how to surface this value. You can adapt this to your tracing framework. + * **Monitoring**: You can use [Azure Monitor for containers](https://docs.microsoft.com/en-us/azure/azure-monitor/insights/container-insights-overview) to monitor the Azure ML scoring containers' performance, just as for any other container. 
\ No newline at end of file diff --git a/docs/images/automate_deploy_model_pipeline.png b/docs/images/automate_deploy_model_pipeline.png deleted file mode 100644 index 35c7f54e..00000000 Binary files a/docs/images/automate_deploy_model_pipeline.png and /dev/null differ diff --git a/docs/images/ci-build-logs.png b/docs/images/ci-build-logs.png deleted file mode 100644 index 726f70ac..00000000 Binary files a/docs/images/ci-build-logs.png and /dev/null differ diff --git a/docs/images/create-rm-service-connection.png b/docs/images/create-rm-service-connection.png index 629d3c2a..011018d3 100644 Binary files a/docs/images/create-rm-service-connection.png and b/docs/images/create-rm-service-connection.png differ diff --git a/docs/images/deploy-model.png b/docs/images/deploy-model.png deleted file mode 100644 index 8a4cbd06..00000000 Binary files a/docs/images/deploy-model.png and /dev/null differ diff --git a/docs/images/multi-stage-aci-aks.png b/docs/images/multi-stage-aci-aks.png new file mode 100644 index 00000000..0307fbf6 Binary files /dev/null and b/docs/images/multi-stage-aci-aks.png differ diff --git a/docs/images/multi-stage-aci.png b/docs/images/multi-stage-aci.png new file mode 100644 index 00000000..a96f3195 Binary files /dev/null and b/docs/images/multi-stage-aci.png differ diff --git a/docs/images/multi-stage-webapp.png b/docs/images/multi-stage-webapp.png new file mode 100644 index 00000000..e6d60ce1 Binary files /dev/null and b/docs/images/multi-stage-webapp.png differ diff --git a/ml_service/pipelines/verify_train_pipeline.py b/ml_service/pipelines/verify_train_pipeline.py index b677dd6e..db1725ba 100644 --- a/ml_service/pipelines/verify_train_pipeline.py +++ b/ml_service/pipelines/verify_train_pipeline.py @@ -41,6 +41,12 @@ def main(): type=str, help="The Build ID of the build triggering this pipeline run", ) + parser.add_argument( + "--output_model_version_file", + type=str, + default="model_version.txt", + help="Name of a file to write model version 
to" + ) args = parser.parse_args() if (args.build_id is not None): @@ -61,6 +67,11 @@ def main(): print("Model was not registered for this run.") sys.exit(1) + # Save the Model Version for other AzDO jobs after script is complete + if args.output_model_version_file is not None: + with open(args.output_model_version_file, "w") as out_file: + out_file.write(str(model.version)) + if __name__ == '__main__': main() diff --git a/ml_service/util/create_scoring_image.py b/ml_service/util/create_scoring_image.py index af7de448..0968b6c4 100644 --- a/ml_service/util/create_scoring_image.py +++ b/ml_service/util/create_scoring_image.py @@ -1,5 +1,6 @@ import os import sys +import argparse from azureml.core import Workspace from azureml.core.image import ContainerImage, Image from azureml.core.model import Model @@ -15,6 +16,15 @@ resource_group=e.resource_group ) +parser = argparse.ArgumentParser("create scoring image") +parser.add_argument( + "--output_image_location_file", + type=str, + help=("Name of a file to write image location to, " + "in format REGISTRY.azurecr.io/IMAGE_NAME:IMAGE_VERSION") +) +args = parser.parse_args() + model = Model(ws, name=e.model_name, version=e.model_version) os.chdir("./code/scoring") @@ -30,6 +40,8 @@ name=e.image_name, models=[model], image_config=image_config, workspace=ws ) +os.chdir("../..") + image.wait_for_creation(show_output=True) if image.creation_state != "Succeeded": @@ -43,3 +55,9 @@ image.image_build_log_uri, ) ) + +# Save the Image Location for other AzDO jobs after script is complete +if args.output_image_location_file is not None: + print("Writing image location to %s" % args.output_image_location_file) + with open(args.output_image_location_file, "w") as out_file: + out_file.write(str(image.image_location)) diff --git a/ml_service/util/smoke_test_scoring_service.py b/ml_service/util/smoke_test_scoring_service.py new file mode 100644 index 00000000..753ef23e --- /dev/null +++ b/ml_service/util/smoke_test_scoring_service.py @@ 
"""Smoke test for a deployed scoring service.

Sends a fixed sample payload to a deployed scoring endpoint (an Azure ML
ACI/AKS web service, or a plain URL for a Web App deployment) and verifies
that the JSON response contains a ``result`` entry of the expected length.
Exits with a non-zero status (uncaught exception) when the check fails.
"""
import argparse
import os
import secrets
import sys
import time

import requests
from azureml.core import Workspace
from azureml.core.webservice import AksWebservice, AciWebservice

sys.path.append(os.path.abspath("./ml_service/util"))  # NOQA: E402
from env_variables import Env  # NOQA: E402


# Sample request payload: two rows of ten numeric values each.
# NOTE: the name shadows the ``input`` builtin; it is kept unchanged so that
# any code importing it from this module keeps working.
input = {"data": [[1, 2, 3, 4, 5, 6, 7, 8, 9, 10],
                  [10, 9, 8, 7, 6, 5, 4, 3, 2, 1]]}
# Expected length of the "result" list in the response; matches the number
# of rows in ``input``.
output_len = 2


def call_web_service(e, service_type, service_name):
    """Look up an Azure ML web service and send it the sample payload.

    Args:
        e: Environment settings object providing ``workspace_name``,
            ``subscription_id`` and ``resource_group``.
        service_type: ``"ACI"`` selects an ``AciWebservice``; any other value
            is treated as an AKS service.
        service_name: Name of the deployed web service in the workspace.

    Returns:
        The decoded JSON response of the scoring endpoint.
    """
    aml_workspace = Workspace.get(
        name=e.workspace_name,
        subscription_id=e.subscription_id,
        resource_group=e.resource_group
    )
    print("Fetching service")
    headers = {}
    if service_type == "ACI":
        service = AciWebservice(aml_workspace, service_name)
    else:
        service = AksWebservice(aml_workspace, service_name)
    if service.auth_enabled:
        # Key-based auth: the first service key is sent as a bearer token.
        service_keys = service.get_keys()
        headers['Authorization'] = 'Bearer ' + service_keys[0]
    print("Testing service")
    print(". url: %s" % service.scoring_uri)
    output = call_web_app(service.scoring_uri, headers)

    return output


def call_web_app(url, headers, retries=600, retry_delay=1):
    """POST the sample payload to ``url``, retrying on HTTP error responses.

    Args:
        url: Scoring endpoint URL.
        headers: HTTP headers to send. The mapping is copied, so the
            caller's dict is not modified.
        retries: Maximum number of attempts (default 600, as before).
        retry_delay: Seconds to sleep between attempts (default 1, as before).

    Returns:
        The decoded JSON body of the first successful response.

    Raises:
        ValueError: if ``retries`` is less than 1.
        requests.exceptions.HTTPError: if the last attempt still returns an
            HTTP error status. Other request errors (e.g. connection
            failures) are not retried and propagate immediately.
    """
    if retries < 1:
        raise ValueError("retries must be at least 1")

    # Work on a copy so the caller's headers dict is left untouched.
    request_headers = dict(headers)

    # Generate an HTTP 'traceparent' distributed tracing header
    # (per the W3C Trace Context proposed specification):
    # version "00", 16-byte trace id, 8-byte parent id, flags "00".
    request_headers['traceparent'] = "00-{0}-{1}-00".format(
        secrets.token_hex(16), secrets.token_hex(8))

    for attempt in range(1, retries + 1):
        try:
            response = requests.post(
                url, json=input, headers=request_headers)
            response.raise_for_status()
            return response.json()
        except requests.exceptions.HTTPError as e:
            if attempt == retries:
                # Out of attempts: re-raise with the original traceback.
                raise
            print(e)
            print("Retrying...")
            time.sleep(retry_delay)


def main():
    """Parse command-line arguments, call the service and verify its output.

    Raises:
        AssertionError: if the response has no ``result`` entry or the entry
            does not contain ``output_len`` items.
    """
    parser = argparse.ArgumentParser("smoke_test_scoring_service.py")

    parser.add_argument(
        "--type",
        type=str,
        choices=["AKS", "ACI", "Webapp"],
        required=True,
        help="type of service"
    )
    parser.add_argument(
        "--service",
        type=str,
        required=True,
        help="Name of the service to test (scoring URL for type Webapp)"
    )
    args = parser.parse_args()

    e = Env()
    if args.type == "Webapp":
        # For Web Apps the --service value is used directly as the URL.
        output = call_web_app(args.service, {})
    else:
        output = call_web_service(e, args.type, args.service)
    print("Verifying service output")

    # Explicit checks instead of `assert` statements, which are removed when
    # Python runs with -O and would let a broken service pass the test.
    if "result" not in output:
        raise AssertionError(
            "Response does not contain a 'result' entry: %r" % (output,))
    if len(output["result"]) != output_len:
        raise AssertionError(
            "Expected %d results, got %d"
            % (output_len, len(output["result"])))
    print("Smoke test successful.")


if __name__ == '__main__':
    main()