
Upgrade build train CI pipeline to multi-stage (#90)
* add staged pipeline

* remove release id

* remove train in release pipeline

* revert to BASE_NAME vars

* Move train trigger to new stage

* cleanup register comments

* add conditional for triggering train pipe

* update doc steps

* string vs boolean

* var to boolean

* set to false

* try with true

* cleanup images

* Use Coalesce so override works

* add back build artifacts

* address feedback

* include code/scoring path for ci
dtzar authored and eedorenko committed Nov 12, 2019
1 parent fcc6fde commit d5864f3
Showing 11 changed files with 68 additions and 151 deletions.
117 changes: 62 additions & 55 deletions .pipelines/azdo-ci-build-train.yml
@@ -3,62 +3,69 @@ trigger:
branches:
include:
- master

pool:
vmImage: 'ubuntu-latest'

container: mcr.microsoft.com/mlops/python:latest

paths:
exclude:
- docs/
- environment_setup/
- charts/
- ml_service/util/create_scoring_image.py

variables:
- group: devopsforai-aml-vg
# Choose from default, build_train_pipeline_with_r.py, or build_train_pipeline_with_r_on_dbricks.py
- name: build-train-script
value: 'build_train_pipeline.py'
# Automatically triggers the train, evaluate, register pipeline after the CI steps.
# Uncomment to set to false or add same variable name at queue time with value of false to disable.
# - name: auto-trigger-training
# value: false


steps:

- template: azdo-base-pipeline.yml

- bash: |
# Invoke the Python building and publishing a training pipeline with Python on ML Compute
python3 $(Build.SourcesDirectory)/ml_service/pipelines/build_train_pipeline.py
failOnStderr: 'false'
env:
SP_APP_SECRET: '$(SP_APP_SECRET)'
displayName: 'Publish Azure Machine Learning Pipeline. Python on ML'
enabled: 'true'

- bash: |
# Invoke the Python building and publishing a training pipeline with R on ML Compute
python3 $(Build.SourcesDirectory)/ml_service/pipelines/build_train_pipeline_with_r.py
failOnStderr: 'false'
env:
SP_APP_SECRET: '$(SP_APP_SECRET)'
displayName: 'Publish Azure Machine Learning Pipeline. R on ML Compute'
enabled: 'false'

- bash: |
# Invoke the Python building and publishing a training pipeline with R on DataBricks
python3 $(Build.SourcesDirectory)/ml_service/pipelines/build_train_pipeline_with_r_on_dbricks.py
failOnStderr: 'false'
env:
SP_APP_SECRET: '$(SP_APP_SECRET)'
displayName: 'Publish Azure Machine Learning Pipeline. R on DataBricks'
enabled: 'false'

- task: CopyFiles@2
displayName: 'Copy Files to: $(Build.ArtifactStagingDirectory)'
inputs:
SourceFolder: '$(Build.SourcesDirectory)'
TargetFolder: '$(Build.ArtifactStagingDirectory)'
Contents: |
ml_service/pipelines/?(run_train_pipeline.py|*.json)
code/scoring/**
- task: PublishBuildArtifacts@1
displayName: 'Publish Artifact'
inputs:
ArtifactName: 'mlops-pipelines'
publishLocation: 'container'
pathtoPublish: '$(Build.ArtifactStagingDirectory)'
TargetPath: '$(Build.ArtifactStagingDirectory)'
stages:
- stage: 'Model_CI'
displayName: 'Model CI'
jobs:
- job: "Model_CI_Pipeline"
displayName: "Model CI Pipeline"
pool:
vmImage: 'ubuntu-latest'
container: mcr.microsoft.com/mlops/python:latest
timeoutInMinutes: 0
steps:
- template: azdo-base-pipeline.yml
- script: |
# Invoke the Python building and publishing a training pipeline
python3 $(Build.SourcesDirectory)/ml_service/pipelines/$(build-train-script)
failOnStderr: 'false'
env:
SP_APP_SECRET: '$(SP_APP_SECRET)'
displayName: 'Publish Azure Machine Learning Pipeline'
- stage: 'Trigger_AML_Pipeline'
displayName: 'Train, evaluate, register model via previously published AML pipeline'
jobs:
- job: "Invoke_Model_Pipeline"
condition: and(succeeded(), eq(coalesce(variables['auto-trigger-training'], 'true'), 'true'))
displayName: "Invoke Model Pipeline and evaluate results to register"
pool:
vmImage: 'ubuntu-latest'
container: mcr.microsoft.com/mlops/python:latest
timeoutInMinutes: 0
steps:
- script: |
python $(Build.SourcesDirectory)/ml_service/pipelines/run_train_pipeline.py
displayName: 'Trigger Training Pipeline'
env:
SP_APP_SECRET: '$(SP_APP_SECRET)'
- task: CopyFiles@2
displayName: 'Copy Files to: $(Build.ArtifactStagingDirectory)'
inputs:
SourceFolder: '$(Build.SourcesDirectory)'
TargetFolder: '$(Build.ArtifactStagingDirectory)'
Contents: |
code/scoring/**
- task: PublishBuildArtifacts@1
displayName: 'Publish Artifact'
inputs:
ArtifactName: 'mlops-pipelines'
publishLocation: 'container'
pathtoPublish: '$(Build.ArtifactStagingDirectory)'
TargetPath: '$(Build.ArtifactStagingDirectory)'
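The `Trigger_AML_Pipeline` stage above is gated by `and(succeeded(), eq(coalesce(variables['auto-trigger-training'], 'true'), 'true'))`, so the stage runs by default but a queue-time variable can switch it off (the "Use Coalesce so override works" commit). Below is a purely illustrative Python sketch of how that expression evaluates; Azure DevOps evaluates the real expression server-side, and pipeline variables are strings:

```python
def should_trigger_training(auto_trigger_training=None, succeeded=True):
    """Sketch of: and(succeeded(), eq(coalesce(variables['auto-trigger-training'], 'true'), 'true')).

    Azure DevOps variable values are strings, so the comparison is against
    the literal 'true'; coalesce falls back to 'true' when the variable is
    unset (modeled here as None).
    """
    value = auto_trigger_training if auto_trigger_training is not None else 'true'
    return succeeded and value == 'true'

print(should_trigger_training())               # variable unset: stage runs -> True
print(should_trigger_training('false'))        # overridden at queue time -> False
print(should_trigger_training('true', False))  # Model_CI stage failed -> False
```

Setting `auto-trigger-training` to `false` at queue time therefore skips the train, evaluate, and register stage without editing the YAML.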
78 changes: 5 additions & 73 deletions docs/getting_started.md
@@ -145,9 +145,7 @@ you can set up the rest of the pipelines necessary for deploying your ML model
to production. These are the pipelines that you will be setting up:

1. **Build pipeline:** triggered on code change to master branch on GitHub,
performs linting, unit testing and publishing a training pipeline.
1. **Release Trigger pipeline:** runs a published training pipeline to train,
evaluate and register a model.
performs linting, unit testing, and publishing of a training pipeline, then runs the published training pipeline to train, evaluate, and register a model.
1. **Release Deployment pipeline:** deploys a model to QA (ACI) and Prod (AKS)
environments.

@@ -169,88 +167,25 @@ and checkout a published training pipeline in the **mlops-AML-WS** workspace in

![training pipeline](./images/training-pipeline.png)

Great, you now have the build pipeline set up which can either be manually
triggered or automatically triggered every time there's a change in the master
branch. The pipeline performs linting, unit testing, and builds and publishes an
Great, you now have the build pipeline set up, which automatically triggers every time there's a change in the master
branch. The pipeline performs linting and unit testing, then builds, publishes, and runs an
**ML Training Pipeline** in an **ML Workspace**.

**Note:** The build pipeline contains disabled steps to build and publish ML
pipelines using R to train a model. Enable these steps if you want to play with
this approach. For the pipeline training a model with R on Databricks you have
this approach by changing the `build-train-script` pipeline variable to either
`build_train_pipeline_with_r.py` or `build_train_pipeline_with_r_on_dbricks.py`.
For the pipeline training a model with R on Databricks you have to manually
create a Databricks cluster and attach it to the ML Workspace as a compute
target (the DB_CLUSTER_ID and DATABRICKS_COMPUTE_NAME variables should be
specified).
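As the note describes, switching to one of the R pipelines is a one-line variable change. A sketch of the variables block in `.pipelines/azdo-ci-build-train.yml` after such a change:

```yaml
variables:
- group: devopsforai-aml-vg
# Choose from build_train_pipeline.py, build_train_pipeline_with_r.py,
# or build_train_pipeline_with_r_on_dbricks.py
- name: build-train-script
  value: 'build_train_pipeline_with_r.py'
```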

### Set up a Release Trigger Pipeline to Train the Model

The next step is to invoke the training pipeline created in the previous step.
It can be done with a **Release Pipeline**. Click on the Pipelines/Releases
menu, and then **New pipeline**, and then click on "Empty Job" on the
"Select a template" window that pops to the right:

![invoke training pipeline](./images/invoke-training-pipeline.png)

Next, click on "Add an artifact". We will select the artifact of this pipeline
to be the result of the build pipeline **ci-build**:

![artifact invoke pipeline](./images/artifact-invoke-pipeline.png)

After that, configure a pipeline to see values from the previously defined
variable group **devopsforai-aml-vg**. Click on the "Variable groups",
and to the right, click on "Link variable group". From there, pick the
**devopsforai-aml-vg** variable group we created in an earlier step, choose
"Release" as a variable group scope, and click on "Link":

![retrain pipeline vg](./images/retrain-pipeline-vg.png)

Rename the default "Stage 1" to **Invoke Training Pipeline** and make sure that
the **Agent Specification** is **ubuntu-16.04** under the Agent Job:

![agent specification](./images/agent-specification.png)

Add a **Command Line Script** step, rename it to **Run Training Pipeline** with the following script:

```bash
docker run -v $(System.DefaultWorkingDirectory)/_ci-build/mlops-pipelines/ml_service/pipelines:/pipelines \
-w=/pipelines -e MODEL_NAME=$MODEL_NAME -e EXPERIMENT_NAME=$EXPERIMENT_NAME \
-e TENANT_ID=$TENANT_ID -e SP_APP_ID=$SP_APP_ID -e SP_APP_SECRET=$(SP_APP_SECRET) \
-e SUBSCRIPTION_ID=$SUBSCRIPTION_ID -e RELEASE_RELEASEID=$RELEASE_RELEASEID \
-e BUILD_BUILDID=$BUILD_BUILDID -e BASE_NAME=$BASE_NAME \
mcr.microsoft.com/mlops/python:latest python run_train_pipeline.py
```

as in the screenshot below, leaving all other fields at their default values:

![Run Training Pipeline Task](./images/run_training_pipeline_task.png)

Now, add the automation to trigger a run of this pipeline whenever the
**ci_build** build is completed. Click on the lightning bolt icon on the top
right while the **\_ci-build** artifact is selected, and enable the automatic
release:

![automate_invoke_training_pipeline](./images/automate_invoke_training_pipeline.png)

This release pipeline should now be automatically triggered
(continuous deployment) whenever a new **ML training pipeline** is published by
the **ci-build builder pipeline**. It can also be triggered manually or
configured to run on a scheduled basis. Create a new release to trigger the
pipeline manually by clicking on the "Create release" button on the top right
of your screen, when selecting this new build pipeline:

![create release](./images/create-release.png)

Leave the fields empty and click on "create". Once the release pipeline is
completed, check out in the **ML Workspace** that the training pipeline is
running:

![running training pipeline](./images/running-training-pipeline.png)

The training pipeline will train, evaluate, and register a new model. Wait until
it is finished and make sure there is a new model in the **ML Workspace**:

![trained model](./images/trained-model.png)

Good! Now we have a trained model.
To disable the automatic trigger of the training pipeline, set the `auto-trigger-training` variable defined in `.pipelines/azdo-ci-build-train.yml` to `false`. The value can also be overridden at queue time when the pipeline is run.
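Concretely, this means uncommenting the variable that the pipeline YAML already carries; a sketch of the resulting entry in the variables block:

```yaml
# Automatically triggers the train, evaluate, register pipeline after the CI steps.
# Set to false (here, or at queue time) to disable.
- name: auto-trigger-training
  value: false
```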

### Set up a Release Deployment Pipeline to Deploy the Model

Expand All @@ -268,9 +203,6 @@ The pipeline consumes two artifacts:
1. the result of the **Build Pipeline** as it contains configuration files
1. the **model** trained and registered by the ML training pipeline

Create a new release pipeline and add the **\_ci-build** artifact using the
same process as what we did in the previous step.

Install the **Azure Machine Learning** extension to your organization from the
[marketplace](https://marketplace.visualstudio.com/items?itemName=ms-air-aiagility.vss-services-azureml),
so that you can set up a service connection to your AML workspace.
Binary file removed docs/images/agent-specification.png
Binary file not shown.
Binary file removed docs/images/artifact-invoke-pipeline.png
Binary file not shown.
Binary file removed docs/images/automate_invoke_training_pipeline.png
Binary file not shown.
Binary file removed docs/images/create-release.png
Binary file not shown.
Binary file removed docs/images/invoke-training-pipeline.png
Binary file not shown.
Binary file removed docs/images/retrain-pipeline-vg.png
Binary file not shown.
Binary file removed docs/images/run_training_pipeline_task.png
Binary file not shown.
21 changes: 0 additions & 21 deletions ml_service/pipelines/build_train_pipeline.py
@@ -22,7 +22,6 @@ def main():
sources_directory_train = os.environ.get("SOURCES_DIR_TRAIN")
train_script_path = os.environ.get("TRAIN_SCRIPT_PATH")
evaluate_script_path = os.environ.get("EVALUATE_SCRIPT_PATH")
# register_script_path = os.environ.get("REGISTER_SCRIPT_PATH")
vm_size = os.environ.get("AML_COMPUTE_CLUSTER_CPU_SKU")
compute_name = os.environ.get("AML_COMPUTE_CLUSTER_NAME")
model_name = os.environ.get("MODEL_NAME")
@@ -90,27 +89,7 @@ def main():
)
print("Step Evaluate created")

# Currently, the Evaluate step will automatically register
# the model if it performs better. This step is based on a
# previous version of the repo which utilized JSON files to
# track evaluation results.

# register_model_step = PythonScriptStep(
# name="Register New Trained Model",
# script_name=register_script_path,
# compute_target=aml_compute,
# source_directory=sources_directory_train,
# arguments=[
# "--release_id", release_id,
# "--model_name", model_name,
# ],
# runconfig=run_config,
# allow_reuse=False,
# )
# print("Step register model created")

evaluate_step.run_after(train_step)
# register_model_step.run_after(evaluate_step)
steps = [evaluate_step]

train_pipeline = Pipeline(workspace=aml_workspace, steps=steps)
3 changes: 1 addition & 2 deletions ml_service/pipelines/run_train_pipeline.py
@@ -15,7 +15,6 @@ def main():
model_name = os.environ.get("MODEL_NAME")
app_id = os.environ.get('SP_APP_ID')
app_secret = os.environ.get('SP_APP_SECRET')
release_id = os.environ.get('RELEASE_RELEASEID')
build_id = os.environ.get('BUILD_BUILDID')

service_principal = ServicePrincipalAuthentication(
@@ -47,7 +46,7 @@ def main():
else:
published_pipeline = matched_pipes[0]

pipeline_parameters = {"model_name": model_name, "release_id": release_id}
pipeline_parameters = {"model_name": model_name}

response = published_pipeline.submit(
aml_workspace,
