diff --git a/.pipelines/azdo-ci-build-train.yml b/.pipelines/azdo-ci-build-train.yml index efecb7f8..c2453d4d 100644 --- a/.pipelines/azdo-ci-build-train.yml +++ b/.pipelines/azdo-ci-build-train.yml @@ -3,62 +3,69 @@ trigger: branches: include: - master - -pool: - vmImage: 'ubuntu-latest' - -container: mcr.microsoft.com/mlops/python:latest - + paths: + exclude: + - docs/ + - environment_setup/ + - charts/ + - ml_service/util/create_scoring_image.py variables: - group: devopsforai-aml-vg +# Choose from default, build_train_pipeline_with_r.py, or build_train_pipeline_with_r_on_dbricks.py +- name: build-train-script + value: 'build_train_pipeline.py' +# Automatically triggers the train, evaluate, register pipeline after the CI steps. +# Uncomment to set to false or add same variable name at queue time with value of false to disable. +# - name: auto-trigger-training +# value: false - -steps: - -- template: azdo-base-pipeline.yml - -- bash: | - # Invoke the Python building and publishing a training pipeline with Python on ML Compute - python3 $(Build.SourcesDirectory)/ml_service/pipelines/build_train_pipeline.py - failOnStderr: 'false' - env: - SP_APP_SECRET: '$(SP_APP_SECRET)' - displayName: 'Publish Azure Machine Learning Pipeline. Python on ML' - enabled: 'true' - -- bash: | - # Invoke the Python building and publishing a training pipeline with R on ML Compute - python3 $(Build.SourcesDirectory)/ml_service/pipelines/build_train_pipeline_with_r.py - failOnStderr: 'false' - env: - SP_APP_SECRET: '$(SP_APP_SECRET)' - displayName: 'Publish Azure Machine Learning Pipeline. R on ML Compute' - enabled: 'false' - -- bash: | - # Invoke the Python building and publishing a training pipeline with R on DataBricks - python3 $(Build.SourcesDirectory)/ml_service/pipelines/build_train_pipeline_with_r_on_dbricks.py - failOnStderr: 'false' - env: - SP_APP_SECRET: '$(SP_APP_SECRET)' - displayName: 'Publish Azure Machine Learning Pipeline. 
R on DataBricks' - enabled: 'false' - -- task: CopyFiles@2 - displayName: 'Copy Files to: $(Build.ArtifactStagingDirectory)' - inputs: - SourceFolder: '$(Build.SourcesDirectory)' - TargetFolder: '$(Build.ArtifactStagingDirectory)' - Contents: | - ml_service/pipelines/?(run_train_pipeline.py|*.json) - code/scoring/** - - -- task: PublishBuildArtifacts@1 - displayName: 'Publish Artifact' - inputs: - ArtifactName: 'mlops-pipelines' - publishLocation: 'container' - pathtoPublish: '$(Build.ArtifactStagingDirectory)' - TargetPath: '$(Build.ArtifactStagingDirectory)' \ No newline at end of file +stages: +- stage: 'Model_CI' + displayName: 'Model CI' + jobs: + - job: "Model_CI_Pipeline" + displayName: "Model CI Pipeline" + pool: + vmImage: 'ubuntu-latest' + container: mcr.microsoft.com/mlops/python:latest + timeoutInMinutes: 0 + steps: + - template: azdo-base-pipeline.yml + - script: | + # Invoke the Python building and publishing a training pipeline + python3 $(Build.SourcesDirectory)/ml_service/pipelines/$(build-train-script) + failOnStderr: 'false' + env: + SP_APP_SECRET: '$(SP_APP_SECRET)' + displayName: 'Publish Azure Machine Learning Pipeline' +- stage: 'Trigger_AML_Pipeline' + displayName: 'Train, evaluate, register model via previously published AML pipeline' + jobs: + - job: "Invoke_Model_Pipeline" + condition: and(succeeded(), eq(coalesce(variables['auto-trigger-training'], 'true'), 'true')) + displayName: "Invoke Model Pipeline and evaluate results to register" + pool: + vmImage: 'ubuntu-latest' + container: mcr.microsoft.com/mlops/python:latest + timeoutInMinutes: 0 + steps: + - script: | + python $(Build.SourcesDirectory)/ml_service/pipelines/run_train_pipeline.py + displayName: 'Trigger Training Pipeline' + env: + SP_APP_SECRET: '$(SP_APP_SECRET)' + - task: CopyFiles@2 + displayName: 'Copy Files to: $(Build.ArtifactStagingDirectory)' + inputs: + SourceFolder: '$(Build.SourcesDirectory)' + TargetFolder: '$(Build.ArtifactStagingDirectory)' + Contents: | + 
code/scoring/** + - task: PublishBuildArtifacts@1 + displayName: 'Publish Artifact' + inputs: + ArtifactName: 'mlops-pipelines' + publishLocation: 'container' + pathtoPublish: '$(Build.ArtifactStagingDirectory)' + TargetPath: '$(Build.ArtifactStagingDirectory)' \ No newline at end of file diff --git a/docs/getting_started.md b/docs/getting_started.md index a8d10a3c..d39be2e8 100644 --- a/docs/getting_started.md +++ b/docs/getting_started.md @@ -145,9 +145,7 @@ you can set up the rest of the pipelines necessary for deploying your ML model to production. These are the pipelines that you will be setting up: 1. **Build pipeline:** triggered on code change to master branch on GitHub, -performs linting, unit testing and publishing a training pipeline. -1. **Release Trigger pipeline:** runs a published training pipeline to train, -evaluate and register a model. +performs linting, unit testing, publishing a training pipeline, and runs the published training pipeline to train, evaluate, and register a model. 1. **Release Deployment pipeline:** deploys a model to QA (ACI) and Prod (AKS) environments. @@ -169,80 +167,17 @@ and checkout a published training pipeline in the **mlops-AML-WS** workspace in ![training pipeline](./images/training-pipeline.png) -Great, you now have the build pipeline set up which can either be manually -triggered or automatically triggered every time there's a change in the master -branch. The pipeline performs linting, unit testing, and builds and publishes an +Great, you now have the build pipeline set up which automatically triggers every time there's a change in the master +branch. The pipeline performs linting, unit testing, builds, publishes, and executes an **ML Training Pipeline** in a **ML Workspace**. **Note:** The build pipeline contains disabled steps to build and publish ML pipelines using R to train a model. Enable these steps if you want to play with -this approach. 
For the pipeline training a model with R on Databricks you have +this approach by changing the `build-train-script` pipeline variable to either `build_train_pipeline_with_r.py`, or `build_train_pipeline_with_r_on_dbricks.py`. For the pipeline training a model with R on Databricks you have to manually create a Databricks cluster and attach it to the ML Workspace as a compute (Values DB_CLUSTER_ID and DATABRICKS_COMPUTE_NAME variables shoud be specified). -### Set up a Release Trigger Pipeline to Train the Model - -The next step is to invoke the training pipeline created in the previous step. -It can be done with a **Release Pipeline**. Click on the Pipelines/Releases -menu, and then **New pipeline**, and then click on "Empty Job" on the -"Select a template" window that pops to the right: - -![invoke training pipeline](./images/invoke-training-pipeline.png) - -Next, click on "Add an artifact". We will select the artifact of this pipeline -to be the result of the build pipeline **ci-build**: - -![artifact invoke pipeline](./images/artifact-invoke-pipeline.png) - -After that, configure a pipeline to see values from the previously defined -variable group **devopsforai-aml-vg**. Click on the "Variable groups", -and to the right, click on "Link variable group". 
From there, pick the -**devopsforai-aml-vg** variable group we created in an earlier step, choose -"Release" as a variable group scope, and click on "Link": - -![retrain pipeline vg](./images/retrain-pipeline-vg.png) - -Rename the default "Stage 1" to **Invoke Training Pipeline** and make sure that -the **Agent Specification** is **ubuntu-16.04** under the Agent Job: - -![agent specification](./images/agent-specification.png) - -Add a **Command Line Script** step, rename it to **Run Training Pipeline** with the following script: - -```bash -docker run -v $(System.DefaultWorkingDirectory)/_ci-build/mlops-pipelines/ml_service/pipelines:/pipelines \ - -w=/pipelines -e MODEL_NAME=$MODEL_NAME -e EXPERIMENT_NAME=$EXPERIMENT_NAME \ - -e TENANT_ID=$TENANT_ID -e SP_APP_ID=$SP_APP_ID -e SP_APP_SECRET=$(SP_APP_SECRET) \ - -e SUBSCRIPTION_ID=$SUBSCRIPTION_ID -e RELEASE_RELEASEID=$RELEASE_RELEASEID \ - -e BUILD_BUILDID=$BUILD_BUILDID -e BASE_NAME=$BASE_NAME \ -mcr.microsoft.com/mlops/python:latest python run_train_pipeline.py -``` - -as in the screen shot below, leaving all other fields to their default value: - -![Run Training Pipeline Task](./images/run_training_pipeline_task.png) - -Now, add the automation to trigger a run of this pipeline whenever the -**ci_build** build is completed, click on the lightning bolt icon on the top -right of the **\_ci-build** artifact is selected, and enable the automatic -release: - -![automate_invoke_training_pipeline](./images/automate_invoke_training_pipeline.png) - -This release pipeline should now be automatically triggered -(continuous deployment) whenever a new **ML training pipeline** is published by -the **ci-build builder pipeline**. It can also be triggered manually or -configured to run on a scheduled basis. 
Create a new release to trigger the -pipeline manually by clicking on the "Create release" button on the top right -of your screen, when selecting this new build pipeline: - -![create release](./images/create-release.png) - -Leave the fields empty and click on "create". Once the release pipeline is -completed, check out in the **ML Workspace** that the training pipeline is -running: - ![running training pipeline](./images/running-training-pipeline.png) The training pipeline will train, evaluate, and register a new model. Wait until @@ -250,7 +185,7 @@ it is finished and make sure there is a new model in the **ML Workspace**: ![trained model](./images/trained-model.png) -Good! Now we have a trained model. +To disable the automatic trigger of the training pipeline, change the `auto-trigger-training` variable as listed in the `.pipelines/azdo-ci-build-train.yml` pipeline to `false`. This can also be overridden at runtime execution of the pipeline. ### Set up a Release Deployment Pipeline to Deploy the Model @@ -268,9 +203,6 @@ The pipeline consumes two artifacts: 1. the result of the **Build Pipeline** as it contains configuration files 1. the **model** trained and registered by the ML training pipeline -Create a new release pipeline and add the **\_ci-build** artifact using the -same process as what we did in the previous step. - Install the **Azure Machine Learning** extension to your organization from the [marketplace](https://marketplace.visualstudio.com/items?itemName=ms-air-aiagility.vss-services-azureml), so that you can set up a service connection to your AML workspace. 
diff --git a/docs/images/agent-specification.png b/docs/images/agent-specification.png deleted file mode 100644 index c71c3b68..00000000 Binary files a/docs/images/agent-specification.png and /dev/null differ diff --git a/docs/images/artifact-invoke-pipeline.png b/docs/images/artifact-invoke-pipeline.png deleted file mode 100644 index 2a6dcebf..00000000 Binary files a/docs/images/artifact-invoke-pipeline.png and /dev/null differ diff --git a/docs/images/automate_invoke_training_pipeline.png b/docs/images/automate_invoke_training_pipeline.png deleted file mode 100644 index 875d1410..00000000 Binary files a/docs/images/automate_invoke_training_pipeline.png and /dev/null differ diff --git a/docs/images/create-release.png b/docs/images/create-release.png deleted file mode 100644 index 15069b5d..00000000 Binary files a/docs/images/create-release.png and /dev/null differ diff --git a/docs/images/invoke-training-pipeline.png b/docs/images/invoke-training-pipeline.png deleted file mode 100644 index 21619ae3..00000000 Binary files a/docs/images/invoke-training-pipeline.png and /dev/null differ diff --git a/docs/images/retrain-pipeline-vg.png b/docs/images/retrain-pipeline-vg.png deleted file mode 100644 index 4aa30e9f..00000000 Binary files a/docs/images/retrain-pipeline-vg.png and /dev/null differ diff --git a/docs/images/run_training_pipeline_task.png b/docs/images/run_training_pipeline_task.png deleted file mode 100644 index e0455807..00000000 Binary files a/docs/images/run_training_pipeline_task.png and /dev/null differ diff --git a/ml_service/pipelines/build_train_pipeline.py b/ml_service/pipelines/build_train_pipeline.py index cd65ff83..481c68e5 100644 --- a/ml_service/pipelines/build_train_pipeline.py +++ b/ml_service/pipelines/build_train_pipeline.py @@ -22,7 +22,6 @@ def main(): sources_directory_train = os.environ.get("SOURCES_DIR_TRAIN") train_script_path = os.environ.get("TRAIN_SCRIPT_PATH") evaluate_script_path = os.environ.get("EVALUATE_SCRIPT_PATH") - # 
register_script_path = os.environ.get("REGISTER_SCRIPT_PATH") vm_size = os.environ.get("AML_COMPUTE_CLUSTER_CPU_SKU") compute_name = os.environ.get("AML_COMPUTE_CLUSTER_NAME") model_name = os.environ.get("MODEL_NAME") @@ -90,27 +89,7 @@ def main(): ) print("Step Evaluate created") - # Currently, the Evaluate step will automatically register - # the model if it performs better. This step is based on a - # previous version of the repo which utilized JSON files to - # track evaluation results. - - # register_model_step = PythonScriptStep( - # name="Register New Trained Model", - # script_name=register_script_path, - # compute_target=aml_compute, - # source_directory=sources_directory_train, - # arguments=[ - # "--release_id", release_id, - # "--model_name", model_name, - # ], - # runconfig=run_config, - # allow_reuse=False, - # ) - # print("Step register model created") - evaluate_step.run_after(train_step) - # register_model_step.run_after(evaluate_step) steps = [evaluate_step] train_pipeline = Pipeline(workspace=aml_workspace, steps=steps) diff --git a/ml_service/pipelines/run_train_pipeline.py b/ml_service/pipelines/run_train_pipeline.py index 11252a88..1d942a8c 100644 --- a/ml_service/pipelines/run_train_pipeline.py +++ b/ml_service/pipelines/run_train_pipeline.py @@ -15,7 +15,6 @@ def main(): model_name = os.environ.get("MODEL_NAME") app_id = os.environ.get('SP_APP_ID') app_secret = os.environ.get('SP_APP_SECRET') - release_id = os.environ.get('RELEASE_RELEASEID') build_id = os.environ.get('BUILD_BUILDID') service_principal = ServicePrincipalAuthentication( @@ -47,7 +46,7 @@ def main(): else: published_pipeline = matched_pipes[0] - pipeline_parameters = {"model_name": model_name, "release_id": release_id} + pipeline_parameters = {"model_name": model_name} response = published_pipeline.submit( aml_workspace,