Skip to content

Commit

Permalink
Switch CI to use new AzureML agentless execution task (#101)
Browse files Browse the repository at this point in the history
  • Loading branch information
dtzar authored and eedorenko committed Nov 21, 2019
1 parent 1eb4a42 commit 969e6ca
Show file tree
Hide file tree
Showing 15 changed files with 427 additions and 223 deletions.
8 changes: 4 additions & 4 deletions .env.example
Original file line number Diff line number Diff line change
Expand Up @@ -7,13 +7,13 @@ SP_APP_ID = ''
SP_APP_SECRET = ''
RESOUCE_GROUP = 'mlops-rg'

# Mock build/release ID for local testing - update ReleaseID each "release"
# Mock build/release ID for local testing
BUILD_BUILDID = '001'
RELEASE_RELEASEID = '001'

# Azure ML Workspace Variables
WORKSPACE_NAME = ''
WORKSPACE_NAME = 'aml-workspace'
EXPERIMENT_NAME = ''
SCRIPT_FOLDER = './'

# AML Compute Cluster Config
AML_COMPUTE_CLUSTER_NAME = 'train-cluster'
Expand All @@ -36,4 +36,4 @@ SOURCES_DIR_TRAIN = 'code'
DB_CLUSTER_ID = ''

# Optional. Container Image name for image creation
IMAGE_NAME = 'ml-trained'
IMAGE_NAME = 'mltrained'
40 changes: 35 additions & 5 deletions .pipelines/azdo-ci-build-train.yml
Original file line number Diff line number Diff line change
Expand Up @@ -21,7 +21,7 @@ stages:
jobs:
- job: "Model_CI_Pipeline"
displayName: "Model CI Pipeline"
pool:
pool:
vmImage: 'ubuntu-latest'
container: mcr.microsoft.com/mlops/python:latest
timeoutInMinutes: 0
Expand All @@ -37,17 +37,47 @@ stages:
- stage: 'Trigger_AML_Pipeline'
displayName: 'Train, evaluate, register model via previously published AML pipeline'
jobs:
- job: "Invoke_Model_Pipeline"
- job: "Get_Pipeline_ID"
condition: and(succeeded(), eq(coalesce(variables['auto-trigger-training'], 'true'), 'true'))
displayName: "Invoke Model Pipeline and evaluate results to register"
pool:
displayName: "Get Pipeline ID for execution"
pool:
vmImage: 'ubuntu-latest'
container: mcr.microsoft.com/mlops/python:latest
timeoutInMinutes: 0
steps:
- script: |
python $(Build.SourcesDirectory)/ml_service/pipelines/run_train_pipeline.py
displayName: 'Trigger Training Pipeline'
source $(Build.SourcesDirectory)/tmp.sh
echo "##vso[task.setvariable variable=AMLPIPELINEID;isOutput=true]$AMLPIPELINE_ID"
name: 'getpipelineid'
displayName: 'Get Pipeline ID'
env:
SP_APP_SECRET: '$(SP_APP_SECRET)'
- job: "Run_ML_Pipeline"
dependsOn: "Get_Pipeline_ID"
displayName: "Trigger ML Training Pipeline"
pool: server
variables:
AMLPIPELINE_ID: $[ dependencies.Get_Pipeline_ID.outputs['getpipelineid.AMLPIPELINEID'] ]
steps:
- task: ms-air-aiagility.vss-services-azureml.azureml-restApi-task.MLPublishedPipelineRestAPITask@0
displayName: 'Invoke ML pipeline'
inputs:
azureSubscription: '$(WORKSPACE_SVC_CONNECTION)'
PipelineId: '$(AMLPIPELINE_ID)'
ExperimentName: '$(EXPERIMENT_NAME)'
PipelineParameters: '"model_name": "sklearn_regression_model.pkl"'
- job: "Training_Run_Report"
dependsOn: "Run_ML_Pipeline"
displayName: "Determine if evaluation succeeded and new model is registered"
pool:
vmImage: 'ubuntu-latest'
container: mcr.microsoft.com/mlops/python:latest
timeoutInMinutes: 0
steps:
- script: |
python $(Build.SourcesDirectory)/code/register/register_model.py --build_id $(Build.BuildId) --validate True
displayName: 'Check if new model registered'
env:
SP_APP_SECRET: '$(SP_APP_SECRET)'
- task: CopyFiles@2
Expand Down
4 changes: 2 additions & 2 deletions .pipelines/azdo-variables.yml
Original file line number Diff line number Diff line change
Expand Up @@ -24,7 +24,7 @@ variables:
value: '1'
# AML Pipeline Config
- name: TRAINING_PIPELINE_NAME
value: 'Training Pipeline'
value: 'Training-Pipeline'
- name: MODEL_PATH
value: ''
- name: EVALUATE_SCRIPT_PATH
Expand All @@ -34,7 +34,7 @@ variables:
- name: SOURCES_DIR_TRAIN
value: code
- name: IMAGE_NAME
value: ''
value: 'mltrained'
# Optional. Used by a training pipeline with R on Databricks
- name: DB_CLUSTER_ID
value: ''
2 changes: 1 addition & 1 deletion README.md
Original file line number Diff line number Diff line change
Expand Up @@ -12,7 +12,7 @@ description: "Code which demonstrates how to set up and operationalize an MLOps
# MLOps with Azure ML


[![Build Status](https://dev.azure.com/customai/DevopsForAI-AML/_apis/build/status/Build%20%26%20Train?branchName=master)](https://dev.azure.com/customai/DevopsForAI-AML/_build/latest?definitionId=34&branchName=master)
[![Build Status](https://aidemos.visualstudio.com/MLOps/_apis/build/status/microsoft.MLOpsPython-CI?branchName=master)](https://aidemos.visualstudio.com/MLOps/_build/latest?definitionId=127&branchName=master)


MLOps will help you to understand how to build the Continuous Integration and Continuous Delivery pipeline for an ML/AI project. We will be using the Azure DevOps Project for build and release/deployment pipelines along with Azure ML services for model retraining pipeline, model management and operationalization.
Expand Down
151 changes: 90 additions & 61 deletions code/evaluate/evaluate_model.py
Original file line number Diff line number Diff line change
Expand Up @@ -24,90 +24,119 @@
POSSIBILITY OF SUCH DAMAGE.
"""
import os
from azureml.core import Model, Run
from azureml.core import Model, Run, Workspace, Experiment
import argparse
from azureml.core.authentication import ServicePrincipalAuthentication
import traceback


# Get workspace
run = Run.get_context()
exp = run.experiment
ws = run.experiment.workspace
if (run.id.startswith('OfflineRun')):
from dotenv import load_dotenv
# For local development, set values in this section
load_dotenv()
workspace_name = os.environ.get("WORKSPACE_NAME")
experiment_name = os.environ.get("EXPERIMENT_NAME")
resource_group = os.environ.get("RESOURCE_GROUP")
subscription_id = os.environ.get("SUBSCRIPTION_ID")
tenant_id = os.environ.get("TENANT_ID")
model_name = os.environ.get("MODEL_NAME")
app_id = os.environ.get('SP_APP_ID')
app_secret = os.environ.get('SP_APP_SECRET')
build_id = os.environ.get('BUILD_BUILDID')
service_principal = ServicePrincipalAuthentication(
tenant_id=tenant_id,
service_principal_id=app_id,
service_principal_password=app_secret)

aml_workspace = Workspace.get(
name=workspace_name,
subscription_id=subscription_id,
resource_group=resource_group,
auth=service_principal
)
ws = aml_workspace
exp = Experiment(ws, experiment_name)
run_id = "e78b2c27-5ceb-49d9-8e84-abe7aecf37d5"
else:
exp = run.experiment
ws = run.experiment.workspace
run_id = 'amlcompute'

parser = argparse.ArgumentParser("evaluate")
parser.add_argument(
"--release_id",
"--build_id",
type=str,
help="The Build ID of the build triggering this pipeline run",
)
parser.add_argument(
"--run_id",
type=str,
help="The ID of the release triggering this pipeline run",
help="Training run ID",
)
parser.add_argument(
"--model_name",
type=str,
help="Name of the Model",
default="sklearn_regression_model.pkl",
)
args = parser.parse_args()

print("Argument 1: %s" % args.release_id)
print("Argument 2: %s" % args.model_name)
args = parser.parse_args()
if (args.build_id is not None):
build_id = args.build_id
if (args.run_id is not None):
run_id = args.run_id
if (run_id == 'amlcompute'):
run_id = run.parent.id
model_name = args.model_name
release_id = args.release_id
metric_eval = "mse"
run.tag("BuildId", value=build_id)

# Paramaterize the matrics on which the models should be compared
# Parameterize the metrics on which the models should be compared
# Add golden data set on which all the model performance can be evaluated

all_runs = exp.get_runs(
properties={"release_id": release_id, "run_type": "train"},
include_children=True
)
new_model_run = next(all_runs)
new_model_run_id = new_model_run.id
print(f'New Run found with Run ID of: {new_model_run_id}')

try:
# Get most recently registered model, we assume that
# is the model in production.
# Download this model and compare it with the recently
# trained model by running test with same data set.
model_list = Model.list(ws)
production_model = next(
filter(
lambda x: x.created_time == max(
model.created_time for model in model_list),
model_list,
if (len(model_list) > 0):
production_model = next(
filter(
lambda x: x.created_time == max(
model.created_time for model in model_list),
model_list,
)
)
)
production_model_run_id = production_model.tags.get("run_id")
run_list = exp.get_runs()
production_model_run_id = production_model.run_id

# Get the run history for both production model and
# newly trained model and compare mse
production_model_run = Run(exp, run_id=production_model_run_id)
new_model_run = Run(exp, run_id=new_model_run_id)
# Get the run history for both production model and
# newly trained model and compare mse
production_model_run = Run(exp, run_id=production_model_run_id)
new_model_run = run.parent
print("Production model run is", production_model_run)

production_model_mse = production_model_run.get_metrics().get("mse")
new_model_mse = new_model_run.get_metrics().get("mse")
print(
"Current Production model mse: {}, New trained model mse: {}".format(
production_model_mse, new_model_mse
)
)
production_model_mse = \
production_model_run.get_metrics().get(metric_eval)
new_model_mse = new_model_run.get_metrics().get(metric_eval)
if (production_model_mse is None or new_model_mse is None):
print("Unable to find", metric_eval, "metrics, "
"exiting evaluation")
run.parent.cancel()
else:
print(
"Current Production model mse: {}, "
"New trained model mse: {}".format(
production_model_mse, new_model_mse
)
)

promote_new_model = False
if new_model_mse < production_model_mse:
promote_new_model = True
print("New trained model performs better, thus it will be registered")
if (new_model_mse < production_model_mse):
print("New trained model performs better, "
"thus it should be registered")
else:
print("New trained model metric is less than or equal to "
"production model so skipping model registration.")
run.parent.cancel()
else:
print("This is the first model, "
"thus it should be registered")
except Exception:
promote_new_model = True
print("This is the first model to be trained, \
thus nothing to evaluate for now")


# Writing the run id to /aml_config/run_id.json
if promote_new_model:
model_path = os.path.join('outputs', model_name)
new_model_run.register_model(
model_name=model_name,
model_path=model_path,
properties={"release_id": release_id})
print("Registered new model!")
traceback.print_exc(limit=None, file=None, chain=True)
print("Something went wrong trying to evaluate. Exiting.")
raise
Loading

0 comments on commit 969e6ca

Please sign in to comment.