Skip to content

Commit

Permalink
Switch CI to use new AzureML agentless execution task (#101)
Browse files Browse the repository at this point in the history
  • Loading branch information
dtzar authored and eedorenko committed Nov 21, 2019
1 parent 1eb4a42 commit 969e6ca
Show file tree
Hide file tree
Showing 15 changed files with 427 additions and 223 deletions.
8 changes: 4 additions & 4 deletions .env.example
Original file line number Diff line number Diff line change
Expand Up @@ -7,13 +7,13 @@ SP_APP_ID = ''
SP_APP_SECRET = ''
RESOUCE_GROUP = 'mlops-rg'

# Mock build/release ID for local testing - update ReleaseID each "release"
# Mock build/release ID for local testing
BUILD_BUILDID = '001'
RELEASE_RELEASEID = '001'

# Azure ML Workspace Variables
WORKSPACE_NAME = ''
WORKSPACE_NAME = 'aml-workspace'
EXPERIMENT_NAME = ''
SCRIPT_FOLDER = './'

# AML Compute Cluster Config
AML_COMPUTE_CLUSTER_NAME = 'train-cluster'
Expand All @@ -36,4 +36,4 @@ SOURCES_DIR_TRAIN = 'code'
DB_CLUSTER_ID = ''

# Optional. Container Image name for image creation
IMAGE_NAME = 'ml-trained'
IMAGE_NAME = 'mltrained'
40 changes: 35 additions & 5 deletions .pipelines/azdo-ci-build-train.yml
Original file line number Diff line number Diff line change
Expand Up @@ -21,7 +21,7 @@ stages:
jobs:
- job: "Model_CI_Pipeline"
displayName: "Model CI Pipeline"
pool:
pool:
vmImage: 'ubuntu-latest'
container: mcr.microsoft.com/mlops/python:latest
timeoutInMinutes: 0
Expand All @@ -37,17 +37,47 @@ stages:
- stage: 'Trigger_AML_Pipeline'
displayName: 'Train, evaluate, register model via previously published AML pipeline'
jobs:
- job: "Invoke_Model_Pipeline"
- job: "Get_Pipeline_ID"
condition: and(succeeded(), eq(coalesce(variables['auto-trigger-training'], 'true'), 'true'))
displayName: "Invoke Model Pipeline and evaluate results to register"
pool:
displayName: "Get Pipeline ID for execution"
pool:
vmImage: 'ubuntu-latest'
container: mcr.microsoft.com/mlops/python:latest
timeoutInMinutes: 0
steps:
- script: |
python $(Build.SourcesDirectory)/ml_service/pipelines/run_train_pipeline.py
displayName: 'Trigger Training Pipeline'
source $(Build.SourcesDirectory)/tmp.sh
echo "##vso[task.setvariable variable=AMLPIPELINEID;isOutput=true]$AMLPIPELINE_ID"
name: 'getpipelineid'
displayName: 'Get Pipeline ID'
env:
SP_APP_SECRET: '$(SP_APP_SECRET)'
- job: "Run_ML_Pipeline"
dependsOn: "Get_Pipeline_ID"
displayName: "Trigger ML Training Pipeline"
pool: server
variables:
AMLPIPELINE_ID: $[ dependencies.Get_Pipeline_ID.outputs['getpipelineid.AMLPIPELINEID'] ]
steps:
- task: ms-air-aiagility.vss-services-azureml.azureml-restApi-task.MLPublishedPipelineRestAPITask@0
displayName: 'Invoke ML pipeline'
inputs:
azureSubscription: '$(WORKSPACE_SVC_CONNECTION)'
PipelineId: '$(AMLPIPELINE_ID)'
ExperimentName: '$(EXPERIMENT_NAME)'
PipelineParameters: '"model_name": "sklearn_regression_model.pkl"'
- job: "Training_Run_Report"
dependsOn: "Run_ML_Pipeline"
displayName: "Determine if evaluation succeeded and new model is registered"
pool:
vmImage: 'ubuntu-latest'
container: mcr.microsoft.com/mlops/python:latest
timeoutInMinutes: 0
steps:
- script: |
python $(Build.SourcesDirectory)/code/register/register_model.py --build_id $(Build.BuildId) --validate True
displayName: 'Check if new model registered'
env:
SP_APP_SECRET: '$(SP_APP_SECRET)'
- task: CopyFiles@2
Expand Down
4 changes: 2 additions & 2 deletions .pipelines/azdo-variables.yml
Original file line number Diff line number Diff line change
Expand Up @@ -24,7 +24,7 @@ variables:
value: '1'
# AML Pipeline Config
- name: TRAINING_PIPELINE_NAME
value: 'Training Pipeline'
value: 'Training-Pipeline'
- name: MODEL_PATH
value: ''
- name: EVALUATE_SCRIPT_PATH
Expand All @@ -34,7 +34,7 @@ variables:
- name: SOURCES_DIR_TRAIN
value: code
- name: IMAGE_NAME
value: ''
value: 'mltrained'
# Optional. Used by a training pipeline with R on Databricks
- name: DB_CLUSTER_ID
value: ''
2 changes: 1 addition & 1 deletion README.md
Original file line number Diff line number Diff line change
Expand Up @@ -12,7 +12,7 @@ description: "Code which demonstrates how to set up and operationalize an MLOps
# MLOps with Azure ML


[![Build Status](https://dev.azure.com/customai/DevopsForAI-AML/_apis/build/status/Build%20%26%20Train?branchName=master)](https://dev.azure.com/customai/DevopsForAI-AML/_build/latest?definitionId=34&branchName=master)
[![Build Status](https://aidemos.visualstudio.com/MLOps/_apis/build/status/microsoft.MLOpsPython-CI?branchName=master)](https://aidemos.visualstudio.com/MLOps/_build/latest?definitionId=127&branchName=master)


MLOps will help you to understand how to build the Continuous Integration and Continuous Delivery pipeline for an ML/AI project. We will be using the Azure DevOps Project for build and release/deployment pipelines along with Azure ML services for model retraining pipeline, model management and operationalization.
Expand Down
151 changes: 90 additions & 61 deletions code/evaluate/evaluate_model.py
Original file line number Diff line number Diff line change
Expand Up @@ -24,90 +24,119 @@
POSSIBILITY OF SUCH DAMAGE.
"""
import os
from azureml.core import Model, Run
from azureml.core import Model, Run, Workspace, Experiment
import argparse
from azureml.core.authentication import ServicePrincipalAuthentication
import traceback


# Get workspace
run = Run.get_context()
exp = run.experiment
ws = run.experiment.workspace
if (run.id.startswith('OfflineRun')):
from dotenv import load_dotenv
# For local development, set values in this section
load_dotenv()
workspace_name = os.environ.get("WORKSPACE_NAME")
experiment_name = os.environ.get("EXPERIMENT_NAME")
resource_group = os.environ.get("RESOURCE_GROUP")
subscription_id = os.environ.get("SUBSCRIPTION_ID")
tenant_id = os.environ.get("TENANT_ID")
model_name = os.environ.get("MODEL_NAME")
app_id = os.environ.get('SP_APP_ID')
app_secret = os.environ.get('SP_APP_SECRET')
build_id = os.environ.get('BUILD_BUILDID')
service_principal = ServicePrincipalAuthentication(
tenant_id=tenant_id,
service_principal_id=app_id,
service_principal_password=app_secret)

aml_workspace = Workspace.get(
name=workspace_name,
subscription_id=subscription_id,
resource_group=resource_group,
auth=service_principal
)
ws = aml_workspace
exp = Experiment(ws, experiment_name)
run_id = "e78b2c27-5ceb-49d9-8e84-abe7aecf37d5"
else:
exp = run.experiment
ws = run.experiment.workspace
run_id = 'amlcompute'

parser = argparse.ArgumentParser("evaluate")
parser.add_argument(
"--release_id",
"--build_id",
type=str,
help="The Build ID of the build triggering this pipeline run",
)
parser.add_argument(
"--run_id",
type=str,
help="The ID of the release triggering this pipeline run",
help="Training run ID",
)
parser.add_argument(
"--model_name",
type=str,
help="Name of the Model",
default="sklearn_regression_model.pkl",
)
args = parser.parse_args()

print("Argument 1: %s" % args.release_id)
print("Argument 2: %s" % args.model_name)
args = parser.parse_args()
if (args.build_id is not None):
build_id = args.build_id
if (args.run_id is not None):
run_id = args.run_id
if (run_id == 'amlcompute'):
run_id = run.parent.id
model_name = args.model_name
release_id = args.release_id
metric_eval = "mse"
run.tag("BuildId", value=build_id)

# Paramaterize the matrics on which the models should be compared
# Parameterize the metrics on which the models should be compared
# Add golden data set on which all the model performance can be evaluated

all_runs = exp.get_runs(
properties={"release_id": release_id, "run_type": "train"},
include_children=True
)
new_model_run = next(all_runs)
new_model_run_id = new_model_run.id
print(f'New Run found with Run ID of: {new_model_run_id}')

try:
# Get most recently registered model, we assume that
# is the model in production.
# Download this model and compare it with the recently
# trained model by running test with same data set.
model_list = Model.list(ws)
production_model = next(
filter(
lambda x: x.created_time == max(
model.created_time for model in model_list),
model_list,
if (len(model_list) > 0):
production_model = next(
filter(
lambda x: x.created_time == max(
model.created_time for model in model_list),
model_list,
)
)
)
production_model_run_id = production_model.tags.get("run_id")
run_list = exp.get_runs()
production_model_run_id = production_model.run_id

# Get the run history for both production model and
# newly trained model and compare mse
production_model_run = Run(exp, run_id=production_model_run_id)
new_model_run = Run(exp, run_id=new_model_run_id)
# Get the run history for both production model and
# newly trained model and compare mse
production_model_run = Run(exp, run_id=production_model_run_id)
new_model_run = run.parent
print("Production model run is", production_model_run)

production_model_mse = production_model_run.get_metrics().get("mse")
new_model_mse = new_model_run.get_metrics().get("mse")
print(
"Current Production model mse: {}, New trained model mse: {}".format(
production_model_mse, new_model_mse
)
)
production_model_mse = \
production_model_run.get_metrics().get(metric_eval)
new_model_mse = new_model_run.get_metrics().get(metric_eval)
if (production_model_mse is None or new_model_mse is None):
print("Unable to find", metric_eval, "metrics, "
"exiting evaluation")
run.parent.cancel()
else:
print(
"Current Production model mse: {}, "
"New trained model mse: {}".format(
production_model_mse, new_model_mse
)
)

promote_new_model = False
if new_model_mse < production_model_mse:
promote_new_model = True
print("New trained model performs better, thus it will be registered")
if (new_model_mse < production_model_mse):
print("New trained model performs better, "
"thus it should be registered")
else:
print("New trained model metric is less than or equal to "
"production model so skipping model registration.")
run.parent.cancel()
else:
print("This is the first model, "
"thus it should be registered")
except Exception:
promote_new_model = True
print("This is the first model to be trained, \
thus nothing to evaluate for now")


# Writing the run id to /aml_config/run_id.json
if promote_new_model:
model_path = os.path.join('outputs', model_name)
new_model_run.register_model(
model_name=model_name,
model_path=model_path,
properties={"release_id": release_id})
print("Registered new model!")
traceback.print_exc(limit=None, file=None, chain=True)
print("Something went wrong trying to evaluate. Exiting.")
raise
Loading

0 comments on commit 969e6ca

Please sign in to comment.