
Commit 969e6ca

dtzar authored and eedorenko committed
Switch CI to use new AzureML agentless execution task (#101)
1 parent 1eb4a42 commit 969e6ca

15 files changed: +427 -223 lines

.env.example

+4 -4

@@ -7,13 +7,13 @@ SP_APP_ID = ''
 SP_APP_SECRET = ''
 RESOUCE_GROUP = 'mlops-rg'
 
-# Mock build/release ID for local testing - update ReleaseID each "release"
+# Mock build/release ID for local testing
 BUILD_BUILDID = '001'
-RELEASE_RELEASEID = '001'
 
 # Azure ML Workspace Variables
-WORKSPACE_NAME = ''
+WORKSPACE_NAME = 'aml-workspace'
 EXPERIMENT_NAME = ''
+SCRIPT_FOLDER = './'
 
 # AML Compute Cluster Config
 AML_COMPUTE_CLUSTER_NAME = 'train-cluster'
@@ -36,4 +36,4 @@ SOURCES_DIR_TRAIN = 'code'
 DB_CLUSTER_ID = ''
 
 # Optional. Container Image name for image creation
-IMAGE_NAME = 'ml-trained'
+IMAGE_NAME = 'mltrained'
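These defaults pair with the OfflineRun branch added to code/evaluate/evaluate_model.py below: when a script runs outside an AzureML-submitted job, python-dotenv loads these values from .env. A minimal local sketch (the script name and print-out are illustrative only, not part of this commit):

# local_check.py - hypothetical helper, only shows how the .env values are read locally
import os
from dotenv import load_dotenv  # pip install python-dotenv

load_dotenv()  # picks up .env from the working directory, as evaluate_model.py does on OfflineRun
print("workspace:", os.environ.get("WORKSPACE_NAME"))  # 'aml-workspace'
print("build id: ", os.environ.get("BUILD_BUILDID"))   # mock '001' for local testing
print("image:    ", os.environ.get("IMAGE_NAME"))      # 'mltrained'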

.pipelines/azdo-ci-build-train.yml

+35 -5

@@ -21,7 +21,7 @@ stages:
   jobs:
   - job: "Model_CI_Pipeline"
     displayName: "Model CI Pipeline"
-    pool:
+    pool:
       vmImage: 'ubuntu-latest'
     container: mcr.microsoft.com/mlops/python:latest
     timeoutInMinutes: 0
@@ -37,17 +37,47 @@ stages:
 - stage: 'Trigger_AML_Pipeline'
   displayName: 'Train, evaluate, register model via previously published AML pipeline'
   jobs:
-  - job: "Invoke_Model_Pipeline"
+  - job: "Get_Pipeline_ID"
     condition: and(succeeded(), eq(coalesce(variables['auto-trigger-training'], 'true'), 'true'))
-    displayName: "Invoke Model Pipeline and evaluate results to register"
-    pool:
+    displayName: "Get Pipeline ID for execution"
+    pool:
       vmImage: 'ubuntu-latest'
     container: mcr.microsoft.com/mlops/python:latest
     timeoutInMinutes: 0
     steps:
     - script: |
         python $(Build.SourcesDirectory)/ml_service/pipelines/run_train_pipeline.py
-      displayName: 'Trigger Training Pipeline'
+        source $(Build.SourcesDirectory)/tmp.sh
+        echo "##vso[task.setvariable variable=AMLPIPELINEID;isOutput=true]$AMLPIPELINE_ID"
+      name: 'getpipelineid'
+      displayName: 'Get Pipeline ID'
+      env:
+        SP_APP_SECRET: '$(SP_APP_SECRET)'
+  - job: "Run_ML_Pipeline"
+    dependsOn: "Get_Pipeline_ID"
+    displayName: "Trigger ML Training Pipeline"
+    pool: server
+    variables:
+      AMLPIPELINE_ID: $[ dependencies.Get_Pipeline_ID.outputs['getpipelineid.AMLPIPELINEID'] ]
+    steps:
+    - task: ms-air-aiagility.vss-services-azureml.azureml-restApi-task.MLPublishedPipelineRestAPITask@0
+      displayName: 'Invoke ML pipeline'
+      inputs:
+        azureSubscription: '$(WORKSPACE_SVC_CONNECTION)'
+        PipelineId: '$(AMLPIPELINE_ID)'
+        ExperimentName: '$(EXPERIMENT_NAME)'
+        PipelineParameters: '"model_name": "sklearn_regression_model.pkl"'
+  - job: "Training_Run_Report"
+    dependsOn: "Run_ML_Pipeline"
+    displayName: "Determine if evaluation succeeded and new model is registered"
+    pool:
+      vmImage: 'ubuntu-latest'
+    container: mcr.microsoft.com/mlops/python:latest
+    timeoutInMinutes: 0
+    steps:
+    - script: |
+        python $(Build.SourcesDirectory)/code/register/register_model.py --build_id $(Build.BuildId) --validate True
+      displayName: 'Check if new model registered'
       env:
         SP_APP_SECRET: '$(SP_APP_SECRET)'
     - task: CopyFiles@2
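The Get_Pipeline_ID job expects run_train_pipeline.py to leave the published pipeline ID in tmp.sh; the script step sources that file and re-publishes the value as an output variable for the agentless job. The script itself is not part of this diff; a hedged sketch of the lookup it would need, using the AzureML SDK's PublishedPipeline.list, might look like this:

# Hypothetical sketch of the ID lookup behind ml_service/pipelines/run_train_pipeline.py
# (the real script is not shown in this commit).
import os
from azureml.core import Workspace
from azureml.core.authentication import ServicePrincipalAuthentication
from azureml.pipeline.core import PublishedPipeline

auth = ServicePrincipalAuthentication(
    tenant_id=os.environ["TENANT_ID"],
    service_principal_id=os.environ["SP_APP_ID"],
    service_principal_password=os.environ["SP_APP_SECRET"])
ws = Workspace.get(
    name=os.environ["WORKSPACE_NAME"],
    subscription_id=os.environ["SUBSCRIPTION_ID"],
    resource_group=os.environ["RESOURCE_GROUP"],
    auth=auth)

# Newest active published pipeline matching TRAINING_PIPELINE_NAME ('Training-Pipeline')
matches = [p for p in PublishedPipeline.list(ws)
           if p.name == os.environ.get("TRAINING_PIPELINE_NAME", "Training-Pipeline")]

# Write the ID where the CI step can `source` it; the echo in the pipeline then
# turns it into the getpipelineid.AMLPIPELINEID output variable.
with open("tmp.sh", "w") as f:
    f.write("export AMLPIPELINE_ID={}\n".format(matches[0].id))

Because Run_ML_Pipeline runs on pool: server, the MLPublishedPipelineRestAPITask invocation is agentless: it waits on the AzureML run without holding a build agent.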

.pipelines/azdo-variables.yml

+2 -2

@@ -24,7 +24,7 @@ variables:
   value: '1'
 # AML Pipeline Config
 - name: TRAINING_PIPELINE_NAME
-  value: 'Training Pipeline'
+  value: 'Training-Pipeline'
 - name: MODEL_PATH
   value: ''
 - name: EVALUATE_SCRIPT_PATH
@@ -34,7 +34,7 @@ variables:
 - name: SOURCES_DIR_TRAIN
   value: code
 - name: IMAGE_NAME
-  value: ''
+  value: 'mltrained'
 # Optional. Used by a training pipeline with R on Databricks
 - name: DB_CLUSTER_ID
   value: ''

README.md

+1 -1

@@ -12,7 +12,7 @@ description: "Code which demonstrates how to set up and operationalize an MLOps
 # MLOps with Azure ML
 
 
-[![Build Status](https://dev.azure.com/customai/DevopsForAI-AML/_apis/build/status/Build%20%26%20Train?branchName=master)](https://dev.azure.com/customai/DevopsForAI-AML/_build/latest?definitionId=34&branchName=master)
+[![Build Status](https://aidemos.visualstudio.com/MLOps/_apis/build/status/microsoft.MLOpsPython-CI?branchName=master)](https://aidemos.visualstudio.com/MLOps/_build/latest?definitionId=127&branchName=master)
 
 
 MLOps will help you to understand how to build the Continuous Integration and Continuous Delivery pipeline for a ML/AI project. We will be using the Azure DevOps Project for build and release/deployment pipelines along with Azure ML services for model retraining pipeline, model management and operationalization.

code/evaluate/evaluate_model.py

+90 -61

@@ -24,90 +24,119 @@
 POSSIBILITY OF SUCH DAMAGE.
 """
 import os
-from azureml.core import Model, Run
+from azureml.core import Model, Run, Workspace, Experiment
 import argparse
+from azureml.core.authentication import ServicePrincipalAuthentication
+import traceback
 
-
-# Get workspace
 run = Run.get_context()
-exp = run.experiment
-ws = run.experiment.workspace
+if (run.id.startswith('OfflineRun')):
+    from dotenv import load_dotenv
+    # For local development, set values in this section
+    load_dotenv()
+    workspace_name = os.environ.get("WORKSPACE_NAME")
+    experiment_name = os.environ.get("EXPERIMENT_NAME")
+    resource_group = os.environ.get("RESOURCE_GROUP")
+    subscription_id = os.environ.get("SUBSCRIPTION_ID")
+    tenant_id = os.environ.get("TENANT_ID")
+    model_name = os.environ.get("MODEL_NAME")
+    app_id = os.environ.get('SP_APP_ID')
+    app_secret = os.environ.get('SP_APP_SECRET')
+    build_id = os.environ.get('BUILD_BUILDID')
+    service_principal = ServicePrincipalAuthentication(
+        tenant_id=tenant_id,
+        service_principal_id=app_id,
+        service_principal_password=app_secret)
 
+    aml_workspace = Workspace.get(
+        name=workspace_name,
+        subscription_id=subscription_id,
+        resource_group=resource_group,
+        auth=service_principal
+    )
+    ws = aml_workspace
+    exp = Experiment(ws, experiment_name)
+    run_id = "e78b2c27-5ceb-49d9-8e84-abe7aecf37d5"
+else:
+    exp = run.experiment
+    ws = run.experiment.workspace
+    run_id = 'amlcompute'
 
 parser = argparse.ArgumentParser("evaluate")
 parser.add_argument(
-    "--release_id",
+    "--build_id",
+    type=str,
+    help="The Build ID of the build triggering this pipeline run",
+)
+parser.add_argument(
+    "--run_id",
     type=str,
-    help="The ID of the release triggering this pipeline run",
+    help="Training run ID",
 )
 parser.add_argument(
     "--model_name",
     type=str,
     help="Name of the Model",
     default="sklearn_regression_model.pkl",
 )
-args = parser.parse_args()
 
-print("Argument 1: %s" % args.release_id)
-print("Argument 2: %s" % args.model_name)
+args = parser.parse_args()
+if (args.build_id is not None):
+    build_id = args.build_id
+if (args.run_id is not None):
+    run_id = args.run_id
+if (run_id == 'amlcompute'):
+    run_id = run.parent.id
 model_name = args.model_name
-release_id = args.release_id
+metric_eval = "mse"
+run.tag("BuildId", value=build_id)
 
-# Paramaterize the matrics on which the models should be compared
+# Paramaterize the matrices on which the models should be compared
 # Add golden data set on which all the model performance can be evaluated
-
-all_runs = exp.get_runs(
-    properties={"release_id": release_id, "run_type": "train"},
-    include_children=True
-)
-new_model_run = next(all_runs)
-new_model_run_id = new_model_run.id
-print(f'New Run found with Run ID of: {new_model_run_id}')
-
 try:
-    # Get most recently registered model, we assume that
-    # is the model in production.
-    # Download this model and compare it with the recently
-    # trained model by running test with same data set.
     model_list = Model.list(ws)
-    production_model = next(
-        filter(
-            lambda x: x.created_time == max(
-                model.created_time for model in model_list),
-            model_list,
+    if (len(model_list) > 0):
+        production_model = next(
+            filter(
+                lambda x: x.created_time == max(
+                    model.created_time for model in model_list),
+                model_list,
+            )
         )
-    )
-    production_model_run_id = production_model.tags.get("run_id")
-    run_list = exp.get_runs()
+        production_model_run_id = production_model.run_id
 
-    # Get the run history for both production model and
-    # newly trained model and compare mse
-    production_model_run = Run(exp, run_id=production_model_run_id)
-    new_model_run = Run(exp, run_id=new_model_run_id)
+        # Get the run history for both production model and
+        # newly trained model and compare mse
+        production_model_run = Run(exp, run_id=production_model_run_id)
+        new_model_run = run.parent
+        print("Production model run is", production_model_run)
 
-    production_model_mse = production_model_run.get_metrics().get("mse")
-    new_model_mse = new_model_run.get_metrics().get("mse")
-    print(
-        "Current Production model mse: {}, New trained model mse: {}".format(
-            production_model_mse, new_model_mse
-        )
-    )
+        production_model_mse = \
+            production_model_run.get_metrics().get(metric_eval)
+        new_model_mse = new_model_run.get_metrics().get(metric_eval)
+        if (production_model_mse is None or new_model_mse is None):
+            print("Unable to find", metric_eval, "metrics, "
+                  "exiting evaluation")
+            run.parent.cancel()
+        else:
+            print(
+                "Current Production model mse: {}, "
+                "New trained model mse: {}".format(
+                    production_model_mse, new_model_mse
+                )
+            )
 
-    promote_new_model = False
-    if new_model_mse < production_model_mse:
-        promote_new_model = True
-        print("New trained model performs better, thus it will be registered")
+            if (new_model_mse < production_model_mse):
+                print("New trained model performs better, "
+                      "thus it should be registered")
+            else:
+                print("New trained model metric is less than or equal to "
+                      "production model so skipping model registration.")
+                run.parent.cancel()
+    else:
+        print("This is the first model, "
+              "thus it should be registered")
 except Exception:
-    promote_new_model = True
-    print("This is the first model to be trained, \
-          thus nothing to evaluate for now")
-
-
-# Writing the run id to /aml_config/run_id.json
-if promote_new_model:
-    model_path = os.path.join('outputs', model_name)
-    new_model_run.register_model(
-        model_name=model_name,
-        model_path=model_path,
-        properties={"release_id": release_id})
-    print("Registered new model!")
+    traceback.print_exc(limit=None, file=None, chain=True)
+    print("Something went wrong trying to evaluate. Exiting.")
+    raise
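The final Training_Run_Report job calls code/register/register_model.py with --validate True to confirm that evaluation allowed registration to proceed. That script is also outside this diff; under the assumption that the training pipeline registers the model with a "BuildId" tag (mirroring the run.tag("BuildId", ...) call added above), a validation check could be sketched as:

# Hypothetical validation sketch; code/register/register_model.py is not shown in this commit.
import os
import sys
from azureml.core import Workspace
from azureml.core.model import Model

ws = Workspace.from_config()  # or Workspace.get(...) with service-principal auth, as above
build_id = os.environ.get("BUILD_BUILDID", "001")

# Assumed convention: registered models carry a "BuildId" tag naming the triggering build.
models = Model.list(ws, tags=[["BuildId", build_id]])
if not models:
    print("No model registered for build", build_id,
          "- evaluation probably cancelled the training run.")
    sys.exit(1)
print("Found model", models[0].name, "version", models[0].version, "for build", build_id)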