Merge pull request #1 from DevoteamNL/feature/add_ci_pipeline
pre-commit configuration
thegitofdaniel authored Oct 23, 2023
2 parents 7ef8f4e + f57afee commit 0360c16
Showing 39 changed files with 240 additions and 144 deletions.
16 changes: 16 additions & 0 deletions .github/workflows/check-code-quality.yml
@@ -0,0 +1,16 @@
name: check-code-quality

on:
  pull_request:
  push:
    branches:
      - 'main'
      - 'feature/**'

jobs:
  pre-commit:
    runs-on: ubuntu-latest
    steps:
      - uses: actions/checkout@v3
      - uses: actions/setup-python@v3
      - uses: pre-commit/[email protected]
12 changes: 6 additions & 6 deletions .github/workflows/deploy-batch-endpoint-pipeline-classical.yml
@@ -1,6 +1,6 @@
name: deploy-batch-endpoint-pipeline

on:
on:
workflow_dispatch:
jobs:
get-config:
@@ -16,17 +16,17 @@ jobs:
min_instances: 0
max_instances: 5
resource_group: ${{ needs.get-config.outputs.resource_group }}
workspace_name: ${{ needs.get-config.outputs.aml_workspace }}
workspace_name: ${{ needs.get-config.outputs.aml_workspace }}
secrets:
creds: ${{secrets.AZURE_CREDENTIALS}}
create-endpoint:
needs: [get-config,create-compute]
uses: Azure/mlops-templates/.github/workflows/create-endpoint.yml@main
with:
resource_group: ${{ needs.get-config.outputs.resource_group }}
workspace_name: ${{ needs.get-config.outputs.aml_workspace }}
workspace_name: ${{ needs.get-config.outputs.aml_workspace }}
endpoint_file: mlops/azureml/deploy/batch/batch-endpoint.yml
endpoint_name: ${{ format('taxi-gha-{0}', needs.get-config.outputs.bep) }}
endpoint_name: ${{ format('taxi-gha-{0}', needs.get-config.outputs.bep) }}
endpoint_type: batch
secrets:
creds: ${{secrets.AZURE_CREDENTIALS}}
@@ -35,9 +35,9 @@ jobs:
needs: [get-config,create-endpoint]
with:
resource_group: ${{ needs.get-config.outputs.resource_group }}
workspace_name: ${{ needs.get-config.outputs.aml_workspace }}
workspace_name: ${{ needs.get-config.outputs.aml_workspace }}
endpoint_file: mlops/azureml/deploy/batch/batch-deployment.yml
endpoint_name: ${{ format('taxi-gha-{0}', needs.get-config.outputs.bep) }}
endpoint_name: ${{ format('taxi-gha-{0}', needs.get-config.outputs.bep) }}
endpoint_type: batch
deployment_name: eptestdeploy
secrets:
7 changes: 3 additions & 4 deletions .github/workflows/tf-gha-deploy-infra.yml
@@ -1,6 +1,6 @@
name: tf-gha-deploy-infra.yml
on:

on:
workflow_dispatch:
env:
config_env: 'none'
@@ -14,7 +14,7 @@ jobs:
uses: Azure/mlops-templates/.github/workflows/tf-gha-install-terraform.yml@main
with:
TFAction: 'apply'
dply_environment: ${{ needs.set-env-branch.outputs.config-file }}
dply_environment: ${{ needs.set-env-branch.outputs.config-file }}
location: ${{ needs.get-config.outputs.location }}
namespace: ${{ needs.get-config.outputs.namespace }}
postfix: ${{ needs.get-config.outputs.postfix }}
@@ -42,4 +42,3 @@ jobs:
- id: deploy-aml-workspace
name: deploy-aml-workspace
run: echo "OK"

1 change: 0 additions & 1 deletion .gitignore
@@ -141,4 +141,3 @@ terraform.tfvars
! /infrastructure/bicep/bicepconfig.json

.idea

49 changes: 43 additions & 6 deletions .pre-commit-config.yaml
@@ -1,14 +1,51 @@
repos:
- repo: https://github.com/pre-commit/pre-commit-hooks
rev: v4.2.0
rev: v4.3.0
hooks:
- id: check-ast
- id: check-json
- id: check-toml
- id: check-xml
- id: check-yaml
- id: check-builtin-literals
- id: check-case-conflict
- id: check-docstring-first
- id: detect-private-key
- id: end-of-file-fixer
- id: name-tests-test
- id: trailing-whitespace

# Opinionated code formatter to forget about formatting
- repo: https://github.com/asottile/pyupgrade
rev: v3.1.0
hooks:
- id: pyupgrade
name: PyUpgrade
- repo: https://github.com/psf/black
rev: 21.12b0
rev: 22.3.0
hooks:
- id: black
name: Black
- repo: https://github.com/pre-commit/mirrors-mypy
rev: v0.910
hooks:
- id: black
additional_dependencies: ['click==8.0.4']
- id: mypy
name: MyPy
- repo: https://github.com/PyCQA/bandit
rev: 1.7.4
hooks:
- id: bandit
name: bandit

# # XXX gitleaks needs to be configured
# - repo: https://github.com/zricethezav/gitleaks
# rev: v8.15.0
# hooks:
# - id: gitleaks-docker
# name: gitleaks
# entry: zricethezav/gitleaks detect --verbose --source=. --config=gitleaks.toml --redact

# XXX ruff is still alpha, but it will likely become a better alternative
# - repo: https://github.com/astral-sh/ruff-pre-commit
# rev: v0.0.284
# hooks:
# - id: ruff
# args: [--fix, --exit-non-zero-on-fix]
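Most of the whitespace-only pairs of lines in the workflow diffs above come from the trailing-whitespace and end-of-file-fixer hooks enabled here: the first strips spaces at line ends, the second leaves each file with exactly one trailing newline. A rough, illustrative Python approximation of those two fixes (the real hooks in pre-commit/pre-commit-hooks handle more edge cases):

# Rough approximation (illustrative only) of what trailing-whitespace and
# end-of-file-fixer do to a file: strip trailing spaces and normalize the
# file to end with a single newline.
from pathlib import Path


def fix_whitespace(path: Path) -> None:
    lines = path.read_text().splitlines()
    cleaned = "\n".join(line.rstrip() for line in lines)
    path.write_text(cleaned + "\n")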
2 changes: 1 addition & 1 deletion data-science/environment/train-conda.yml
@@ -14,4 +14,4 @@ dependencies:
- joblib==1.0.0
- matplotlib==3.3.3
- git+https://github.com/microsoft/AzureML-Observability#subdirectory=aml-obs-client
- git+https://github.com/microsoft/AzureML-Observability#subdirectory=aml-obs-collector
- git+https://github.com/microsoft/AzureML-Observability#subdirectory=aml-obs-collector
45 changes: 27 additions & 18 deletions data-science/src/evaluate.py
@@ -47,25 +47,28 @@
"vendor",
]

CAT_ORD_COLS = [
]
CAT_ORD_COLS = [] # type: ignore


def parse_args():
'''Parse input arguments'''
"""Parse input arguments"""

parser = argparse.ArgumentParser("predict")
parser.add_argument("--model_name", type=str, help="Name of registered model")
parser.add_argument("--model_input", type=str, help="Path of input model")
parser.add_argument("--test_data", type=str, help="Path to test dataset")
parser.add_argument("--evaluation_output", type=str, help="Path of eval results")
parser.add_argument("--runner", type=str, help="Local or Cloud Runner", default="CloudRunner")
parser.add_argument(
"--runner", type=str, help="Local or Cloud Runner", default="CloudRunner"
)

args = parser.parse_args()

return args


def main(args):
'''Read trained model and test dataset, evaluate model and save result'''
"""Read trained model and test dataset, evaluate model and save result"""

# Load the test data
test_data = pd.read_parquet(Path(args.test_data))
@@ -75,15 +78,16 @@ def main(args):
X_test = test_data[NUMERIC_COLS + CAT_NOM_COLS + CAT_ORD_COLS]

# Load the model from input port
model = mlflow.sklearn.load_model(args.model_input)
model = mlflow.sklearn.load_model(args.model_input)

# ---------------- Model Evaluation ---------------- #
yhat_test, score = model_evaluation(X_test, y_test, model, args.evaluation_output)

# ----------------- Model Promotion ---------------- #
if args.runner == "CloudRunner":
predictions, deploy_flag = model_promotion(args.model_name, args.evaluation_output, X_test, y_test, yhat_test, score)

predictions, deploy_flag = model_promotion(
args.model_name, args.evaluation_output, X_test, y_test, yhat_test, score
)


def model_evaluation(X_test, y_test, model, evaluation_output):
@@ -95,7 +99,7 @@ def model_evaluation(X_test, y_test, model, evaluation_output):
output_data = X_test.copy()
output_data["real_label"] = y_test
output_data["predicted_label"] = yhat_test
output_data.to_csv((Path(evaluation_output) / "predictions.csv"))
output_data.to_csv(Path(evaluation_output) / "predictions.csv")

# Evaluate Model performance with the test set
r2 = r2_score(y_test, yhat_test)
@@ -119,8 +123,8 @@ def model_evaluation(X_test, y_test, model, evaluation_output):
mlflow.log_metric("test mae", mae)

# Visualize results
plt.scatter(y_test, yhat_test, color='black')
plt.plot(y_test, y_test, color='blue', linewidth=3)
plt.scatter(y_test, yhat_test, color="black")
plt.plot(y_test, y_test, color="blue", linewidth=3)
plt.xlabel("Real value")
plt.ylabel("Predicted value")
plt.title("Comparing Model Predictions to Real values - Test Data")
@@ -129,8 +133,9 @@ def model_evaluation(X_test, y_test, model, evaluation_output):

return yhat_test, r2


def model_promotion(model_name, evaluation_output, X_test, y_test, yhat_test, score):

scores = {}
predictions = {}

@@ -139,10 +144,12 @@ def model_promotion(model_name, evaluation_output, X_test, y_test, yhat_test, sc
for model_run in client.search_model_versions(f"name='{model_name}'"):
model_version = model_run.version
mdl = mlflow.pyfunc.load_model(
model_uri=f"models:/{model_name}/{model_version}")
model_uri=f"models:/{model_name}/{model_version}"
)
predictions[f"{model_name}:{model_version}"] = mdl.predict(X_test)
scores[f"{model_name}:{model_version}"] = r2_score(
y_test, predictions[f"{model_name}:{model_version}"])
y_test, predictions[f"{model_name}:{model_version}"]
)

if scores:
if score >= max(list(scores.values())):
@@ -153,15 +160,16 @@ def model_promotion(model_name, evaluation_output, X_test, y_test, yhat_test, sc
deploy_flag = 1
print(f"Deploy flag: {deploy_flag}")

with open((Path(evaluation_output) / "deploy_flag"), 'w') as outfile:
with open((Path(evaluation_output) / "deploy_flag"), "w") as outfile:
outfile.write(f"{int(deploy_flag)}")

# add current model score and predictions
scores["current model"] = score
predictions["currrent model"] = yhat_test

perf_comparison_plot = pd.DataFrame(
scores, index=["r2 score"]).plot(kind='bar', figsize=(15, 10))
perf_comparison_plot = pd.DataFrame(scores, index=["r2 score"]).plot(
kind="bar", figsize=(15, 10)
)
perf_comparison_plot.figure.savefig("perf_comparison.png")
perf_comparison_plot.figure.savefig(Path(evaluation_output) / "perf_comparison.png")

@@ -170,6 +178,7 @@ def model_promotion(model_name, evaluation_output, X_test, y_test, yhat_test, sc

return predictions, deploy_flag


if __name__ == "__main__":

mlflow.start_run()
@@ -185,7 +194,7 @@ def model_promotion(model_name, evaluation_output, X_test, y_test, yhat_test, sc

for line in lines:
print(line)

main(args)

mlflow.end_run()
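The promotion logic in model_promotion reduces to one comparison: the newly trained model is flagged for deployment only when its test R² matches or beats every registered version's R², and it is deployed by default when no versions are registered yet. Restated in isolation (a sketch with hypothetical names, not code from this commit):

# Sketch of the promotion rule used above (hypothetical helper, not part of
# evaluate.py): deploy only if the new score is at least as good as every
# registered version's score; deploy by default when nothing is registered.
def should_deploy(current_score: float, registered_scores: dict) -> int:
    if registered_scores:
        return int(current_score >= max(registered_scores.values()))
    return 1


# Example: a new score of 0.82 against registered versions scoring 0.78 and 0.80.
assert should_deploy(0.82, {"model:1": 0.78, "model:2": 0.80}) == 1
assert should_deploy(0.75, {"model:1": 0.78, "model:2": 0.80}) == 0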
50 changes: 30 additions & 20 deletions data-science/src/prep.py
Expand Up @@ -7,7 +7,8 @@
import argparse

from pathlib import Path
import os

# import os
import numpy as np
import pandas as pd

@@ -41,37 +42,45 @@
"vendor",
]

CAT_ORD_COLS = [
]
CAT_ORD_COLS = [] # type: ignore


def parse_args():
'''Parse input arguments'''
"""Parse input arguments"""

parser = argparse.ArgumentParser("prep")
parser.add_argument("--raw_data", type=str, help="Path to raw data")
parser.add_argument("--train_data", type=str, help="Path to train dataset")
parser.add_argument("--val_data", type=str, help="Path to test dataset")
parser.add_argument("--test_data", type=str, help="Path to test dataset")

parser.add_argument("--enable_monitoring", type=str, help="enable logging to ADX")
parser.add_argument("--table_name", type=str, default="mlmonitoring", help="Table name in ADX for logging")

parser.add_argument(
"--table_name",
type=str,
default="mlmonitoring",
help="Table name in ADX for logging",
)

args = parser.parse_args()

return args


def log_training_data(df, table_name):
from obs.collector import Online_Collector

collector = Online_Collector(table_name)
collector.batch_collect(df)


def main(args):
'''Read, split, and save datasets'''
"""Read, split, and save datasets"""

# ------------ Reading Data ------------ #
# -------------------------------------- #

data = pd.read_csv((Path(args.raw_data)))
data = pd.read_csv(Path(args.raw_data))
data = data[NUMERIC_COLS + CAT_NOM_COLS + CAT_ORD_COLS + [TARGET_COL]]

# ------------- Split Data ------------- #
@@ -89,15 +98,19 @@ def main(args):
val = data[msk_val]
test = data[msk_test]

mlflow.log_metric('train size', train.shape[0])
mlflow.log_metric('val size', val.shape[0])
mlflow.log_metric('test size', test.shape[0])
mlflow.log_metric("train size", train.shape[0])
mlflow.log_metric("val size", val.shape[0])
mlflow.log_metric("test size", test.shape[0])

train.to_parquet((Path(args.train_data) / "train.parquet"))
val.to_parquet((Path(args.val_data) / "val.parquet"))
test.to_parquet((Path(args.test_data) / "test.parquet"))
train.to_parquet(Path(args.train_data) / "train.parquet")
val.to_parquet(Path(args.val_data) / "val.parquet")
test.to_parquet(Path(args.test_data) / "test.parquet")

if (args.enable_monitoring.lower() == 'true' or args.enable_monitoring == '1' or args.enable_monitoring.lower() == 'yes'):
if (
args.enable_monitoring.lower() == "true"
or args.enable_monitoring == "1"
or args.enable_monitoring.lower() == "yes"
):
log_training_data(data, args.table_name)


@@ -115,14 +128,11 @@ def main(args):
f"Train dataset output path: {args.train_data}",
f"Val dataset output path: {args.val_data}",
f"Test dataset path: {args.test_data}",

]

for line in lines:
print(line)

main(args)

mlflow.end_run()
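One note on the reformatted enable_monitoring check in main above: because the flag arrives as a plain string argument, the condition has to enumerate the accepted truthy spellings. An equivalent, more compact test (a sketch, not the code in this commit; the strip() is an extra tolerance the original does not apply):

# Compact equivalent of the enable_monitoring check in prep.py (sketch only).
def monitoring_enabled(flag: str) -> bool:
    return flag.strip().lower() in {"true", "1", "yes"}


assert monitoring_enabled("True")
assert monitoring_enabled("1")
assert not monitoring_enabled("no")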

