Merge pull request #1 from DevoteamNL/feature/add_ci_pipeline
pre-commit configuration
thegitofdaniel authored Oct 23, 2023
2 parents 7ef8f4e + f57afee commit 0360c16
Showing 39 changed files with 240 additions and 144 deletions.
16 changes: 16 additions & 0 deletions .github/workflows/check-code-quality.yml
@@ -0,0 +1,16 @@
name: check-code-quality

on:
  pull_request:
  push:
    branches:
      - 'main'
      - 'feature/**'

jobs:
  pre-commit:
    runs-on: ubuntu-latest
    steps:
      - uses: actions/checkout@v3
      - uses: actions/setup-python@v3
      - uses: pre-commit/[email protected]
12 changes: 6 additions & 6 deletions .github/workflows/deploy-batch-endpoint-pipeline-classical.yml
@@ -1,6 +1,6 @@
name: deploy-batch-endpoint-pipeline

on:
on:
workflow_dispatch:
jobs:
get-config:
@@ -16,17 +16,17 @@ jobs:
min_instances: 0
max_instances: 5
resource_group: ${{ needs.get-config.outputs.resource_group }}
workspace_name: ${{ needs.get-config.outputs.aml_workspace }}
workspace_name: ${{ needs.get-config.outputs.aml_workspace }}
secrets:
creds: ${{secrets.AZURE_CREDENTIALS}}
create-endpoint:
needs: [get-config,create-compute]
uses: Azure/mlops-templates/.github/workflows/create-endpoint.yml@main
with:
resource_group: ${{ needs.get-config.outputs.resource_group }}
workspace_name: ${{ needs.get-config.outputs.aml_workspace }}
workspace_name: ${{ needs.get-config.outputs.aml_workspace }}
endpoint_file: mlops/azureml/deploy/batch/batch-endpoint.yml
endpoint_name: ${{ format('taxi-gha-{0}', needs.get-config.outputs.bep) }}
endpoint_name: ${{ format('taxi-gha-{0}', needs.get-config.outputs.bep) }}
endpoint_type: batch
secrets:
creds: ${{secrets.AZURE_CREDENTIALS}}
@@ -35,9 +35,9 @@ jobs:
needs: [get-config,create-endpoint]
with:
resource_group: ${{ needs.get-config.outputs.resource_group }}
workspace_name: ${{ needs.get-config.outputs.aml_workspace }}
workspace_name: ${{ needs.get-config.outputs.aml_workspace }}
endpoint_file: mlops/azureml/deploy/batch/batch-deployment.yml
endpoint_name: ${{ format('taxi-gha-{0}', needs.get-config.outputs.bep) }}
endpoint_name: ${{ format('taxi-gha-{0}', needs.get-config.outputs.bep) }}
endpoint_type: batch
deployment_name: eptestdeploy
secrets:
7 changes: 3 additions & 4 deletions .github/workflows/tf-gha-deploy-infra.yml
@@ -1,6 +1,6 @@
name: tf-gha-deploy-infra.yml
on:

on:
workflow_dispatch:
env:
config_env: 'none'
@@ -14,7 +14,7 @@ jobs:
uses: Azure/mlops-templates/.github/workflows/tf-gha-install-terraform.yml@main
with:
TFAction: 'apply'
dply_environment: ${{ needs.set-env-branch.outputs.config-file }}
dply_environment: ${{ needs.set-env-branch.outputs.config-file }}
location: ${{ needs.get-config.outputs.location }}
namespace: ${{ needs.get-config.outputs.namespace }}
postfix: ${{ needs.get-config.outputs.postfix }}
@@ -42,4 +42,3 @@ jobs:
- id: deploy-aml-workspace
name: deploy-aml-workspace
run: echo "OK"

1 change: 0 additions & 1 deletion .gitignore
@@ -141,4 +141,3 @@ terraform.tfvars
! /infrastructure/bicep/bicepconfig.json

.idea

49 changes: 43 additions & 6 deletions .pre-commit-config.yaml
@@ -1,14 +1,51 @@
repos:
- repo: https://github.com/pre-commit/pre-commit-hooks
rev: v4.2.0
rev: v4.3.0
hooks:
- id: check-ast
- id: check-json
- id: check-toml
- id: check-xml
- id: check-yaml
- id: check-builtin-literals
- id: check-case-conflict
- id: check-docstring-first
- id: detect-private-key
- id: end-of-file-fixer
- id: name-tests-test
- id: trailing-whitespace

# Opinionated code formatter to forget about formatting
- repo: https://github.com/asottile/pyupgrade
rev: v3.1.0
hooks:
- id: pyupgrade
name: PyUpgrade
- repo: https://github.com/psf/black
rev: 21.12b0
rev: 22.3.0
hooks:
- id: black
name: Black
- repo: https://github.com/pre-commit/mirrors-mypy
rev: v0.910
hooks:
- id: black
additional_dependencies: ['click==8.0.4']
- id: mypy
name: MyPy
- repo: https://github.com/PyCQA/bandit
rev: 1.7.4
hooks:
- id: bandit
name: bandit

# # XXX gitleaks needs to be configured
# - repo: https://github.com/zricethezav/gitleaks
# rev: v8.15.0
# hooks:
# - id: gitleaks-docker
# name: gitleaks
# entry: zricethezav/gitleaks detect --verbose --source=. --config=gitleaks.toml --redact

# XXX ruff is still alpha, but it will likely become a better alternative
# - repo: https://github.com/astral-sh/ruff-pre-commit
# rev: v0.0.284
# hooks:
# - id: ruff
# args: [--fix, --exit-non-zero-on-fix]
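Most of the whitespace-only pairs of lines in the workflow diffs above come from the trailing-whitespace and end-of-file-fixer hooks enabled here: the first strips spaces at line ends, the second leaves each file with exactly one trailing newline. A rough, illustrative Python approximation of those two fixes (the real hooks in pre-commit/pre-commit-hooks handle more edge cases):

# Rough approximation (illustrative only) of what trailing-whitespace and
# end-of-file-fixer do to a file: strip trailing spaces and normalize the
# file to end with a single newline.
from pathlib import Path


def fix_whitespace(path: Path) -> None:
    lines = path.read_text().splitlines()
    cleaned = "\n".join(line.rstrip() for line in lines)
    path.write_text(cleaned + "\n")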
2 changes: 1 addition & 1 deletion data-science/environment/train-conda.yml
@@ -14,4 +14,4 @@ dependencies:
- joblib==1.0.0
- matplotlib==3.3.3
- git+https://github.com/microsoft/AzureML-Observability#subdirectory=aml-obs-client
- git+https://github.com/microsoft/AzureML-Observability#subdirectory=aml-obs-collector
- git+https://github.com/microsoft/AzureML-Observability#subdirectory=aml-obs-collector
45 changes: 27 additions & 18 deletions data-science/src/evaluate.py
@@ -47,25 +47,28 @@
"vendor",
]

CAT_ORD_COLS = [
]
CAT_ORD_COLS = [] # type: ignore


def parse_args():
'''Parse input arguments'''
"""Parse input arguments"""

parser = argparse.ArgumentParser("predict")
parser.add_argument("--model_name", type=str, help="Name of registered model")
parser.add_argument("--model_input", type=str, help="Path of input model")
parser.add_argument("--test_data", type=str, help="Path to test dataset")
parser.add_argument("--evaluation_output", type=str, help="Path of eval results")
parser.add_argument("--runner", type=str, help="Local or Cloud Runner", default="CloudRunner")
parser.add_argument(
"--runner", type=str, help="Local or Cloud Runner", default="CloudRunner"
)

args = parser.parse_args()

return args


def main(args):
'''Read trained model and test dataset, evaluate model and save result'''
"""Read trained model and test dataset, evaluate model and save result"""

# Load the test data
test_data = pd.read_parquet(Path(args.test_data))
@@ -75,15 +78,16 @@ def main(args):
X_test = test_data[NUMERIC_COLS + CAT_NOM_COLS + CAT_ORD_COLS]

# Load the model from input port
model = mlflow.sklearn.load_model(args.model_input)
model = mlflow.sklearn.load_model(args.model_input)

# ---------------- Model Evaluation ---------------- #
yhat_test, score = model_evaluation(X_test, y_test, model, args.evaluation_output)

# ----------------- Model Promotion ---------------- #
if args.runner == "CloudRunner":
predictions, deploy_flag = model_promotion(args.model_name, args.evaluation_output, X_test, y_test, yhat_test, score)

predictions, deploy_flag = model_promotion(
args.model_name, args.evaluation_output, X_test, y_test, yhat_test, score
)


def model_evaluation(X_test, y_test, model, evaluation_output):
@@ -95,7 +99,7 @@ def model_evaluation(X_test, y_test, model, evaluation_output):
output_data = X_test.copy()
output_data["real_label"] = y_test
output_data["predicted_label"] = yhat_test
output_data.to_csv((Path(evaluation_output) / "predictions.csv"))
output_data.to_csv(Path(evaluation_output) / "predictions.csv")

# Evaluate Model performance with the test set
r2 = r2_score(y_test, yhat_test)
@@ -119,8 +123,8 @@ def model_evaluation(X_test, y_test, model, evaluation_output):
mlflow.log_metric("test mae", mae)

# Visualize results
plt.scatter(y_test, yhat_test, color='black')
plt.plot(y_test, y_test, color='blue', linewidth=3)
plt.scatter(y_test, yhat_test, color="black")
plt.plot(y_test, y_test, color="blue", linewidth=3)
plt.xlabel("Real value")
plt.ylabel("Predicted value")
plt.title("Comparing Model Predictions to Real values - Test Data")
@@ -129,8 +133,9 @@ def model_evaluation(X_test, y_test, model, evaluation_output):

return yhat_test, r2


def model_promotion(model_name, evaluation_output, X_test, y_test, yhat_test, score):

scores = {}
predictions = {}

@@ -139,10 +144,12 @@ def model_promotion(model_name, evaluation_output, X_test, y_test, yhat_test, sc
for model_run in client.search_model_versions(f"name='{model_name}'"):
model_version = model_run.version
mdl = mlflow.pyfunc.load_model(
model_uri=f"models:/{model_name}/{model_version}")
model_uri=f"models:/{model_name}/{model_version}"
)
predictions[f"{model_name}:{model_version}"] = mdl.predict(X_test)
scores[f"{model_name}:{model_version}"] = r2_score(
y_test, predictions[f"{model_name}:{model_version}"])
y_test, predictions[f"{model_name}:{model_version}"]
)

if scores:
if score >= max(list(scores.values())):
@@ -153,15 +160,16 @@ def model_promotion(model_name, evaluation_output, X_test, y_test, yhat_test, sc
deploy_flag = 1
print(f"Deploy flag: {deploy_flag}")

with open((Path(evaluation_output) / "deploy_flag"), 'w') as outfile:
with open((Path(evaluation_output) / "deploy_flag"), "w") as outfile:
outfile.write(f"{int(deploy_flag)}")

# add current model score and predictions
scores["current model"] = score
predictions["currrent model"] = yhat_test

perf_comparison_plot = pd.DataFrame(
scores, index=["r2 score"]).plot(kind='bar', figsize=(15, 10))
perf_comparison_plot = pd.DataFrame(scores, index=["r2 score"]).plot(
kind="bar", figsize=(15, 10)
)
perf_comparison_plot.figure.savefig("perf_comparison.png")
perf_comparison_plot.figure.savefig(Path(evaluation_output) / "perf_comparison.png")

@@ -170,6 +178,7 @@ def model_promotion(model_name, evaluation_output, X_test, y_test, yhat_test, sc

return predictions, deploy_flag


if __name__ == "__main__":

mlflow.start_run()
@@ -185,7 +194,7 @@ def model_promotion(model_name, evaluation_output, X_test, y_test, yhat_test, sc

for line in lines:
print(line)

main(args)

mlflow.end_run()
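The promotion logic in model_promotion reduces to one comparison: the newly trained model is flagged for deployment only when its test R² matches or beats every registered version's R², and it is deployed by default when no versions are registered yet. Restated in isolation (a sketch with hypothetical names, not code from this commit):

# Sketch of the promotion rule used above (hypothetical helper, not part of
# evaluate.py): deploy only if the new score is at least as good as every
# registered version's score; deploy by default when nothing is registered.
def should_deploy(current_score: float, registered_scores: dict) -> int:
    if registered_scores:
        return int(current_score >= max(registered_scores.values()))
    return 1


# Example: a new score of 0.82 against registered versions scoring 0.78 and 0.80.
assert should_deploy(0.82, {"model:1": 0.78, "model:2": 0.80}) == 1
assert should_deploy(0.75, {"model:1": 0.78, "model:2": 0.80}) == 0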
50 changes: 30 additions & 20 deletions data-science/src/prep.py
Expand Up @@ -7,7 +7,8 @@
import argparse

from pathlib import Path
import os

# import os
import numpy as np
import pandas as pd

@@ -41,37 +42,45 @@
"vendor",
]

CAT_ORD_COLS = [
]
CAT_ORD_COLS = [] # type: ignore


def parse_args():
'''Parse input arguments'''
"""Parse input arguments"""

parser = argparse.ArgumentParser("prep")
parser.add_argument("--raw_data", type=str, help="Path to raw data")
parser.add_argument("--train_data", type=str, help="Path to train dataset")
parser.add_argument("--val_data", type=str, help="Path to test dataset")
parser.add_argument("--test_data", type=str, help="Path to test dataset")

parser.add_argument("--enable_monitoring", type=str, help="enable logging to ADX")
parser.add_argument("--table_name", type=str, default="mlmonitoring", help="Table name in ADX for logging")

parser.add_argument(
"--table_name",
type=str,
default="mlmonitoring",
help="Table name in ADX for logging",
)

args = parser.parse_args()

return args


def log_training_data(df, table_name):
from obs.collector import Online_Collector

collector = Online_Collector(table_name)
collector.batch_collect(df)


def main(args):
'''Read, split, and save datasets'''
"""Read, split, and save datasets"""

# ------------ Reading Data ------------ #
# -------------------------------------- #

data = pd.read_csv((Path(args.raw_data)))
data = pd.read_csv(Path(args.raw_data))
data = data[NUMERIC_COLS + CAT_NOM_COLS + CAT_ORD_COLS + [TARGET_COL]]

# ------------- Split Data ------------- #
@@ -89,15 +98,19 @@ def main(args):
val = data[msk_val]
test = data[msk_test]

mlflow.log_metric('train size', train.shape[0])
mlflow.log_metric('val size', val.shape[0])
mlflow.log_metric('test size', test.shape[0])
mlflow.log_metric("train size", train.shape[0])
mlflow.log_metric("val size", val.shape[0])
mlflow.log_metric("test size", test.shape[0])

train.to_parquet((Path(args.train_data) / "train.parquet"))
val.to_parquet((Path(args.val_data) / "val.parquet"))
test.to_parquet((Path(args.test_data) / "test.parquet"))
train.to_parquet(Path(args.train_data) / "train.parquet")
val.to_parquet(Path(args.val_data) / "val.parquet")
test.to_parquet(Path(args.test_data) / "test.parquet")

if (args.enable_monitoring.lower() == 'true' or args.enable_monitoring == '1' or args.enable_monitoring.lower() == 'yes'):
if (
args.enable_monitoring.lower() == "true"
or args.enable_monitoring == "1"
or args.enable_monitoring.lower() == "yes"
):
log_training_data(data, args.table_name)


@@ -115,14 +128,11 @@ def main(args):
f"Train dataset output path: {args.train_data}",
f"Val dataset output path: {args.val_data}",
f"Test dataset path: {args.test_data}",

]

for line in lines:
print(line)

main(args)

mlflow.end_run()
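One note on the reformatted enable_monitoring check in main above: because the flag arrives as a plain string argument, the condition has to enumerate the accepted truthy spellings. An equivalent, more compact test (a sketch, not the code in this commit; the strip() is an extra tolerance the original does not apply):

# Compact equivalent of the enable_monitoring check in prep.py (sketch only).
def monitoring_enabled(flag: str) -> bool:
    return flag.strip().lower() in {"true", "1", "yes"}


assert monitoring_enabled("True")
assert monitoring_enabled("1")
assert not monitoring_enabled("no")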

