diff --git a/.github/actions/starter_template_test/action.yml b/.github/actions/starter_template_test/action.yml index aa01bc1..9940481 100644 --- a/.github/actions/starter_template_test/action.yml +++ b/.github/actions/starter_template_test/action.yml @@ -69,7 +69,7 @@ runs: - name: Concatenate requirements shell: bash run: | - zenml integration export-requirements -o ./local_checkout/integration-requirements.txt sklearn mlflow s3 kubernetes kubeflow slack evidently + zenml integration export-requirements -o ./local_checkout/integration-requirements.txt sklearn cat ./local_checkout/requirements.txt ./local_checkout/test-requirements.txt ./local_checkout/integration-requirements.txt >> ./local_checkout/all-requirements.txt - name: Install requirements diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index 1090e8f..9fc5047 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -2,7 +2,25 @@ name: CI on: workflow_dispatch: + inputs: + ref-template: + description: 'Branch or tag ref to check out for template' + type: string + required: false + ref-zenml: + description: 'Branch or tag ref to check out for ZenML' + type: string + required: false workflow_call: + inputs: + ref-template: + description: 'Branch or tag ref to check out for template' + type: string + required: false + ref-zenml: + description: 'Branch or tag ref to check out for ZenML' + type: string + required: false push: branches: ["main", "develop"] paths-ignore: ["README.md"] @@ -19,6 +37,7 @@ jobs: run-tests: runs-on: ${{ matrix.os }} strategy: + fail-fast: false matrix: stack-name: [local] os: [windows-latest, ubuntu-latest, macos-latest] @@ -36,3 +55,5 @@ jobs: with: stack-name: ${{ matrix.stack-name }} python-version: ${{ matrix.python-version }} + ref-zenml: ${{ inputs.ref-zenml || 'feature/update-quickstart-from-template' }} + ref-template: ${{ inputs.ref-template || github.ref }} diff --git a/.github/workflows/image-optimizer.yml b/.github/workflows/image-optimizer.yml new file mode 100644 index 0000000..dddbd1e --- /dev/null +++ b/.github/workflows/image-optimizer.yml @@ -0,0 +1,26 @@ +name: Compress Images +on: + pull_request: + # Run Image Actions when JPG, JPEG, PNG or WebP files are added or changed. + # See https://help.github.com/en/actions/automating-your-workflow-with-github-actions/workflow-syntax-for-github-actions#onpushpull_requestpaths for reference. + paths: + - '**.jpg' + - '**.jpeg' + - '**.png' + - '**.webp' +jobs: + build: + # Only run on non-draft PRs within the same repository. + if: github.event.pull_request.head.repo.full_name == github.repository && github.event.pull_request.draft == false + name: calibreapp/image-actions + runs-on: ubuntu-latest + steps: + - name: Checkout Repo + uses: actions/checkout@v3 + + - name: Compress Images + uses: calibreapp/image-actions@main + with: + # The `GITHUB_TOKEN` is automatically generated by GitHub and scoped only to the repository that is currently running the action. By default, the action can’t update Pull Requests initiated from forked repositories. 
+ # See https://docs.github.com/en/actions/reference/authentication-in-a-workflow and https://help.github.com/en/articles/virtual-environments-for-github-actions#token-permissions + githubToken: ${{ secrets.GITHUB_TOKEN }} diff --git a/.gitignore b/.gitignore index b6e4761..f8d2b18 100644 --- a/.gitignore +++ b/.gitignore @@ -98,6 +98,9 @@ __pypackages__/ celerybeat-schedule celerybeat.pid +# PyCharm Stuff +.idea + # SageMath parsed files *.sage.py @@ -127,3 +130,6 @@ dmypy.json # Pyre type checker .pyre/ + +*.zen +.vscode diff --git a/README.md b/README.md index 0660009..61bb4e7 100644 --- a/README.md +++ b/README.md @@ -1,15 +1,16 @@ # πŸ“œ ZenML Project Templates -This repository contains a collection of templates from which a ZenML project -can be generated: a collection of steps, pipelines, stack configurations and +This repository contains a starter template from which a simple ZenML project +can be generated easily. It contains a collection of steps, pipelines, stack configurations and other artifacts and useful resources that can get you started with ZenML. -πŸ”₯ **Do you have a personal project powered by ZenML that you would like to see here?** At -ZenML, we are looking for design partnerships and collaboration to help us +πŸ”₯ **Do you have a personal project powered by ZenML that you would like to see here?** + +At ZenML, we are looking for design partnerships and collaboration to help us better understand the real-world scenarios in which MLOps is being used and to build the best possible experience for our users. If you are interested in sharing all or parts of your project with us in the form of a ZenML project -template, please [join our Slack](https://zenml.io/slack-invite/) and leave us a +template, please [join our Slack](https://zenml.io/slack/) and leave us a message! ## πŸ“¦ Prerequisites @@ -18,7 +19,7 @@ To use the templates, you need to have Zenml and its `templates` extras installed: ```bash -pip install zenml[templates] +pip install "zenml[templates]" ``` ## πŸš€ Generate a ZenML Project @@ -35,7 +36,7 @@ library and a set of Jinja2 templates to generate the project. So you may also interact with Copier directly to generate a project, e.g.: ```bash -copier gh:zenml-io/zenml-project-templates +copier gh:zenml-io/template-starter ``` You will be prompted to select the project template and enter various values for @@ -47,11 +48,5 @@ the same command again. If you want to skip the prompts to use the values you already entered and overwrite all files in the existing project, you can run: ```bash -copier -wf gh:zenml-io/zenml-project-templates -``` - -## πŸ“ƒ List of Project Templates - -| Project Template | Tags | Description | -|------------------|----------|-----------------------------------------------------------------------------------| -| [ZenML Starter](https://github.com/zenml-io/zenml-project-templates/tree/main/starter) | basic scikit-learn | All the basic ML ingredients you need to get you started with ZenML: parameterized steps, a model training pipeline, a flexible configuration and a simple CLI. All created around a representative and versatile model training use-case implemented with the scikit-learn library. 
| +copier -wf gh:zenml-io/template-starter +``` \ No newline at end of file diff --git a/copier.yaml b/copier.yaml index 270dd34..fdaee8b 100644 --- a/copier.yaml +++ b/copier.yaml @@ -1,12 +1,4 @@ --- # GLOBAL PROMPT -------------------------------- - -template: - type: str - help: >- - The project template you would like to use - choices: - ZenML Starter: starter - default: starter project_name: type: str help: Short name for your project @@ -40,70 +32,10 @@ email: The email of the person/entity holding the copyright default: info@zenml.io when: "{{ open_source_license }}" -auto_format: - type: bool - help: "Auto-format the generated code with black, ruff and autoflake? \n (NOTE: you need to have these tools installed in your environment)" - default: false - -# SUB-TEMPLATE PROMPT [starter] --------------------- -use_step_params: - type: bool - help: "Would you like to see step parameters being used in the generated \n ZenML steps?" - default: true - when: "{{ template == 'starter' }}" -use_custom_artifacts: - type: bool - help: "Would you like to see custom artifact data types and materializers \n being used in the generated ZenML steps and pipelines?" - default: true - when: "{{ template == 'starter' }}" -configurable_dataset: - type: bool - help: "Would you like to be able to select the dataset used in model training\n at runtime through the CLI and as a data loader step parameter?" - default: true - when: "{{ template == 'starter' and use_step_params }}" -sklearn_dataset_name: - type: str - help: "The generated ZenML project will be populated with some example code - \n featuring one of the UCI ML datasets from scikit-learn. \n Which - dataset would you like to see being used in the generated code?" - choices: - UCI Wine Data Set (classification): wine - UCI Iris Data Set (classification): iris - UCI Breast Cancer Wisconsin (Diagnostic) Data Set (classification): breast_cancer - default: wine - when: "{{ template == 'starter' }}" -configurable_model: - type: bool - help: "Would you like to be able to select the type of model used in model \n - training at runtime through the CLI and as a model trainer step \n - parameter?" - default: true - help: >- - Would you like to be able to select the type of model used in model - training at runtime through the CLI and as a model trainer step - parameter ? - default: true - when: "{{ template == 'starter' and use_step_params }}" -sklearn_model_name: - type: str - help: "The generated ZenML project will be populated with some example code - \n featuring one of the scikit-learn classifier models. \n Which model class would you like to see being used in the generated code?" 
- choices: - Logistic Regression: LogisticRegression - C-Support Vector Classification: SVC - Linear Support Vector Classification: LinearSVC - Random Forest Classifier: RandomForestClassifier - K-Nearest Neighbors Classifier: KNeighborsClassifier - Gaussian Naive Bayes: GaussianNB - Linear Perceptron Classifier: Perceptron - Stochastic Gradient Descent Linear Classifier: SGDClassifier - Decision Tree Classifier: DecisionTreeClassifier - default: LogisticRegression - when: "{{ template == 'starter' }}" # CONFIGURATION ------------------------- _templates_suffix: "" -_subdirectory: "{{ template }}/template" +_subdirectory: "./template" _exclude: - /README.md _tasks: diff --git a/requirements.txt b/requirements.txt index eede3b1..65e8c35 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1,3 +1,5 @@ -scikit-learn +scikit-learn<1.3 copier jinja2-time +zenml[server]>=0.52.0 +notebook diff --git a/starter/README.md b/starter/README.md deleted file mode 100644 index aa76203..0000000 --- a/starter/README.md +++ /dev/null @@ -1,53 +0,0 @@ -# πŸ“œ ZenML Starter Project Template - -What would you need to get a quick understanding of the ZenML framework and -start building your own ML pipelines? The answer is a simple project template -to cover the basics of ZenML: a collection of steps and pipelines, a stack -configuration and, to top it all off, a simple but useful CLI. This is exactly -what the ZenML starter template is all about. - -This project template is a good starting point for anyone starting out with -ZenML. It showcases the following fundamental ZenML concepts in a relatable -ML context: - -* designing [ZenML pipeline steps](https://docs.zenml.io/user-guide/starter-guide#pipelines-and-steps) -in general, but also particularly useful for the following applications: - * data ingestion, data transformation and data train/test splitting - * model training and evaluation -* using [step parameterization and caching](https://docs.zenml.io/user-guide/starter-guide/cache-previous-executions) -to design flexible and reusable steps -* using [custom data types for your artifacts and writing materializers for them](https://docs.zenml.io/user-guide/advanced-guide/artifact-management/handle-custom-data-types) -* constructing and running a [ZenML pipeline](https://docs.zenml.io/user-guide/starter-guide#pipelines-and-steps) -* accessing ZenML pipeline run artifacts in [the post-execution phase](https://docs.zenml.io/user-guide/starter-guide/fetch-runs-after-execution), -after a pipeline run has concluded -* best practices for implementing and running reproducible and reliable ML -pipelines with ZenML - -In addition to that, the entire project is implemented with the [scikit-learn](https://scikit-learn.org) -library and showcases how to use ZenML with a popular ML framework. It makes -heavy use of the tabular datasets and classification models that scikit-learn -provides, but the concepts and patterns it showcases are applicable to any -other ML framework. 
- -## πŸ“ƒ Template Parameters - -| Parameter | Description | Default | -|-----------|-------------|---------| -| Name | The name of the person/entity holding the copyright | ZenML GmbH | -| Email | The email of the person/entity holding the copyright | info@zenml.io | -| Project Name | Short name for your project | ZenML Starter | -| Project Slug | A slugified version of the project name (automatically generated from the project name) | zenml_starter | -| Project Version | The version of your project | 0.1.0 | -| Project License | The license under which your project will be released (one of `Apache Software License 2.0`, `MIT license`, `BSD license`, `ISC license`, `GNU General Public License v3` and `Not open source`) | Apache Software License 2.0 | -| Auto-Format | Whether to automatically format and cleanup the generated code with [black](https://black.readthedocs.io/), [ruff](https://beta.ruff.rs/docs/) and [autoflake](https://github.com/PyCQA/autoflake) (yes/no). You also need to have these Python packages installed for this option to take effect. | no | -| Use ZenML Step Params | Whether to showcase using parameters for the ZenML steps in the project (yes/no). If selected, all generated ZenML pipeline steps will be parameterized. | yes | -| Use ZenML Materializers | Whether to showcase using custom data types for the ZenML artifacts in the project (yes/no). If selected, the generated code will demonstrate the use of custom artifact data types and materializers in the generated steps and pipelines. | yes | -| UCI Dataset | The name of the UCI provided scikit-learn dataset to use in the project (one of `Iris`, `Breast Cancer` and `Wine`) | Wine | -| Scikit-learn Model | The name of the scikit-learn classifier model to use in the project (one of `Logistic Regression`, `SVC`, `Linear SVC`, `Random Forest`, `KNN`, `Gaussian NB`, `Perceptron`, `SGD Classifier` and `Decision Tree`) | Logistic Regression | -| Runtime Configurable Dataset | Whether to make the dataset a configurable parameter of the data loader step and CLI (yes/no). Only has effect if the step parameters were also selected. | yes | -| Runtime Configurable Model | Whether to make the model a configurable parameter of the model trainer step and CLI (yes/no). Only has effect if the step parameters were also selected. | yes | - -## πŸš€ Generate a ZenML Project - -Please see [the main README page](../README.md) for instructions on how to -generate a ZenML project from this template. diff --git a/starter/template/README.md b/starter/template/README.md deleted file mode 100644 index 0d04b69..0000000 --- a/starter/template/README.md +++ /dev/null @@ -1,113 +0,0 @@ -# {{project_name}} - -This is a basic supervised learning ML project built with the -ZenML framework and its scikit-learn integration. The project trains one or more -scikit-learn classification models to make predictions on one of the tabular -classification datasets provided by the scikit-learn library. 
The project was -generated from the [starter ZenML project template](https://github.com/zenml-io/zenml-project-templates/tree/main/starter) -with the following optional features enabled: -{%- if use_step_params == "y" %} -- parameterized ZenML steps -- ability to control the ZenML step parameters using the CLI -{%- if configurable_dataset == "y" %} -- ability to dynamically select and use a different dataset at runtime using the -CLI and/or a data loader step parameter -{%- endif %} -{%- if configurable_model == "y" %} -- ability to dynamically select and train a different classification model at -runtime using the CLI and/or a model trainer step parameter -{%- endif %} -{%- endif %} -{%- if use_custom_artifacts == "y" %} -- using custom data types for step artifacts and implementing custom materializers -{%- endif %} - -## πŸ‘‹ Introduction - -Welcome to your newly generated "{{project_name}}" project! This is -a great way to get started with ZenML. The project contains a collection of -basic ZenML steps, pipelines, stack configurations and other artifacts and -useful resources that can serve as a starting point for your journey with -ZenML. - -What to do first? You can start by giving the the project a quick run. The -project is ready to be used and can run as-is without any further code -changes! You can try it right away by installing ZenML, the scikit-learn -ZenML integration and then calling the CLI included in the project. We also -recommend that you start the ZenML UI locally to get a better sense of what -is going on under the hood: - -```bash -# Set up a Python virtual environment, if you haven't already -virtualenv .venv -source .venv/bin/activate -# Install requirements -pip install -r requirements.txt -# Start the ZenML UI locally (recommended, but optional); -# the default username is "admin" with an empty password -zenml up -# Run the pipeline included in the project -python run.py -``` - -When the pipeline is done running, you can check out the results in the ZenML -UI by following the link printed in the terminal (or you can go straight to -the [ZenML UI pipelines run page](http://127.0.0.1:8237/workspaces/default/all-runs?page=1). - -Next, you should: - -* look at the CLI help to see what you can do with the project: -```bash -python run.py --help -``` -* go back and [try out different parameters](https://github.com/zenml-io/zenml-project-templates/tree/main/starter#-template-parameters) -for your generated project. For example, you could enable generating step -parameters or custom materializers for your project, if you haven't already. -* take a look at [the project structure](#πŸ“œ-project-structure) and the code -itself. The code is heavily commented and should be easy to follow. -* read the [ZenML documentation](https://docs.zenml.io) to learn more about -various ZenML concepts referenced in the code and to get a better sense of -what you can do with ZenML. -* start building your own ZenML project by modifying this code - -## πŸ“¦ What's in the box? 
- -The {{ project_name }} project showcases a basic ZenML model -training pipeline with all the usual components you would expect to find in -a simple machine learning project such as this one: - -- a data ingestion step that loads one of the datasets provided through -scikit-learn -- a data processing step that does some basic preprocessing (drops rows with -missing values, normalizes the data) -- a data splitting step that breaks the data into train and test sets -- a training step that trains one of the scikit-learn models on the train set -- a model evaluation step that evaluates the trained model on the train and test -sets and warns or fails the pipeline if the model performance is below a -certain threshold - -The project code is meant to be used as a template for your own projects. For -this reason, you will find a number of places in the code specifically marked -to indicate where you can add your own code: - -```python -### ADD YOUR OWN CODE HERE - THIS IS JUST AN EXAMPLE ### -... -### YOUR CODE ENDS HERE ### -``` - -## πŸ“œ Project Structure - -The project loosely follows [the recommended ZenML project structure](https://docs.zenml.io/user-guide/starter-guide/follow-best-practices): - -``` -β”œβ”€β”€ pipelines <- All pipelines in one place -β”‚ β”œβ”€β”€ model_training.py <- The main (training) pipeline -β”œβ”€β”€ steps <- All steps in one place -β”‚ β”œβ”€β”€ data_loaders.py <- Data loader/processor/splitter steps -β”‚ β”œβ”€β”€ model_trainers.py <- Model trainer/evaluator steps -β”œβ”€β”€ .dockerignore -β”œβ”€β”€ README.md <- This file -β”œβ”€β”€ requirements.txt <- Python dependencies -└── run.py <- CLI entrypoint -``` diff --git a/starter/template/__init__.py b/starter/template/__init__.py deleted file mode 100644 index e08729f..0000000 --- a/starter/template/__init__.py +++ /dev/null @@ -1,2 +0,0 @@ -{% include 'templates/license_header' %} -__version__ = "{{ version }}" \ No newline at end of file diff --git a/starter/template/pipelines/__init__.py b/starter/template/pipelines/__init__.py deleted file mode 100644 index fa0b4e3..0000000 --- a/starter/template/pipelines/__init__.py +++ /dev/null @@ -1,7 +0,0 @@ -{% include 'templates/license_header' %} - -from pipelines.model_training import model_training_pipeline - -__all__ = [ - "model_training_pipeline", -] \ No newline at end of file diff --git a/starter/template/pipelines/model_training.py b/starter/template/pipelines/model_training.py deleted file mode 100644 index 143ba9f..0000000 --- a/starter/template/pipelines/model_training.py +++ /dev/null @@ -1,54 +0,0 @@ -{% include 'templates/license_header' %} - -from zenml.pipelines import pipeline - -@pipeline() -def model_training_pipeline( - data_loader, - data_processor, - data_splitter, - model_trainer, - model_evaluator, -): - """ - Model training pipeline recipe. - - This is a recipe for a pipeline that loads the data, processes it and - splits it into train and test sets, then trains and evaluates a model - on it. It is agnostic of the actual step implementations and just defines - how the artifacts are circulated through the steps by calling them in the - right order and passing the output of one step as the input of the next - step. - - The arguments that this function takes are instances of the steps that - are defined in the steps folder. Also note that the arguments passed to - the steps are step artifacts. 
If you use step parameters to configure the - steps, they must not be used here, but instead be used when the steps are - instantiated, before this function is called. - - Args: - data_loader: A data loader step instance that outputs a dataset. - data_processor: A data processor step instance that takes a dataset - as input and outputs a processed dataset. - data_splitter: A data splitter step instance that takes a dataset - as input and outputs a train and test set split. - model_trainer: A model trainer step instance that takes a train set - as input and outputs a trained model. - model_evaluator: A model evaluator step instance that takes a train/test - set split and a trained model as input and outputs model evaluation - metrics. - - """ - ### ADD YOUR OWN CODE HERE - THIS IS JUST AN EXAMPLE ### - # Link all the steps together by calling them and passing the output - # of one step as the input of the next step. - dataset = data_loader() - processed_dataset = data_processor(dataset=dataset) - train_set, test_set = data_splitter(dataset=processed_dataset) - model = model_trainer(train_set=train_set) - model_evaluator( - model=model, - train_set=train_set, - test_set=test_set, - ) - ### YOUR CODE ENDS HERE ### diff --git a/starter/template/requirements.txt b/starter/template/requirements.txt deleted file mode 100644 index 9fcc501..0000000 --- a/starter/template/requirements.txt +++ /dev/null @@ -1,2 +0,0 @@ -zenml[server] -scikit-learn \ No newline at end of file diff --git a/starter/template/run.py b/starter/template/run.py deleted file mode 100644 index 095b785..0000000 --- a/starter/template/run.py +++ /dev/null @@ -1,344 +0,0 @@ -{% include 'templates/license_header' %} - -import click -from typing import Any, Dict, Optional -from steps import ( - data_loader, - data_processor, - data_splitter, - model_trainer, - model_evaluator, -{%- if use_step_params %} -{%- if configurable_dataset %} - SklearnDataset, - DataLoaderStepParameters, -{%- endif %} - DataProcessorStepParameters, - DataSplitterStepParameters, - ModelTrainerStepParameters, - ModelEvaluatorStepParameters, -{%- if configurable_model %} - SklearnClassifierModel, -{%- endif %} -{%- endif %} -) -from pipelines import ( - model_training_pipeline, -) - -{% if use_step_params %} -def process_hyper_parameters(params: Optional[str] = None) -> Dict[str, Any]: - """Process hyper parameters entered by the user from the command line. - - This function is used to parse hyper parameters entered by the user in - the command line from a key-value string format (e.g. "C=0.1,max_iter=1000") - to a dictionary with the correct data types (int, float, bool). - - Args: - params: A string of comma-separated key-value pairs. - - Returns: - A dictionary of hyper parameters converted to the correct type (int, - float, bool) - """ - if not params: - return {} - try: - params = params.split(",") - params = [param.split("=") for param in params] - params = {key: value for key, value in params} - except ValueError: - raise ValueError( - "Invalid format for hyperparameters. " - "Expected a comma-separated list of key-value pairs " - "(e.g. 'C=0.1,max_iter=1000')." 
- ) - for key, value in params.items(): - try: - params[key] = int(value) - continue - except ValueError: - pass - try: - params[key] = float(value) - continue - except ValueError: - pass - if value.lower() == "true": - params[key] = True - continue - if value.lower() == "false": - params[key] = False - continue - return params - - -{% endif %} -@click.command(help=""" -{{ project_name }} CLI v{{ version }}. - -Run the {{ project_name }} model training pipeline with various -options. - -Examples: - - \b - # Run the pipeline with default options - python run.py - - \b - # Run the pipeline with caching disabled - python run.py --no-cache - -{%- if use_step_params and configurable_dataset %} - - \b - # Run the pipeline with a different dataset - python run.py --dataset=breast_cancer -{%- endif %} - -{%- if use_step_params and configurable_model %} - - \b - # Run the pipeline with a different model - python run.py --model=SVM -{%- endif %} - -{%- if use_step_params %} - - \b - # Run the pipeline with custom hyperparameters for the model training step - python run.py --hyperparameters="C=0.1,max_iter=1000" - - \b - # Run the pipeline with custom data splitter step parameters - python run.py --test-size=0.1 --no-stratify - - \b - # Run the pipeline with custom data processor step parameters - python run.py --drop-columns="alcohol,ash" --no-normalize - - \b - # Run the pipeline with a different random seed - python run.py --random-state=40 - - \b - # Change the model evaluation thresholds - python run.py --min-train-accuracy=0.98 --min-test-accuracy=0.98 --max-train-test-diff=0.05 --fail-on-eval-warnings -{%- endif %} -""" -) -@click.option( - "--no-cache", - is_flag=True, - default=False, - help="Disable caching for the pipeline run.", -) -{%- if use_step_params %} -{%- if configurable_dataset %} -@click.option( - "--dataset", - default="{{ sklearn_dataset_name }}", - type=click.Choice(SklearnDataset.values()), - help="The scikit-learn dataset to load.", -) -{%- endif %} -{%- if configurable_model %} -@click.option( - "--model", - default="{{ sklearn_model_name }}", - type=click.Choice(SklearnClassifierModel.values()), - help="The scikit-learn model to train.", -) -{%- endif %} -@click.option( - "--no-drop-na", - is_flag=True, - default=False, - help="Whether to skip dropping rows with missing values in the dataset.", -) -@click.option( - "--drop-columns", - default=None, - type=click.STRING, - help="Comma-separated list of columns to drop from the dataset.", -) -@click.option( - "--no-normalize", - is_flag=True, - default=False, - help="Whether to skip normalizing the dataset.", -) -@click.option( - "--test-size", - default=0.2, - type=click.FloatRange(0.0, 1.0), - help="Proportion of the dataset to include in the test split.", -) -@click.option( - "--no-shuffle", - is_flag=True, - default=False, - help="Whether to skip shuffling the data before splitting.", -) -@click.option( - "--no-stratify", - is_flag=True, - default=False, - help="Whether to skip stratifying the data before splitting.", -) -@click.option( - "--random-state", - default=42, - type=click.INT, - help="Controls the randomness during data shuffling and model training. " - "Pass an int for reproducible and cached output across multiple " - "pipeline runs.", -) -@click.option( - "--hyperparameters", - default=None, - type=click.STRING, - help="Comma-separated list of hyper-parameters to pass to the model " - "trainer (e.g. 
'C=0.1,max_iter=1000').", -) -@click.option( - "--min-train-accuracy", - default=0.8, - type=click.FloatRange(0.0, 1.0), - help="Minimum training accuracy to pass to the model evaluator.", -) -@click.option( - "--min-test-accuracy", - default=0.8, - type=click.FloatRange(0.0, 1.0), - help="Minimum test accuracy to pass to the model evaluator.", -) -@click.option( - "--max-train-test-diff", - default=0.1, - type=click.FloatRange(0.0, 1.0), - help="Maximum difference between training and test accuracy to pass to " - "the model evaluator.", -) -@click.option( - "--fail-on-eval-warnings", - is_flag=True, - default=False, - help="Whether to fail the pipeline run if the model evaluation step " - "finds that the model is not accurate enough.", -) -{%- endif %} -def main( - no_cache: bool = False, -{%- if use_step_params %} -{%- if configurable_dataset %} - dataset: str = "{{ sklearn_dataset_name }}", -{%- endif %} -{%- if configurable_model %} - model: str = "{{ sklearn_model_name }}", -{%- endif %} - no_drop_na: bool = False, - drop_columns: Optional[str] = None, - no_normalize: bool = False, - test_size: float = 0.2, - no_shuffle: bool = False, - no_stratify: bool = False, - random_state: int = 42, - hyperparameters: Optional[str] = None, - min_train_accuracy: float = 0.8, - min_test_accuracy: float = 0.8, - max_train_test_diff: float = 0.1, - fail_on_eval_warnings: bool = False, -{%- endif %} -): - """Main entry point for the pipeline execution. - - This entrypoint is where everything comes together: - - * instantiating the steps and configuring them with the required - parameters (some of which may come from command line arguments) - * creating a pipeline instance that brings together all step instances - * launching the pipeline - * extracting and looking at the artifacts logged by the pipeline run - """ - - # Initialize a pipeline. This is also where we instantiate the steps and - # configure them with the required parameters. The step instances are - # then passed to the pipeline constructor. The result is a pipeline - # instance that is ready to be run. - pipeline = model_training_pipeline( -{%- if use_step_params %} -{%- if configurable_dataset %} - data_loader=data_loader( - params=DataLoaderStepParameters( - dataset=SklearnDataset(dataset), - ), - ), -{%- else %} - data_loader=data_loader(), -{%- endif %} - data_processor=data_processor( - params=DataProcessorStepParameters( - drop_na=not no_drop_na, - drop_columns=drop_columns.split(",") if drop_columns else [], - normalize=not no_normalize, - ), - ), - data_splitter=data_splitter( - params=DataSplitterStepParameters( - test_size=test_size, - shuffle=not no_shuffle, - stratify=not no_stratify, - random_state=random_state, - ), - ), - model_trainer=model_trainer( - params=ModelTrainerStepParameters( -{%- if configurable_model %} - model=SklearnClassifierModel(model), -{%- endif %} - random_state=random_state, - hyperparameters=process_hyper_parameters(hyperparameters), - ), - ), - model_evaluator=model_evaluator( - params=ModelEvaluatorStepParameters( - min_train_accuracy=min_train_accuracy, - min_test_accuracy=min_test_accuracy, - max_train_test_accuracy_difference=max_train_test_diff, - fail_on_warnings=fail_on_eval_warnings, - ), - ), -{%- else %} - data_loader=data_loader(), - data_splitter=data_splitter(), - data_processor=data_processor(), - model_trainer=model_trainer(), - model_evaluator=model_evaluator(), -{%- endif %} - ) - - pipeline_args = {} - if no_cache: - pipeline_args["enable_cache"] = False - - # Run the pipeline. 
This executes all steps in the pipeline in the - # correct order using the orchestrator stack component that is configured - # in your active ZenML stack. - pipeline.run(**pipeline_args) - - - # TODO: - # extract the evaluator result and show here - # Point to the dashboard URL (if running); instruct to start the dashboard - # info on how to use the CLI to show pipeline run details. - # add experiment tracker to steps (flag) - - # materialization: - # * - create a dummy class (statistics, report, visualization) and use that - # in the pipeline step and post-execution - - -if __name__ == "__main__": - main() diff --git a/starter/template/steps/__init__.py b/starter/template/steps/__init__.py deleted file mode 100644 index cc087fd..0000000 --- a/starter/template/steps/__init__.py +++ /dev/null @@ -1,47 +0,0 @@ -{% include 'templates/license_header' %} - -from steps.data_loaders import ( - data_loader, - data_processor, - data_splitter, -{%- if use_step_params %} -{%- if configurable_dataset %} - DataLoaderStepParameters, - SklearnDataset, -{%- endif %} - DataProcessorStepParameters, - DataSplitterStepParameters, -{%- endif %} -) -from steps.model_trainers import ( - model_trainer, - model_evaluator, -{%- if use_step_params %} -{%- if configurable_model %} - SklearnClassifierModel, -{%- endif %} - ModelTrainerStepParameters, - ModelEvaluatorStepParameters, -{%- endif %} -) - -__all__ = [ - "data_loader", - "data_processor", - "data_splitter", - "model_trainer", - "model_evaluator", -{%- if use_step_params %} -{%- if configurable_dataset %} - "DataLoaderStepParameters", - "SklearnDataset", -{%- endif %} - "DataProcessorStepParameters", - "DataSplitterStepParameters", -{%- if configurable_model %} - "SklearnClassifierModel", -{%- endif %} - "ModelTrainerStepParameters", - "ModelEvaluatorStepParameters", -{%- endif %} -] \ No newline at end of file diff --git a/starter/template/steps/data_loaders.py b/starter/template/steps/data_loaders.py deleted file mode 100644 index 26ac788..0000000 --- a/starter/template/steps/data_loaders.py +++ /dev/null @@ -1,383 +0,0 @@ -{% include 'templates/license_header' %} -{%- if use_step_params %} -from typing import List -{%- endif %} -import pandas as pd - -from sklearn.datasets import ( -{%- if use_step_params and configurable_dataset %} - load_wine, - load_breast_cancer, - load_iris, -{%- else %} - load_{{ sklearn_dataset_name }}, -{%- endif %} -) -from sklearn.model_selection import train_test_split -{%- if use_step_params %} -from zenml.enums import StrEnum -{%- endif %} -from zenml.steps import ( -{%- if use_step_params %} - BaseParameters, -{%- endif %} - Output, - step, -) -from zenml.logger import get_logger - -logger = get_logger(__name__) - -{% if use_step_params and configurable_dataset %} -class SklearnDataset(StrEnum): - """Built-in scikit-learn datasets.""" - wine = "wine" - iris = "iris" - breast_cancer = "breast_cancer" - -class DataLoaderStepParameters(BaseParameters): - """Parameters for the data loader step. - - This is an example of how to use step parameters to make your data loader - step configurable independently of the step code. This is useful for example - if you want to load different datasets or different versions of the same - dataset in your pipeline without having to change the step code. - """ - - ### ADD YOUR OWN CODE HERE - THIS IS JUST AN EXAMPLE ### - # The name of the built-in scikit-learn dataset to load. 
- dataset: SklearnDataset = SklearnDataset.{{ sklearn_dataset_name }} - ### YOUR CODE ENDS HERE ### - - class Config: - """Pydantic config class. - - This is used to configure the behavior of Pydantic, the library used to - parse and validate step parameters. See the documentation for more - information: - - https://pydantic-docs.helpmanual.io/usage/model_config/ - - It is recommended to explicitly forbid extra parameters here to ensure - that the step parameters are always valid. - """ - extra = "forbid" - - -@step -def data_loader( - params: DataLoaderStepParameters, -) -> pd.DataFrame: - """Data loader step. - - This is an example of a data loader step that is usually the first step - in your pipeline. It reads data from an external source like a file, - database or 3rd party library, then formats it and returns it as a step - output artifact. - - This step is parameterized using the `DataLoaderStepParameters` class, which - allows you to configure the step independently of the step code, before - running it in a pipeline. In this example, the step can be configured to - load different built-in scikit-learn datasets. See the documentation for - more information: - - https://docs.zenml.io/user-guide/starter-guide/cache-previous-executions - - Data loader steps should have caching disabled if they are not deterministic - (i.e. if they data they load from the external source can be different when - they are subsequently called, even if the step code and parameter values - don't change). - - Args: - params: Parameters for the data loader step. - - Returns: - The loaded dataset artifact. - """ - ### ADD YOUR OWN CODE HERE - THIS IS JUST AN EXAMPLE ### - # Load the dataset indicated in the step parameters and format it as a - # pandas DataFrame - if params.dataset == SklearnDataset.wine: - dataset = load_wine(as_frame=True).frame - elif params.dataset == SklearnDataset.iris: - dataset = load_iris(as_frame=True).frame - elif params.dataset == SklearnDataset.breast_cancer: - dataset = load_breast_cancer(as_frame=True).frame - elif params.dataset == SklearnDataset.diabetes: - dataset = load_diabetes(as_frame=True).frame - logger.info(f"Loaded dataset {params.dataset.value}: %s", dataset.info()) - logger.info(dataset.head()) - ### YOUR CODE ENDS HERE ### - - return dataset -{% else %} -@step -def data_loader() -> Output( - dataset=pd.DataFrame, -): - """Data loader step. - - This is an example of a data loader step that is usually the first step - in your pipeline. It reads data from an external source like a file, - database or 3rd party library, then formats it and returns it as a step - output artifact. - - Data loader steps should have caching disabled if they are not deterministic - (i.e. if they data they load from the external source is different when - they are subsequently called, even if the step code doesn't change). - - As an alternative, try modelling the data source as a step parameter to make - your data loader deterministic and configurable without the need to change - the step implementation. See the documentation for more information: - - https://docs.zenml.io/user-guide/starter-guide/cache-previous-executions - - Returns: - The loaded dataset artifact. 
- """ - ### ADD YOUR OWN CODE HERE - THIS IS JUST AN EXAMPLE ### - # Load the {{ sklearn_dataset_name }} dataset and format it as a pandas DataFrame - dataset = load_{{ sklearn_dataset_name }}(as_frame=True).frame - dataset.info() - logger.info(dataset.head()) - ### YOUR CODE ENDS HERE ### - - return dataset -{% endif %} -{% if use_step_params %} -class DataProcessorStepParameters(BaseParameters): - """Parameters for the data processor step. - - This is an example of how to use step parameters to make your data processor - step configurable independently of the step code. This is useful for example - if you want to change the way your process data in your pipeline without - having to change the step code. - """ - - ### ADD YOUR OWN CODE HERE - THIS IS JUST AN EXAMPLE ### - # Whether to drop rows with missing values. - drop_na: bool = True - # Columns to drop from the dataset. - drop_columns: List[str] = [] - # Whether to normalize the data. - normalize: bool = True - ### YOUR CODE ENDS HERE ### - - class Config: - """Pydantic config class. - - This is used to configure the behavior of Pydantic, the library used to - parse and validate step parameters. See the documentation for more - information: - - https://pydantic-docs.helpmanual.io/usage/model_config/ - - It is recommended to explicitly forbid extra parameters here to ensure - that the step parameters are always valid. - """ - extra = "forbid" - - -@step -def data_processor( - params: DataProcessorStepParameters, - dataset: pd.DataFrame, -) -> pd.DataFrame: - """Data processor step. - - This is an example of a data processor step that prepares the data so that - it is suitable for model training. It takes in a dataset as an input step - artifact and performs any necessary preprocessing steps like cleaning, - feature engineering, feature selection, etc. It then returns the processed - dataset as a step output artifact. - - This step is parameterized using the `DataProcessorStepParameters` class, - which allows you to configure the step independently of the step code, - before running it in a pipeline. In this example, the step can be configured - to perform or skip different preprocessing steps (e.g. dropping rows with - missing values, dropping columns, normalizing the data, etc.). See the - documentation for more information: - - https://docs.zenml.io/user-guide/starter-guide/cache-previous-executions - - Args: - params: Parameters for the data processor step. - dataset: The dataset artifact to process. - - Returns: - The processed dataset artifact. - """ - ### ADD YOUR OWN CODE HERE - THIS IS JUST AN EXAMPLE ### - if params.drop_na: - # Drop rows with missing values - dataset = dataset.dropna() - if params.drop_columns: - # Drop columns - dataset = dataset.drop(columns=params.drop_columns) - if params.normalize: - # Normalize the data - target = dataset.pop('target') - dataset = (dataset - dataset.mean()) / dataset.std() - dataset['target'] = target - ### YOUR CODE ENDS HERE ### - - return dataset - - -class DataSplitterStepParameters(BaseParameters): - """Parameters for the data splitter step. - - This is an example of how to use step parameters to make your data splitter - step configurable independently of the step code. This is useful for example - if you want to change the ratio for the data split or if you want to - control the random seed used for the split without having to change the step - code. - """ - - ### ADD YOUR OWN CODE HERE - THIS IS JUST AN EXAMPLE ### - # The proportion of the dataset to include in the test split. 
- test_size: float = 0.2 - # The random seed to use for the split. - random_state: int = 42 - # Whether to shuffle the dataset before splitting. - shuffle: bool = True - # Whether to stratify the split. - stratify: bool = True - ### YOUR CODE ENDS HERE ### - - class Config: - """Pydantic config class. - - This is used to configure the behavior of Pydantic, the library used to - parse and validate step parameters. See the documentation for more - information: - - https://pydantic-docs.helpmanual.io/usage/model_config/ - - It is recommended to explicitly forbid extra parameters here to ensure - that the step parameters are always valid. - """ - extra = "forbid" - - -@step -def data_splitter( - params: DataSplitterStepParameters, - dataset: pd.DataFrame, -) -> Output( - train_set=pd.DataFrame, - test_set=pd.DataFrame, -): - """Data splitter step. - - This is an example of a data splitter step that splits the dataset into - training and dev subsets to be used for model training and evaluation. It - takes in a dataset as an step input artifact and returns the training and - dev subsets as two separate step output artifacts. - - Data splitter steps should have a deterministic behavior, i.e. they should - use a fixed random seed and always return the same split when called with - the same input dataset. This is to ensure reproducibility of your pipeline - runs. - - This step is parameterized using the `DataSplitterStepParameters` class, - which allows you to configure the step independently of the step code, - before running it in a pipeline. In this example, the step can be configured - to use a different random seed, change the split ratio, or control whether - to shuffle or stratify the split. See the documentation for more - information: - - https://docs.zenml.io/user-guide/starter-guide/cache-previous-executions - - Args: - params: Parameters for the data splitter step. - dataset: The dataset to split. - - Returns: - The resulting training and dev subsets. - """ - - ### ADD YOUR OWN CODE HERE - THIS IS JUST AN EXAMPLE ### - # Split the dataset into training and dev subsets - train_set, test_set = train_test_split( - dataset, - test_size=params.test_size, - shuffle=params.shuffle, - stratify=dataset['target'] if params.stratify else None, - random_state=params.random_state, - ) - ### YOUR CODE ENDS HERE ### - - return train_set, test_set -{% else %} -@step -def data_processor(dataset: pd.DataFrame) -> pd.DataFrame: - """Data processor step. - - This is an example of a data processor step that prepares the data so that - it is suitable for model training. It takes in a dataset as an step input - artifact and performs any necessary preprocessing steps like cleaning, feature - engineering, feature selection, etc. The processed dataset is then returned - as an step output artifact. - - Args: - dataset: The dataset artifact to process. - - Returns: - The processed dataset artifact. - """ - ### ADD YOUR OWN CODE HERE - THIS IS JUST AN EXAMPLE ### - # Drop rows with missing values - dataset = dataset.dropna() - # Normalize the data - target = dataset.pop('target') - dataset = (dataset - dataset.mean()) / dataset.std() - dataset['target'] = target - ### YOUR CODE ENDS HERE ### - - return dataset - - -@step -def data_splitter(dataset: pd.DataFrame) -> Output( - train_set=pd.DataFrame, - test_set=pd.DataFrame, -): - """Data splitter step. - - This is an example of a data splitter step that splits the dataset into - training and dev subsets to be used for model training and evaluation. 
It - takes in a dataset as a step input artifact and returns the training and - dev subsets as two separate step output artifacts. - - Data splitter steps should have a deterministic behavior, i.e. they should - use a fixed random seed and always return the same split when called with - the same input dataset. This is to ensure reproducibility of your pipeline - runs. - - As an alternative to hard-coding the seed in the step code, try modelling - the random seed as a step parameter to make your data loader deterministic. - See the documentation for more information: - - https://docs.zenml.io/user-guide/starter-guide/cache-previous-executions - - Args: - dataset: The dataset to split. - - Returns: - The resulting training and dev subsets. - """ - - ### ADD YOUR OWN CODE HERE - THIS IS JUST AN EXAMPLE ### - # Split the dataset into training and dev subsets - train_set, test_set = train_test_split( - dataset, - test_size=0.2, - shuffle=True, - stratify=dataset['target'], - random_state=42 - ) - ### YOUR CODE ENDS HERE ### - - return train_set, test_set -{%- endif %} diff --git a/starter/template/steps/model_trainers.py b/starter/template/steps/model_trainers.py deleted file mode 100644 index a8ae7b4..0000000 --- a/starter/template/steps/model_trainers.py +++ /dev/null @@ -1,499 +0,0 @@ -{% include 'templates/license_header' %} - -{%- if use_step_params %} -from typing import Any, Dict -{%- endif %} -import pandas as pd - -from sklearn.base import ClassifierMixin -{%- if use_step_params and configurable_model %} -from sklearn.linear_model import LogisticRegression -from sklearn.svm import SVC, LinearSVC -from sklearn.ensemble import RandomForestClassifier -from sklearn.neighbors import KNeighborsClassifier -from sklearn.naive_bayes import GaussianNB -from sklearn.linear_model import Perceptron -from sklearn.linear_model import SGDClassifier -from sklearn.tree import DecisionTreeClassifier -{%- elif sklearn_model_name == 'LogisticRegression' %} -from sklearn.linear_model import LogisticRegression -{%- elif sklearn_model_name == 'SVC' %} -from sklearn.svm import SVC -{%- elif sklearn_model_name == 'LinearSVC' %} -from sklearn.svm import LinearSVC -{%- elif sklearn_model_name == 'RandomForestClassifier' %} -from sklearn.ensemble import RandomForestClassifier -{%- elif sklearn_model_name == 'KNeighborsClassifier' %} -from sklearn.neighbors import KNeighborsClassifier -{%- elif sklearn_model_name == 'GaussianNB' %} -from sklearn.naive_bayes import GaussianNB -{%- elif sklearn_model_name == 'Perceptron' %} -from sklearn.linear_model import Perceptron -{%- elif sklearn_model_name == 'SGDClassifier' %} -from sklearn.linear_model import SGDClassifier -{%- elif sklearn_model_name == 'DecisionTreeClassifier' %} -from sklearn.tree import DecisionTreeClassifier -{%- endif %} -{% if use_custom_artifacts %} -from artifacts import ModelMetadata -from materializers import ModelMetadataMaterializer -{%- endif %} - -{%- if use_step_params %} -from zenml.enums import StrEnum -{%- endif %} -from zenml.logger import get_logger -from zenml.steps import ( -{%- if use_step_params %} - BaseParameters, -{%- endif %} - Output, - step, -) - -logger = get_logger(__name__) - -{% if use_step_params %} -{%- if configurable_model %} -class SklearnClassifierModel(StrEnum): - """Scikit-learn models used for classification.""" - LogisticRegression = "LogisticRegression" - SVC = "SVC" - LinearSVC = "LinearSVC" - RandomForestClassifier = "RandomForestClassifier" - KNeighborsClassifier = "KNeighborsClassifier" - GaussianNB = 
"GaussianNB" - Perceptron = "Perceptron" - SGDClassifier = "SGDClassifier" - DecisionTreeClassifier = "DecisionTreeClassifier" -{%- endif %} - -class ModelTrainerStepParameters(BaseParameters): - """Parameters for the model trainer step. - - This is an example of how to use step parameters to make your model trainer - step configurable independently of the step code. This is useful for example - if you want to try out different models in your pipeline without having to - change the step code. - """ - - ### ADD YOUR OWN CODE HERE - THIS IS JUST AN EXAMPLE ### -{%- if configurable_model %} - # The name of the scikit-learn classifier model to train. - model: SklearnClassifierModel = SklearnClassifierModel.{{ sklearn_model_name }} -{%- endif %} - # The random seed to use for reproducibility. - random_state: int = 42 - # The parameters to pass to the model constructor. - hyperparameters: Dict[str, Any] = {} - ### YOUR CODE ENDS HERE ### - - class Config: - """Pydantic config class. - - This is used to configure the behavior of Pydantic, the library used to - parse and validate step parameters. See the documentation for more - information: - - https://pydantic-docs.helpmanual.io/usage/model_config/ - - It is recommended to explicitly forbid extra parameters here to ensure - that the step parameters are always valid. - """ - extra = "forbid" - - -@step -def model_trainer( - params: ModelTrainerStepParameters, - train_set: pd.DataFrame, -) -> ClassifierMixin: - """Configure and train a model on the training dataset. - - This is an example of a model training step that takes in a dataset artifact - previously loaded and pre-processed by other steps in your pipeline, then - configures and trains a model on it. The model is then returned as a step - output artifact. - - Model training steps should have caching disabled if they are not - deterministic (i.e. if the model training involve some random processes - like initializing weights or shuffling data that are not controlled by - setting a fixed random seed). This example step ensures the outcome is - deterministic by initializing the model with a fixed random seed. - - This step is parameterized using the `ModelTrainerStepParameters` class, - which allows you to configure the step independently of the step code, -{%- if configurable_model %} - before running it in a pipeline. In this example, the step can be configured - to use a different model, change the random seed, or pass different - hyperparameters to the model constructor. See the documentation for more - information: -{%- else %} - before running it in a pipeline. In this example, the step can be configured - to change the random seed, or pass different hyperparameters to the model - constructor. See the documentation for more information: -{%- endif %} - - https://docs.zenml.io/user-guide/starter-guide/cache-previous-executions - - Args: - params: The parameters for the model trainer step. - train_set: The training data set artifact. - - Returns: - The trained model artifact. - """ - X_train = train_set.drop("target", axis=1) - Y_train = train_set["target"] - - ### ADD YOUR OWN CODE HERE - THIS IS JUST AN EXAMPLE ### - # Initialize the model with the hyperparameters indicated in the step - # parameters and train it on the training set. 
-{%- if configurable_model %} - if params.model == SklearnClassifierModel.LogisticRegression: - model = LogisticRegression( - random_state=params.random_state, - **params.hyperparameters, - ) - elif params.model == SklearnClassifierModel.SVC: - model = SVC( - random_state=params.random_state, - **params.hyperparameters, - ) - elif params.model == SklearnClassifierModel.LinearSVC: - model = LinearSVC( - random_state=params.random_state, - **params.hyperparameters, - ) - elif params.model == SklearnClassifierModel.RandomForestClassifier: - model = RandomForestClassifier( - random_state=params.random_state, - **params.hyperparameters, - ) - elif params.model == SklearnClassifierModel.KNeighborsClassifier: - model = KNeighborsClassifier(**params.hyperparameters) - elif params.model == SklearnClassifierModel.GaussianNB: - model = GaussianNB(**params.hyperparameters) - elif params.model == SklearnClassifierModel.Perceptron: - model = Perceptron( - random_state=params.random_state, - **params.hyperparameters, - ) - elif params.model == SklearnClassifierModel.SGDClassifier: - model = SGDClassifier( - random_state=params.random_state, - **params.hyperparameters - ) - elif params.model == SklearnClassifierModel.DecisionTreeClassifier: - model = DecisionTreeClassifier( - random_state=params.random_state, - **params.hyperparameters, - ) -{%- else %} -{%- if not sklearn_model_name in [ 'KNeighborsClassifier', 'GaussianNB' ] %} - model = {{ sklearn_model_name }}(random_state=42, **params.hyperparameters) -{%- else %} - model = {{ sklearn_model_name }}(**params.hyperparameters) -{%- endif %} -{%- endif %} - - logger.info(f"Training model {model}...") - model.fit(X_train, Y_train) - ### YOUR CODE ENDS HERE ### - - return model - - -class ModelEvaluatorStepParameters(BaseParameters): - """Parameters for the model evaluator step. - - This is an example of how to use step parameters to make your model - evaluator step configurable independently of the step code. This is useful - for example if you want to control the acceptable thresholds for your model - metrics in your pipeline without having to change the step code. - """ - - ### ADD YOUR OWN CODE HERE - THIS IS JUST AN EXAMPLE ### - # The minimum acceptable accuracy on the train set. - min_train_accuracy: float = 0.8 - # The minimum acceptable accuracy on the test set. - min_test_accuracy: float = 0.8 - # The maximum acceptable difference between train and test accuracy. - max_train_test_accuracy_difference: float = 0.1 - # Whether to raise an error and fail the pipeline step if the model - # performance does not meet the minimum criteria. - fail_on_warnings: bool = False - ### YOUR CODE ENDS HERE ### - - class Config: - """Pydantic config class. - - This is used to configure the behavior of Pydantic, the library used to - parse and validate step parameters. See the documentation for more - information: - - https://pydantic-docs.helpmanual.io/usage/model_config/ - - It is recommended to explicitly forbid extra parameters here to ensure - that the step parameters are always valid. - """ - extra = "forbid" - - -@step -{%- if use_custom_artifacts -%} -(output_materializers=ModelMetadataMaterializer) -{%- endif %} -def model_evaluator( - params: ModelEvaluatorStepParameters, - model: ClassifierMixin, - train_set: pd.DataFrame, - test_set: pd.DataFrame, -{%- if use_custom_artifacts %} -) -> ModelMetadata: -{%- else %} -) -> Output( - train_accuracy=float, - test_accuracy=float, -): -{%- endif %} - """Evaluate a trained model. 
- - This is an example of a model evaluation step that takes in a model artifact - previously trained by another step in your pipeline, and a training - and validation data set pair which it uses to evaluate the model's -{%- if use_custom_artifacts %} - performance. The step returns a custom type of artifact containing metadata - about the trained model. Note that using a custom data type also requires - implementing a custom materializer for it. See the `materializer` folder - or the following ZenML docs for more information about materializers: - - https://docs.zenml.io/user-guide/advanced-guide/artifact-management/handle-custom-data-types -{%- else %} - performance. The model metrics are then returned as step output artifacts - (in this case, the model accuracy on the train and test set). -{%- endif %} - - The suggested step implementation also outputs some warnings if the model - performance does not meet some minimum criteria. This is just an example of - how you can use steps to monitor your model performance and alert you if - something goes wrong. As an alternative, you can raise an exception in the - step to force the pipeline run to fail early and all subsequent steps to - be skipped. - - This step is parameterized using the `ModelEvaluatorStepParameters` class, - which allows you to configure the step independently of the step code, - before running it in a pipeline. In this example, the step can be configured - to use different values for the acceptable model performance thresholds and - to control whether the pipeline run should fail if the model performance - does not meet the minimum criteria. See the documentation for more - information: - - https://docs.zenml.io/user-guide/starter-guide/cache-previous-executions - - Args: - params: The parameters for the model evaluator step. - model: The pre-trained model artifact. - train_set: The training data set artifact. - test_set: The test data set artifact. - - Returns: -{%- if use_custom_artifacts %} - A model metadata artifact. -{%- else %} - The model accuracy on the train and test set. -{%- endif %} - """ - X_train = train_set.drop("target", axis=1) - Y_train = train_set["target"] - X_test = test_set.drop("target", axis=1) - Y_test = test_set["target"] - - ### ADD YOUR OWN CODE HERE - THIS IS JUST AN EXAMPLE ### - # Calculate the model accuracy on the train and test set - train_acc = model.score(X_train, Y_train) - logger.info(f"Train accuracy: {train_acc}") - test_acc = model.score(X_test, Y_test) - logger.info(f"Test accuracy: {test_acc}") - - messages = [] - if train_acc < params.min_train_accuracy: - messages.append( - f"Train accuracy is below {params.min_train_accuracy*100}% !" - ) - if test_acc < params.min_test_accuracy: - messages.append( - f"Test accuracy is below {params.min_test_accuracy*100}% !" - ) - if test_acc - train_acc > params.max_train_test_accuracy_difference: - messages.append( - f"Train accuracy is more than " - f"{params.max_train_test_accuracy_difference*100}% " - f"higher than test accuracy. The model is overfitting the training " - f"dataset." 
- ) - if params.fail_on_warnings and messages: - raise RuntimeError( - "Model performance did not meet the minimum criteria:\n" + - "\n".join(messages) - ) - else: - for message in messages: - logger.warning(message) -{% if use_custom_artifacts %} - model_metadata = ModelMetadata() - model_metadata.collect_metadata( - model = model, - train_accuracy = train_acc, - test_accuracy = test_acc, - ) - return model_metadata -{%- else %} - return train_acc, test_acc -{%- endif %} - ### YOUR CODE ENDS HERE ### - -{% else %} -@step -def model_trainer( - train_set: pd.DataFrame, -) -> ClassifierMixin: - """Configure and train a model on the training dataset. - - This is an example of a model training step that takes in a dataset artifact - previously loaded and pre-processed by other steps in your pipeline, then - configures and trains a model on it. The model is then returned as a step - output artifact. - - Model training steps should have caching disabled if they are not - deterministic (i.e. if the model training involve some random processes - like initializing weights or shuffling data that are not controlled by - setting a fixed random seed). This example step ensures the outcome is - deterministic by initializing the model with a fixed random seed. - - As an alternative, try modelling the random seed as a step parameter to make - your model training steps deterministic. Another way step parameters may be - useful here is to configure the model hyperparameters rather than - hard-coding them in the step implementation. See the documentation for more - information: - - https://docs.zenml.io/user-guide/starter-guide/cache-previous-executions - - Args: - train_set: The training data set artifact. - - Returns: - The trained model artifact. - """ - X_train = train_set.drop("target", axis=1) - Y_train = train_set["target"] - - ### ADD YOUR OWN CODE HERE - THIS IS JUST AN EXAMPLE ### - # Train a model on the training set. -{%- if not sklearn_model_name in [ 'KNeighborsClassifier', 'GaussianNB' ] %} - model = {{ sklearn_model_name }}(random_state=42) -{%- else %} - model = {{ sklearn_model_name }}() -{%- endif %} - logger.info(f"Training model {model}...") - - model.fit(X_train.to_numpy(), Y_train.to_numpy()) - ### YOUR CODE ENDS HERE ### - - return model - - -@step -{%- if use_custom_artifacts -%} -(output_materializers=ModelMetadataMaterializer) -{%- endif %} -def model_evaluator( - model: ClassifierMixin, - train_set: pd.DataFrame, - test_set: pd.DataFrame, -{%- if use_custom_artifacts %} -) -> ModelMetadata: -{%- else %} -) -> Output( - train_accuracy=float, - test_accuracy=float, -): -{%- endif %} - """Evaluate a trained model. - - This is an example of a model evaluation step that takes in a model artifact - previously trained by another step in your pipeline, and a training - and validation data set pair which it uses to evaluate the model's -{%- if use_custom_artifacts %} - performance. The step returns a custom type of artifact containing metadata - about the trained model. Note that using a custom data type also requires - implementing a custom materializer for it. See the `materializer` folder - or the following ZenML docs for more information about materializers: - - https://docs.zenml.io/user-guide/advanced-guide/artifact-management/handle-custom-data-types -{%- else %} - performance. The model metrics are then returned as step output artifacts - (in this case, the model accuracy on the train and test set). 
-{%- endif %} - - The suggested step implementation also outputs some warnings if the model - performance does not meet some minimum criteria. This is just an example of - how you can use steps to monitor your model performance and alert you if - something goes wrong. As an alternative, you can raise an exception in the - step to force the pipeline run to fail early and all subsequent steps to - be skipped. - - The threshold performance values used to evaluate the model are hard-coded - in this example. As an alternative, try modelling them as step parameters to - make your model evaluation steps more flexible. See the documentation for - more information: - - https://docs.zenml.io/user-guide/starter-guide/cache-previous-executions - - Args: - model: The pre-trained model artifact. - train_set: The training data set artifact. - test_set: The test data set artifact. - - Returns: -{%- if use_custom_artifacts %} - A model metadata artifact. -{%- else %} - The model accuracy on the train and test set. -{%- endif %} - """ - X_train = train_set.drop("target", axis=1) - Y_train = train_set["target"] - X_test = test_set.drop("target", axis=1) - Y_test = test_set["target"] - - ### ADD YOUR OWN CODE HERE - THIS IS JUST AN EXAMPLE ### - # Calculate the model accuracy on the train and test set - train_acc = model.score(X_train, Y_train) - logger.info(f"Train accuracy: {train_acc}") - test_acc = model.score(X_test, Y_test) - logger.info(f"Train accuracy: {test_acc}") - - if train_acc < 0.8: - logger.warning("Train accuracy is below 80% !") - if test_acc < 0.8: - logger.warning("Test accuracy is below 80% !") - if test_acc - train_acc > 0.1: - logger.warning( - "Train accuracy is more than 10% higher than test accuracy. The " - "model is overfitting the training dataset." - ) -{%- if use_custom_artifacts %} - model_metadata = ModelMetadata() - model_metadata.collect_metadata( - model = model, - train_accuracy = train_acc, - test_accuracy = test_acc, - ) - return model_metadata -{%- else %} - return train_acc, test_acc -{%- endif %} - ### YOUR CODE ENDS HERE ### - -{%- endif %} diff --git a/starter/template/{% if open_source_license %}LICENSE{% endif %} b/starter/template/{% if open_source_license %}LICENSE{% endif %} deleted file mode 100644 index 4179465..0000000 --- a/starter/template/{% if open_source_license %}LICENSE{% endif %} +++ /dev/null @@ -1 +0,0 @@ -{% include 'templates/license' %} \ No newline at end of file diff --git a/starter/template/{% if use_custom_artifacts %}artifacts{% endif %}/__init__.py b/starter/template/{% if use_custom_artifacts %}artifacts{% endif %}/__init__.py deleted file mode 100644 index ff9f4ab..0000000 --- a/starter/template/{% if use_custom_artifacts %}artifacts{% endif %}/__init__.py +++ /dev/null @@ -1,7 +0,0 @@ -{% include 'templates/license_header' %} - -from artifacts.model_metadata import ModelMetadata - -__all__ = [ - "ModelMetadata", -] diff --git a/starter/template/{% if use_custom_artifacts %}artifacts{% endif %}/model_metadata.py b/starter/template/{% if use_custom_artifacts %}artifacts{% endif %}/model_metadata.py deleted file mode 100644 index 39d0082..0000000 --- a/starter/template/{% if use_custom_artifacts %}artifacts{% endif %}/model_metadata.py +++ /dev/null @@ -1,75 +0,0 @@ -{% include 'templates/license_header' %} -from typing import Any, Dict -from sklearn.base import ClassifierMixin - -class ModelMetadata: - """A custom artifact that stores model metadata. 
- - A model metadata object gathers together information that is collected - about the model being trained in a training pipeline run. This data type - is used for one of the artifacts returned by the model evaluation step. - - This is an example of a *custom artifact data type*: a type returned by - one of the pipeline steps that isn't natively supported by the ZenML - framework. Custom artifact data types are a common occurrence in ZenML, - usually encountered in one of the following circumstances: - - - you use a third party library that is not covered as a ZenML integration - and you model one or more step artifacts from the data types provided by - this library (e.g. datasets, models, data validation profiles, model - evaluation results/reports etc.) - - you need to use one of your own data types as a step artifact and it is - not one of the basic Python artifact data types supported by the ZenML - framework (e.g. str, int, float, dictionaries, lists, etc.) - - you want to extend one of the artifact data types already natively - supported by ZenML (e.g. pandas.DataFrame or sklearn.ClassifierMixin) - to customize it with your own data and/or behavior. - - In all above cases, the ZenML framework lacks one very important piece of - information: it doesn't "know" how to convert the data into a format that - can be saved in the artifact store (e.g. on a filesystem or persistent - storage service like S3 or GCS). Saving and loading artifacts from the - artifact store is something called "materialization" in ZenML terms and - you need to provide this missing information in the form of a custom - materializer - a class that implements loading/saving artifacts from/to - the artifact store. Take a look at the `materializers` folder to see how a - custom materializer is implemented for this artifact data type. - - More information about custom step artifact data types and ZenML - materializers is available in the docs: - - https://docs.zenml.io/user-guide/advanced-guide/artifact-management/handle-custom-data-types - - """ - - ### ADD YOUR OWN CODE HERE - THIS IS JUST AN EXAMPLE ### - def __init__(self) -> None: - self.metadata: Dict[str, Any] = {} - - def collect_metadata( - self, - model: ClassifierMixin, - train_accuracy: float, - test_accuracy: float, - ) -> None: - """Gathers and stores metadata about a model. 
- - Args: - model: trained model - train_accuracy: model accuracy measured on the train set - test_accuracy: model accuracy measured on the test set - """ - self.metadata = dict( - model_type = model.__class__.__name__, - train_accuracy = train_accuracy, - test_accuracy = test_accuracy, - ) - - def print_report(self) -> None: - """Print a user-friendly report from the model metadata.""" - print(f""" -Model type: {self.metadata.get('model_type')} -Accuracy on train set: {self.metadata.get('train_accuracy')} -Accuracy on test set: {self.metadata.get('test_accuracy')} -""") - ### YOUR CODE ENDS HERE ### diff --git a/starter/template/{% if use_custom_artifacts %}materializers{% endif %}/__init__.py b/starter/template/{% if use_custom_artifacts %}materializers{% endif %}/__init__.py deleted file mode 100644 index 07569a9..0000000 --- a/starter/template/{% if use_custom_artifacts %}materializers{% endif %}/__init__.py +++ /dev/null @@ -1,7 +0,0 @@ -{% include 'templates/license_header' %} - -from materializers.model_metadata_materializer import ModelMetadataMaterializer - -__all__ = [ - "ModelMetadataMaterializer", -] diff --git a/starter/template/{% if use_custom_artifacts %}materializers{% endif %}/model_metadata_materializer.py b/starter/template/{% if use_custom_artifacts %}materializers{% endif %}/model_metadata_materializer.py deleted file mode 100644 index 7f6e77e..0000000 --- a/starter/template/{% if use_custom_artifacts %}materializers{% endif %}/model_metadata_materializer.py +++ /dev/null @@ -1,113 +0,0 @@ -{% include 'templates/license_header' %} -import os -from typing import Type - -import yaml - -from artifacts import ModelMetadata - -from zenml.enums import ArtifactType -from zenml.io import fileio -from zenml.materializers.base_materializer import BaseMaterializer - - -class ModelMetadataMaterializer(BaseMaterializer): - """Custom materializer for the `ModelMetadata` artifact data type. - - A materializer instructs ZenML about how to store (de-materialize) - the information from an artifact data type (ModelMetadata in this example) - into the artifact store and, conversely, loading (materializing) it back - into the artifact data type. Take a look at the `artifacts` folder for - additional information about custom artifact data types. - - When using custom data types for your artifacts, you must also supply - a custom materializer class that implements two simple I/O operations: - - - saving an artifact object to the the artifact store - - loading an artifact object from the artifact store - - For both of these operations, the ZenML framework supplies a URI - (`self.uri`) identifying the location in the artifact store where the - artifact is/should be located. Implementing them means transferring - the in-memory data stored in the artifact to the provided URI and - vice-versa. ZenML puts at your disposal a series of I/O utilities capable of - universally handling these URLs in the `zenml.io.fileio`, - `zenml.utils.io_utils` and `zenml.utils.yaml_utils` Python modules. 
- - More information about custom step artifact data types and ZenML - materializers is available in the docs: - - https://docs.zenml.io/user-guide/advanced-guide/artifact-management/handle-custom-data-types - - """ - - # This needs to point to the artifact data type(s) associated with the - # materializer - ASSOCIATED_TYPES = (ModelMetadata,) - ASSOCIATED_ARTIFACT_TYPE = ArtifactType.STATISTICS - - def save(self, model_metadata: ModelMetadata) -> None: - """Save (de-materialize) a model metadata artifact to the artifact store. - - This operation takes the information in the artifact (`model_metadata`) - and stores it in the artifact store at the `self.uri` URI location. - - This is usually implemented in one of two ways: - - - shown here: using the `zenml.io.fileio.open()` function or one of the - `zenml.utils.yaml_utils` wrappers to write the artifact data - directly to a file in the artifact store, similar to how you would use - the standard `open()` Python I/O. - - saving the artifact to a temporary location on your local filesystem - and then copying it to the artifact store using the `zenml.io.fileio` - functions (e.g. `mkdir()`, `copy()`). This last method is used in - cases where artifact data types come from 3rd party libraries that are - not directly aware of ZenML's I/O and cannot be modified to use it. - - Args: - model_metadata: model metadata object to save to the artifact store. - """ - super().save(model_metadata) - - ### ADD YOUR OWN CODE HERE - THIS IS JUST AN EXAMPLE ### - # Dump the model metadata directly into the artifact store as a YAML file - with fileio.open(os.path.join(self.uri, 'model_metadata.yaml'), 'w') as f: - f.write(yaml.dump(model_metadata.metadata)) - ### YOUR CODE ENDS HERE ### - - def load(self, data_type: Type[ModelMetadata]) -> ModelMetadata: - """Load (materialize) a model metadata artifact from the artifact store. - - This operation takes the `self.uri` URI location in the artifact store - and loads the information present at that location in an artifact - object (`ModelMetadata`). - - This is usually implemented in one of two ways: - - - shown here: using the `zenml.io.fileio.open()` function or one of the - `zenml.utils.yaml_utils` wrappers to read the artifact data - directly from a file in the artifact store, similar to how you would use - the standard `open()` Python I/O. - - copying the artifact from the artifact store to a temporary location - on your local filesystem using the `zenml.io.fileio` functions (e.g. - `copy()`) and loading the information from the local file into the - artifact instance. This last method is used in cases where artifact data - types come from 3rd party libraries that are not directly aware of - ZenML's I/O and cannot be modified to use it. - - Args: - data_type: the artifact data type (model metadata) - - Returns: - A model metadata artifact instance materialized from the artifact - store. 
- """ - super().load(data_type) - - ### ADD YOUR OWN CODE HERE - THIS IS JUST AN EXAMPLE ### - with fileio.open(os.path.join(self.uri, 'data.txt'), 'r') as f: - model_metadata = ModelMetadata() - model_metadata.metadata = yaml.safe_load(f.read()) - ### YOUR CODE ENDS HERE ### - - return model_metadata diff --git a/starter/template/{{ _copier_conf.answers_file }} b/starter/template/{{ _copier_conf.answers_file }} deleted file mode 100644 index e0f7470..0000000 --- a/starter/template/{{ _copier_conf.answers_file }} +++ /dev/null @@ -1,2 +0,0 @@ -# Changes here will be overwritten by Copier; NEVER EDIT MANUALLY -{{ _copier_answers|to_nice_yaml }} \ No newline at end of file diff --git a/template/.assets/cloud_mcp.png b/template/.assets/cloud_mcp.png new file mode 100644 index 0000000..81197e9 Binary files /dev/null and b/template/.assets/cloud_mcp.png differ diff --git a/template/.assets/cloud_mcp_predictions.png b/template/.assets/cloud_mcp_predictions.png new file mode 100644 index 0000000..a6bf7c9 Binary files /dev/null and b/template/.assets/cloud_mcp_predictions.png differ diff --git a/template/.assets/cloud_mcp_screenshot.png b/template/.assets/cloud_mcp_screenshot.png new file mode 100644 index 0000000..8f56def Binary files /dev/null and b/template/.assets/cloud_mcp_screenshot.png differ diff --git a/template/.assets/feature_engineering_pipeline.png b/template/.assets/feature_engineering_pipeline.png new file mode 100644 index 0000000..db30191 Binary files /dev/null and b/template/.assets/feature_engineering_pipeline.png differ diff --git a/template/.assets/inference_pipeline.png b/template/.assets/inference_pipeline.png new file mode 100644 index 0000000..358d553 Binary files /dev/null and b/template/.assets/inference_pipeline.png differ diff --git a/template/.assets/pipeline_overview.png b/template/.assets/pipeline_overview.png new file mode 100644 index 0000000..609e97d Binary files /dev/null and b/template/.assets/pipeline_overview.png differ diff --git a/template/.assets/training_pipeline.png b/template/.assets/training_pipeline.png new file mode 100644 index 0000000..a2e6a7d Binary files /dev/null and b/template/.assets/training_pipeline.png differ diff --git a/starter/template/.dockerignore b/template/.dockerignore similarity index 100% rename from starter/template/.dockerignore rename to template/.dockerignore diff --git a/template/README.md b/template/README.md new file mode 100644 index 0000000..dd042cd --- /dev/null +++ b/template/README.md @@ -0,0 +1,212 @@ +# :running: MLOps 101 with ZenML + +Build your first MLOps pipelines with ZenML. + +## :earth_americas: Overview + +This repository is a minimalistic MLOps project intended as a starting point to learn how to put ML workflows in production. It features: + +- A feature engineering pipeline that loads data and prepares it for training. +- A training pipeline that loads the preprocessed dataset and trains a model. +- A batch inference pipeline that runs predictions on the trained model with new data. + +This is a representation of how it will all come together: + +Pipelines Overview + +Along the way we will also show you how to: + +- Structure your code into MLOps pipelines. +- Automatically version, track, and cache data, models, and other artifacts. +- Transition your ML models from development to production. + +## πŸƒ Run on Colab + +You can use Google Colab to see ZenML in action, no signup / installation required! 
+ +Open In Colab + +## :computer: Run Locally + +To run locally, install ZenML and pull this quickstart: + +```shell +# Install ZenML +pip install "zenml[server]" + +# clone the ZenML repository +git clone https://github.com/zenml-io/zenml.git +cd zenml/examples/quickstart +``` + +Now we're ready to start. You have two options for running the quickstart locally: + +#### Option 1 - Interactively explore the quickstart using Jupyter Notebook: +```bash +pip install notebook +jupyter notebook +# open notebooks/quickstart.ipynb +``` + +#### Option 2 - Execute the whole ML pipeline from a Python script: +```bash +# Install required zenml integrations +zenml integration install sklearn -y + +# Initialize ZenML +zenml init + +# Start the ZenServer to enable dashboard access +zenml up + +# Run the feature engineering pipeline +python run.py --feature-pipeline + +# Run the training pipeline +python run.py --training-pipeline + +# Run the training pipeline with versioned artifacts +python run.py --training-pipeline --train-dataset-version-name=1 --test-dataset-version-name=1 + +# Run the inference pipeline +python run.py --inference-pipeline +``` + +## 🌡 Learning MLOps with ZenML + +This project is also a great source of learning about some fundamental MLOps concepts. In sum, there are four exemplary steps happening, that can be mapped onto many other projects: + +
+  πŸ₯‡ Step 1: Load your data and execute feature engineering
+
+We'll start off by importing our data. In this project, we'll be working with
+[the Breast Cancer](https://archive.ics.uci.edu/dataset/17/breast+cancer+wisconsin+diagnostic) dataset
+which is publicly available on the UCI Machine Learning Repository. The task is a classification
+problem: predicting whether a patient is diagnosed with breast cancer or not.
+
+When you're getting started with a machine learning problem you'll want to do
+something similar to this: import your data and get it in the right shape for
+your training. Here are the typical steps within a feature engineering pipeline.
+
+The steps are defined in the [steps](steps/) directory, while the [pipelines](pipelines/) directory contains the pipeline code that connects them together.
+
+Feature engineering pipeline
+
+To execute the feature engineering pipeline, run:
+
+```bash
+python run.py --feature-pipeline
+```
+
+After the run completes, the pipeline will produce logs like:
+
+```shell
+The latest feature engineering pipeline produced the following artifacts:
+
+1. Train Dataset - Name: dataset_trn, Version Name: 1
+2. Test Dataset: Name: dataset_tst, Version Name: 1
+```
+
+We will use these versions in the next pipeline.
+
+
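+For orientation, here is a minimal sketch of what one of these steps could look like. Treat it as illustrative only: the real `data_splitter` in [steps](steps/) may differ in its parameters and behavior, and the artifact names `dataset_trn`/`dataset_tst` are simply taken from the log output above.
+
+```python
+from typing import Tuple
+
+import pandas as pd
+from sklearn.model_selection import train_test_split
+from typing_extensions import Annotated
+from zenml import step
+
+
+@step
+def data_splitter(
+    dataset: pd.DataFrame, test_size: float = 0.2
+) -> Tuple[
+    Annotated[pd.DataFrame, "dataset_trn"],
+    Annotated[pd.DataFrame, "dataset_tst"],
+]:
+    """Split the raw dataset into named train and test artifacts."""
+    dataset_trn, dataset_tst = train_test_split(
+        dataset, test_size=test_size, random_state=42
+    )
+    # ZenML automatically versions and stores each named output as an artifact.
+    return dataset_trn, dataset_tst
+```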
+ +
+  ⌚ Step 2: Training pipeline
+
+Now that our data is prepared, it makes sense to train some models to get a sense of how difficult the task is. The Breast Cancer dataset is complex enough that it's unlikely we'll be able to train a model that behaves perfectly, but we can get a sense of what a reasonable baseline looks like.
+
+We'll start with two simple models, an SGD Classifier and a Random Forest
+Classifier, both batteries-included from `sklearn`. We'll train them on the
+same data and then compare their performance.
+
+Training pipeline
+
+Run it using the dataset version names produced by the first step:
+
+```bash
+# You can also omit `--train-dataset-version-name` and `--test-dataset-version-name` to use
+# the latest versions
+python run.py --training-pipeline --train-dataset-version-name 1 --test-dataset-version-name 1
+```
+
+To track these models, ZenML offers a *Model Control Plane*, which is a central register of all your ML models.
+Each run of the training pipeline will produce a ZenML Model Version.
+
+```shell
+zenml model list
+```
+
+This will show you a new `breast_cancer_classifier` model with two versions, `sgd` and `rf`. You can find out how this was configured in the [YAML pipeline configuration files](configs/); a Python sketch of the same configuration is shown below.
+
+If you are a [ZenML Cloud](https://zenml.io/cloud) user, you can see all of this visualized in the dashboard:
+
+Model Control Plane
+
+There is a lot more you can do with ZenML models, including the ability to
+track metrics by adding metadata to them, or having them persist in a model
+registry. However, these topics can be explored more in the
+[ZenML docs](https://docs.zenml.io).
+
+
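+Here is a rough Python equivalent of what those YAML files set up, mirroring the Model Control Plane walkthrough in the quickstart notebook. This is only a sketch: `run.py` normally wires this up for you from the files in [configs](configs/).
+
+```python
+from pipelines import training
+from zenml import ModelVersion
+
+# Attach a ZenML Model version so the training run is registered in the
+# Model Control Plane (the YAML configs achieve the same thing).
+pipeline_settings = {
+    "model_version": ModelVersion(
+        name="breast_cancer_classifier",
+        version="rf",  # or "sgd"
+        license="Apache 2.0",
+        description="A breast cancer classifier",
+        tags=["breast_cancer", "classifier"],
+    )
+}
+
+# `with_options` returns a configured copy of the pipeline; calling it runs it.
+training_configured = training.with_options(**pipeline_settings)
+training_configured(model_type="rf")
+```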
+ +
+ πŸ’― Step 3: Promoting the best model to production + +For now, we will use the ZenML model control plane to promote our best +model to `production`. You can do this by simply setting the `stage` of +your chosen model version to the `production` tag. + +```shell +zenml model version update breast_cancer_classifier rf --stage production +``` + +While we've demonstrated a manual promotion process for clarity, a more in-depth look at the [promoter code](steps/model_promoter.py) reveals that the training pipeline is designed to automate this step. It evaluates the latest model against established production metrics and, if the new model outperforms the existing one based on test set results, it will automatically promote the model to production. Here is an overview of the process: + +Model Control Plane + +Again, if you are a [ZenML Cloud](https://zenml.io/cloud) user, you would be able to see all this in the cloud dashboard. + +
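+Below is a heavily simplified sketch of what a promotion step in the spirit of [steps/model_promoter.py](steps/model_promoter.py) might do. The real step also compares the new accuracy against the current `production` version, and the 80% threshold here is purely illustrative.
+
+```python
+from zenml import get_step_context, step
+from zenml.logger import get_logger
+
+logger = get_logger(__name__)
+
+
+@step
+def model_promoter(accuracy: float, stage: str = "production") -> bool:
+    """Promote the model version of the current run if its accuracy is acceptable."""
+    if accuracy < 0.8:  # illustrative threshold, not the real criterion
+        logger.info(f"Accuracy {accuracy:.2%} is too low, not promoting.")
+        return False
+
+    # The model version attached to this pipeline run by the Model Control Plane.
+    model_version = get_step_context().model_version
+    model_version.set_stage(stage, force=True)
+    logger.info(f"Promoted model version to `{stage}`.")
+    return True
+```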
+ +
+  πŸ«… Step 4: Consuming the model in production
+
+Once the model is promoted, we can consume the right model version in our
+batch inference pipeline directly. Let's see how that works.
+
+The batch inference pipeline simply takes the model marked as `production` and runs inference on it
+with live data. The critical step here is the `inference_predict` step, where we load the model in memory and generate predictions. Apart from loading the model, we must also load the preprocessing pipeline that we ran during feature engineering,
+so that we can apply at inference time exactly the same transformations that we applied at training time. Let's bring it all together:
+
+ZenML automatically links all artifacts to the `production` model version as well, including the predictions
+that were returned in the pipeline. This completes the MLOps loop from training to inference:
+
+Inference pipeline
+
+You can also see all predictions ever created as a complete history in the dashboard (again, only for [ZenML Cloud](https://zenml.io/cloud) users):
+
+Model Control Plane
+
+
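+If you prefer to inspect these artifacts programmatically rather than in the dashboard, a sketch like the following should work. The artifact names (`model`, `predictions`) come from the steps above; passing the `production` stage as the version specifier is an assumption based on the promotion in Step 3.
+
+```python
+from zenml.client import Client
+
+client = Client()
+
+# Fetch the model version currently marked as production
+# (assumed to resolve by stage name).
+prod_version = client.get_model_version("breast_cancer_classifier", "production")
+
+# Load the promoted classifier itself...
+classifier = prod_version.get_artifact("model").load()
+
+# ...and the latest batch predictions linked to the same model version.
+predictions = prod_version.get_artifact("predictions").load()
+print(classifier)
+print(predictions.head())
+```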
+ +## :bulb: Learn More + +You're a legit MLOps engineer now! You trained two models, evaluated them against +a test set, registered the best one with the ZenML model control plane, +and served some predictions. You also learned how to iterate on your models and +data by using some of the ZenML utility abstractions. You saw how to view your +artifacts and stacks via the client as well as the ZenML Dashboard. + +If you want to learn more about ZenML as a tool, then the +[:page_facing_up: **ZenML Docs**](https://docs.zenml.io/) are the perfect place +to get started. In particular, the [Production Guide](https://docs.zenml.io/user-guide/production-guide/) +goes into more detail as to how to transition these same pipelines into production +on the cloud. + +The best way to get a production ZenML instance up and running with all batteries included is the [ZenML Cloud](https://zenml.io/cloud). Check it out! + +Also, make sure to join our + Slack + Slack Community + to become part of the ZenML family! \ No newline at end of file diff --git a/template/configs/feature_engineering.yaml b/template/configs/feature_engineering.yaml new file mode 100644 index 0000000..d5ab212 --- /dev/null +++ b/template/configs/feature_engineering.yaml @@ -0,0 +1,10 @@ +# environment configuration +settings: + docker: + required_integrations: + - sklearn + requirements: + - pyarrow + +# pipeline configuration +test_size: 0.35 \ No newline at end of file diff --git a/template/configs/inference.yaml b/template/configs/inference.yaml new file mode 100644 index 0000000..0fd82f5 --- /dev/null +++ b/template/configs/inference.yaml @@ -0,0 +1,15 @@ +# environment configuration +settings: + docker: + required_integrations: + - sklearn + requirements: + - pyarrow + +# configuration of the Model Control Plane +model_version: + name: "breast_cancer_classifier" + version: "production" + license: Apache 2.0 + description: A breast cancer classifier + tags: ["breast_cancer", "classifier"] \ No newline at end of file diff --git a/template/configs/training_rf.yaml b/template/configs/training_rf.yaml new file mode 100644 index 0000000..c1418af --- /dev/null +++ b/template/configs/training_rf.yaml @@ -0,0 +1,19 @@ +# environment configuration +settings: + docker: + required_integrations: + - sklearn + requirements: + - pyarrow + +# configuration of the Model Control Plane +model_version: + name: breast_cancer_classifier + version: rf + license: Apache 2.0 + description: A breast cancer classifier + tags: ["breast_cancer", "classifier"] + +# Configure the pipeline +parameters: + model_type: "rf" # Choose between rf/sgd diff --git a/template/configs/training_sgd.yaml b/template/configs/training_sgd.yaml new file mode 100644 index 0000000..6ca7c0d --- /dev/null +++ b/template/configs/training_sgd.yaml @@ -0,0 +1,19 @@ +# environment configuration +settings: + docker: + required_integrations: + - sklearn + requirements: + - pyarrow + +# configuration of the Model Control Plane +model_version: + name: breast_cancer_classifier + version: sgd + license: Apache 2.0 + description: A breast cancer classifier + tags: ["breast_cancer", "classifier"] + +# Configure the pipeline +parameters: + model_type: "sgd" # Choose between rf/sgd \ No newline at end of file diff --git a/templates/license b/template/license similarity index 100% rename from templates/license rename to template/license diff --git a/template/license_header b/template/license_header new file mode 100644 index 0000000..cc653b6 --- /dev/null +++ b/template/license_header @@ -0,0 +1,2 
@@ +{%- macro license() %}{% include 'template/license' %}{% endmacro -%} +{{ license() | replace('\n', '\n# ') }} \ No newline at end of file diff --git a/template/pipelines/__init__.py b/template/pipelines/__init__.py new file mode 100644 index 0000000..16ae363 --- /dev/null +++ b/template/pipelines/__init__.py @@ -0,0 +1,5 @@ +# {% include 'template/license_header' %} + +from .feature_engineering import feature_engineering +from .inference import inference +from .training import training diff --git a/template/pipelines/feature_engineering.py b/template/pipelines/feature_engineering.py new file mode 100644 index 0000000..912cda2 --- /dev/null +++ b/template/pipelines/feature_engineering.py @@ -0,0 +1,59 @@ +# {% include 'template/license_header' %} + +import random +from typing import List, Optional + +from steps import ( + data_loader, + data_preprocessor, + data_splitter, +) +from zenml import pipeline +from zenml.logger import get_logger + +logger = get_logger(__name__) + + +@pipeline +def feature_engineering( + test_size: float = 0.2, + drop_na: Optional[bool] = None, + normalize: Optional[bool] = None, + drop_columns: Optional[List[str]] = None, + target: Optional[str] = "target", + random_state: int = 17 +): + """ + Feature engineering pipeline. + + This is a pipeline that loads the data, processes it and splits + it into train and test sets. + + Args: + test_size: Size of holdout set for training 0.0..1.0 + drop_na: If `True` NA values will be removed from dataset + normalize: If `True` dataset will be normalized with MinMaxScaler + drop_columns: List of columns to drop from dataset + target: Name of target column in dataset + random_state: Random state to configure the data loader + + Returns: + The processed datasets (dataset_trn, dataset_tst). + """ + # Link all the steps together by calling them and passing the output + # of one step as the input of the next step. + raw_data = data_loader(random_state=random_state, target=target) + dataset_trn, dataset_tst = data_splitter( + dataset=raw_data, + test_size=test_size, + ) + dataset_trn, dataset_tst, _ = data_preprocessor( + dataset_trn=dataset_trn, + dataset_tst=dataset_tst, + drop_na=drop_na, + normalize=normalize, + drop_columns=drop_columns, + target=target, + random_state=random_state, + ) + return dataset_trn, dataset_tst diff --git a/template/pipelines/inference.py b/template/pipelines/inference.py new file mode 100644 index 0000000..b9a299f --- /dev/null +++ b/template/pipelines/inference.py @@ -0,0 +1,46 @@ +# {% include 'template/license_header' %} + +from steps import ( + data_loader, + inference_predict, + inference_preprocessor, +) +from zenml import get_pipeline_context, pipeline +from zenml.logger import get_logger + +logger = get_logger(__name__) + + +@pipeline +def inference(random_state: str, target: str): + """ + Model inference pipeline. + + This is a pipeline that loads the inference data, processes it with + the same preprocessing pipeline used in training, and runs inference + with the trained model. + + Args: + random_state: Random state for reproducibility. + target: Name of target column in dataset. + """ + # Get the production model artifact + model = get_pipeline_context().model_version.get_artifact("model") + + # Get the preprocess pipeline artifact associated with this version + preprocess_pipeline = get_pipeline_context().model_version.get_artifact( + "preprocess_pipeline" + ) + + # Link all the steps together by calling them and passing the output + # of one step as the input of the next step. 
+ df_inference = data_loader(random_state=random_state, is_inference=True) + df_inference = inference_preprocessor( + dataset_inf=df_inference, + preprocess_pipeline=preprocess_pipeline, + target=target, + ) + inference_predict( + model=model, + dataset_inf=df_inference, + ) diff --git a/template/pipelines/training.py b/template/pipelines/training.py new file mode 100644 index 0000000..7eebebb --- /dev/null +++ b/template/pipelines/training.py @@ -0,0 +1,58 @@ +# {% include 'template/license_header' %} + +from typing import Optional +from uuid import UUID + +from steps import model_evaluator, model_promoter, model_trainer +from zenml import ExternalArtifact, pipeline +from zenml.logger import get_logger + +from pipelines import ( + feature_engineering, +) + +logger = get_logger(__name__) + + +@pipeline +def training( + train_dataset_id: Optional[UUID] = None, + test_dataset_id: Optional[UUID] = None, + target: Optional[str] = "target", + model_type: Optional[str] = "sgd", +): + """ + Model training pipeline. + + This is a pipeline that loads the data from a preprocessing pipeline, + trains a model on it and evaluates the model. If it is the first model + to be trained, it will be promoted to production. If not, it will be + promoted only if it has a higher accuracy than the current production + model version. + + Args: + train_dataset_id: ID of the train dataset produced by feature engineering. + test_dataset_id: ID of the test dataset produced by feature engineering. + target: Name of target column in dataset. + model_type: The type of model to train. + """ + # Link all the steps together by calling them and passing the output + # of one step as the input of the next step. + + # Execute Feature Engineering Pipeline + if train_dataset_id is None or test_dataset_id is None: + dataset_trn, dataset_tst = feature_engineering() + else: + dataset_trn = ExternalArtifact(id=train_dataset_id) + dataset_tst = ExternalArtifact(id=test_dataset_id) + + model = model_trainer(dataset_trn=dataset_trn, target=target, model_type=model_type) + + acc = model_evaluator( + model=model, + dataset_trn=dataset_trn, + dataset_tst=dataset_tst, + target=target, + ) + + model_promoter(accuracy=acc) diff --git a/template/quickstart.ipynb b/template/quickstart.ipynb new file mode 100644 index 0000000..36e502d --- /dev/null +++ b/template/quickstart.ipynb @@ -0,0 +1,1117 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "id": "63ab391a", + "metadata": {}, + "source": [ + "# Intro to MLOps using ZenML\n", + "\n", + "## 🌍 Overview\n", + "\n", + "This repository is a minimalistic MLOps project intended as a starting point to learn how to put ML workflows in production. 
It features: \n", + "\n", + "- A feature engineering pipeline that loads data and prepares it for training.\n", + "- A training pipeline that loads the preprocessed dataset and trains a model.\n", + "- A batch inference pipeline that runs predictions on the trained model with new data.\n", + "\n", + "Follow along this notebook to understand how you can use ZenML to productionalize your ML workflows!\n", + "\n", + "\"Pipelines" + ] + }, + { + "cell_type": "markdown", + "id": "8f466b16", + "metadata": {}, + "source": [ + "## Run on Colab\n", + "\n", + "You can use Google Colab to see ZenML in action, no signup / installation\n", + "required!\n", + "\n", + "[![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](\n", + "https://colab.research.google.com/github/zenml-io/zenml/blob/main/examples/quickstart/quickstart.ipynb)" + ] + }, + { + "cell_type": "markdown", + "id": "66b2977c", + "metadata": {}, + "source": [ + "# πŸ‘Ά Step 0. Install Requirements\n", + "\n", + "Let's install ZenML to get started. First we'll install the latest version of\n", + "ZenML as well as the `sklearn` integration of ZenML:" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "ce2f40eb", + "metadata": {}, + "outputs": [], + "source": [ + "!pip install \"zenml[server]\"" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "5aad397e", + "metadata": {}, + "outputs": [], + "source": [ + "from zenml.environment import Environment\n", + "\n", + "if Environment.in_google_colab():\n", + " # Install Cloudflare Tunnel binary\n", + " !wget -q https://github.com/cloudflare/cloudflared/releases/latest/download/cloudflared-linux-amd64.deb && dpkg -i cloudflared-linux-amd64.deb\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "f76f562e", + "metadata": {}, + "outputs": [], + "source": [ + "!zenml integration install sklearn -y\n", + "\n", + "import IPython\n", + "IPython.Application.instance().kernel.do_shutdown(restart=True)" + ] + }, + { + "cell_type": "markdown", + "id": "3b044374", + "metadata": {}, + "source": [ + "Please wait for the installation to complete before running subsequent cells. At\n", + "the end of the installation, the notebook kernel will automatically restart." + ] + }, + { + "cell_type": "markdown", + "id": "e3955ff1", + "metadata": {}, + "source": [ + "Optional: If you are using [ZenML Cloud](https://zenml.io/cloud), execute the following cell with your tenant URL. Otherwise ignore." 
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "e2587315", + "metadata": {}, + "outputs": [], + "source": [ + "zenml_server_url = \"PLEASE_UPDATE_ME\" # in the form \"https://URL_TO_SERVER\"\n", + "\n", + "!zenml connect --url $zenml_server_url" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "081d5616", + "metadata": {}, + "outputs": [], + "source": [ + "# Initialize ZenML and set the default stack\n", + "!zenml init\n", + "\n", + "!zenml stack set default" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "79f775f2", + "metadata": {}, + "outputs": [], + "source": [ + "# Do the imports at the top\n", + "from typing_extensions import Annotated\n", + "from sklearn.datasets import load_breast_cancer\n", + "\n", + "import random\n", + "import pandas as pd\n", + "from zenml import step, ExternalArtifact, pipeline, ModelVersion, get_step_context\n", + "from zenml.client import Client\n", + "from zenml.logger import get_logger\n", + "from uuid import UUID\n", + "\n", + "from typing import Optional, List\n", + "\n", + "from zenml import pipeline\n", + "\n", + "from steps import (\n", + " data_loader,\n", + " data_preprocessor,\n", + " data_splitter,\n", + " model_evaluator,\n", + " inference_preprocessor\n", + ")\n", + "\n", + "from zenml.logger import get_logger\n", + "\n", + "logger = get_logger(__name__)\n", + "\n", + "# Initialize the ZenML client to fetch objects from the ZenML Server\n", + "client = Client()" + ] + }, + { + "cell_type": "markdown", + "id": "35e48460", + "metadata": {}, + "source": [ + "## πŸ₯‡ Step 1: Load your data and execute feature engineering\n", + "\n", + "We'll start off by importing our data. In this quickstart we'll be working with\n", + "[the Breast Cancer](https://archive.ics.uci.edu/dataset/17/breast+cancer+wisconsin+diagnostic) dataset\n", + "which is publicly available on the UCI Machine Learning Repository. The task is a classification\n", + "problem, to predict whether a patient is diagnosed with breast cancer or not.\n", + "\n", + "When you're getting started with a machine learning problem you'll want to do\n", + "something similar to this: import your data and get it in the right shape for\n", + "your training. 
ZenML mostly gets out of your way when you're writing your Python\n", + "code, as you'll see from the following cell.\n", + "\n", + "\"Feature" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "3cd974d1", + "metadata": {}, + "outputs": [], + "source": [ + "@step\n", + "def data_loader_simplified(\n", + " random_state: int, is_inference: bool = False, target: str = \"target\"\n", + ") -> Annotated[pd.DataFrame, \"dataset\"]: # We name the dataset \n", + " \"\"\"Dataset reader step.\"\"\"\n", + " dataset = load_breast_cancer(as_frame=True)\n", + " inference_size = int(len(dataset.target) * 0.05)\n", + " dataset: pd.DataFrame = dataset.frame\n", + " inference_subset = dataset.sample(inference_size, random_state=random_state)\n", + " if is_inference:\n", + " dataset = inference_subset\n", + " dataset.drop(columns=target, inplace=True)\n", + " else:\n", + " dataset.drop(inference_subset.index, inplace=True)\n", + " dataset.reset_index(drop=True, inplace=True)\n", + " logger.info(f\"Dataset with {len(dataset)} records loaded!\")\n", + " return dataset\n" + ] + }, + { + "cell_type": "markdown", + "id": "1e8ba4c6", + "metadata": {}, + "source": [ + "The whole function is decorated with the `@step` decorator, which\n", + "tells ZenML to track this function as a step in the pipeline. This means that\n", + "ZenML will automatically version, track, and cache the data that is produced by\n", + "this function as an `artifact`. This is a very powerful feature, as it means that you can\n", + "reproduce your data at any point in the future, even if the original data source\n", + "changes or disappears. \n", + "\n", + "Note the use of the `typing` module's `Annotated` type hint in the output of the\n", + "step. We're using this to give a name to the output of the step, which will make\n", + "it possible to access it via a keyword later on.\n", + "\n", + "You'll also notice that we have included type hints for the outputs\n", + "to the function. These are not only useful for anyone reading your code, but\n", + "help ZenML process your data in a way appropriate to the specific data types." + ] + }, + { + "cell_type": "markdown", + "id": "b6286b67", + "metadata": {}, + "source": [ + "ZenML is built in a way that allows you to experiment with your data and build\n", + "your pipelines as you work, so if you want to call this function to see how it\n", + "works, you can just call it directly. Here we take a look at the first few rows\n", + "of your training dataset." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "d838e2ea", + "metadata": {}, + "outputs": [], + "source": [ + "df = data_loader_simplified(random_state=42)\n", + "df.head()" + ] + }, + { + "cell_type": "markdown", + "id": "28c05291", + "metadata": {}, + "source": [ + "Everything looks as we'd expect and the values are all in the right format πŸ₯³.\n", + "\n", + "We're now at the point where can bring this step (and some others) together into a single\n", + "pipeline, the top-level organising entity for code in ZenML. Creating such a pipeline is\n", + "as simple as adding a `@pipeline` decorator to a function. This specific\n", + "pipeline doesn't return a value, but that option is available to you if you need." 
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "b50a9537", + "metadata": {}, + "outputs": [], + "source": [ + "@pipeline\n", + "def feature_engineering(\n", + " test_size: float = 0.3,\n", + " drop_na: Optional[bool] = None,\n", + " normalize: Optional[bool] = None,\n", + " drop_columns: Optional[List[str]] = None,\n", + " target: Optional[str] = \"target\",\n", + " random_state: int = 17\n", + "):\n", + " \"\"\"Feature engineering pipeline.\"\"\"\n", + " # Link all the steps together by calling them and passing the output\n", + " # of one step as the input of the next step.\n", + " raw_data = data_loader(random_state=random_state, target=target)\n", + " dataset_trn, dataset_tst = data_splitter(\n", + " dataset=raw_data,\n", + " test_size=test_size,\n", + " )\n", + " dataset_trn, dataset_tst, _ = data_preprocessor(\n", + " dataset_trn=dataset_trn,\n", + " dataset_tst=dataset_tst,\n", + " drop_na=drop_na,\n", + " normalize=normalize,\n", + " drop_columns=drop_columns,\n", + " target=target,\n", + " random_state=random_state,\n", + " )" + ] + }, + { + "cell_type": "markdown", + "id": "7cd73c23", + "metadata": {}, + "source": [ + "We're ready to run the pipeline now, which we can do just as with the step - by calling the\n", + "pipeline function itself:" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "1e0aa9af", + "metadata": {}, + "outputs": [], + "source": [ + "feature_engineering()" + ] + }, + { + "cell_type": "markdown", + "id": "1785c303", + "metadata": {}, + "source": [ + "Let's run this again with a slightly different test size, to create more datasets:" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "658c0570-2607-4b97-a72d-d45c92633e48", + "metadata": {}, + "outputs": [], + "source": [ + "feature_engineering(test_size=0.25)" + ] + }, + { + "cell_type": "markdown", + "id": "64bb7206", + "metadata": {}, + "source": [ + "Notice the second time around, the data loader step was **cached**, while the rest of the pipeline was rerun. \n", + "This is because ZenML automatically determined that nothing had changed in the data loader step, \n", + "so it didn't need to rerun it." + ] + }, + { + "cell_type": "markdown", + "id": "5bc6849d-31ac-4c08-9ca2-cf7f5f35ccbf", + "metadata": {}, + "source": [ + "Let's run this again with a slightly different test size and random state, to disable the cache and to create more datasets:" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "1e1d8546", + "metadata": {}, + "outputs": [], + "source": [ + "feature_engineering(test_size=0.25, random_state=104)" + ] + }, + { + "cell_type": "markdown", + "id": "6c42078a", + "metadata": {}, + "source": [ + "At this point you might be interested to view your pipeline runs in the ZenML\n", + "Dashboard. In case you are not using a hosted instance of ZenML, you can spin this up by executing the next cell. This will start a\n", + "server which you can access by clicking on the link that appears in the output\n", + "of the cell.\n", + "\n", + "Log into the Dashboard using default credentials (username 'default' and\n", + "password left blank). 
From there you can inspect the pipeline or the specific\n", + "pipeline run.\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "8cd3cc8c", + "metadata": {}, + "outputs": [], + "source": [ + "from zenml.environment import Environment\n", + "from zenml.zen_stores.rest_zen_store import RestZenStore\n", + "\n", + "\n", + "if not isinstance(client.zen_store, RestZenStore):\n", + " # Only spin up a local Dashboard in case you aren't already connected to a remote server\n", + " if Environment.in_google_colab():\n", + " # run ZenML through a cloudflare tunnel to get a public endpoint\n", + " !zenml up --port 8237 & cloudflared tunnel --url http://localhost:8237\n", + " else:\n", + " !zenml up" + ] + }, + { + "cell_type": "markdown", + "id": "e8471f93", + "metadata": {}, + "source": [ + "We can also fetch the pipeline from the server and view the results directly in the notebook:" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "f208b200", + "metadata": {}, + "outputs": [], + "source": [ + "client = Client()\n", + "run = client.get_pipeline(\"feature_engineering\").last_run\n", + "print(run.name)" + ] + }, + { + "cell_type": "markdown", + "id": "a037f09d", + "metadata": {}, + "source": [ + "We can also see the data artifacts that were produced by the last step of the pipeline:" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "34283e89", + "metadata": {}, + "outputs": [], + "source": [ + "run.steps[\"data_preprocessor\"].outputs" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "bceb0312", + "metadata": {}, + "outputs": [], + "source": [ + "# Read one of the datasets. This is the one with a 0.25 test split\n", + "run.steps[\"data_preprocessor\"].outputs[\"dataset_trn\"].load()" + ] + }, + { + "cell_type": "markdown", + "id": "26d26436", + "metadata": {}, + "source": [ + "We can also get the artifacts directly. Each time you create a new pipeline run, a new `artifact version` is created.\n", + "\n", + "You can fetch these artifact and their versions using the `client`: " + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "c8f90647", + "metadata": {}, + "outputs": [], + "source": [ + "# Get artifact version from our run\n", + "dataset_trn_artifact_version_via_run = run.steps[\"data_preprocessor\"].outputs[\"dataset_trn\"] \n", + "\n", + "# Get latest version from client directly\n", + "dataset_trn_artifact_version = client.get_artifact_version(\"dataset_trn\")\n", + "\n", + "# This should be true if our run is the latest run and no artifact has been produced\n", + "# in the intervening time\n", + "dataset_trn_artifact_version_via_run.id == dataset_trn_artifact_version.id" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "3f9d3dfd", + "metadata": {}, + "outputs": [], + "source": [ + "# Fetch the rest of the artifacts\n", + "dataset_tst_artifact_version = client.get_artifact_version(\"dataset_tst\")\n", + "preprocessing_pipeline_artifact_version = client.get_artifact_version(\"preprocess_pipeline\")" + ] + }, + { + "cell_type": "markdown", + "id": "7a7d1b04", + "metadata": {}, + "source": [ + "If you started with a fresh install, then you would have two versions corresponding\n", + "to the two pipelines that we ran above. 
We can even load a artifact version in memory: " + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "c82aca75", + "metadata": {}, + "outputs": [], + "source": [ + "# Load an artifact to verify you can fetch it\n", + "dataset_trn_artifact_version.load()" + ] + }, + { + "cell_type": "markdown", + "id": "5963509e", + "metadata": {}, + "source": [ + "We'll use these artifacts from above in our next pipeline" + ] + }, + { + "cell_type": "markdown", + "id": "8c28b474", + "metadata": {}, + "source": [ + "# ⌚ Step 2: Training pipeline" + ] + }, + { + "cell_type": "markdown", + "id": "87909827", + "metadata": {}, + "source": [ + "Now that we have our data it makes sense to train some models to get a sense of\n", + "how difficult the task is. The Breast Cancer dataset is sufficiently large and complex \n", + "that it's unlikely we'll be able to train a model that behaves perfectly since the problem \n", + "is inherently complex, but we can get a sense of what a reasonable baseline looks like.\n", + "\n", + "We'll start with two simple models, a SGD Classifier and a Random Forest\n", + "Classifier, both batteries-included from `sklearn`. We'll train them both on the\n", + "same data and then compare their performance.\n", + "\n", + "\"Training" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "fccf1bd9", + "metadata": {}, + "outputs": [], + "source": [ + "import pandas as pd\n", + "from sklearn.base import ClassifierMixin\n", + "from sklearn.ensemble import RandomForestClassifier\n", + "from sklearn.linear_model import SGDClassifier\n", + "from typing_extensions import Annotated\n", + "from zenml import ArtifactConfig, step\n", + "from zenml.logger import get_logger\n", + "\n", + "logger = get_logger(__name__)\n", + "\n", + "\n", + "@step\n", + "def model_trainer(\n", + " dataset_trn: pd.DataFrame,\n", + " model_type: str = \"sgd\",\n", + ") -> Annotated[ClassifierMixin, ArtifactConfig(name=\"model\", is_model_artifact=True)]:\n", + " \"\"\"Configure and train a model on the training dataset.\"\"\"\n", + " target = \"target\"\n", + " if model_type == \"sgd\":\n", + " model = SGDClassifier()\n", + " elif model_type == \"rf\":\n", + " model = RandomForestClassifier()\n", + " else:\n", + " raise ValueError(f\"Unknown model type {model_type}\") \n", + "\n", + " logger.info(f\"Training model {model}...\")\n", + "\n", + " model.fit(\n", + " dataset_trn.drop(columns=[target]),\n", + " dataset_trn[target],\n", + " )\n", + " return model\n" + ] + }, + { + "cell_type": "markdown", + "id": "73a00008", + "metadata": {}, + "source": [ + "Our two training steps both return different kinds of `sklearn` classifier\n", + "models, so we use the generic `ClassifierMixin` type hint for the return type." + ] + }, + { + "cell_type": "markdown", + "id": "a5f22174", + "metadata": {}, + "source": [ + "ZenML allows you to load any version of any dataset that is tracked by the framework\n", + "directly into a pipeline using the `ExternalArtifact` interface. This is very convenient\n", + "in this case, as we'd like to send our preprocessed dataset from the older pipeline directly\n", + "into the training pipeline." 
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "1aa98f2f", + "metadata": {}, + "outputs": [], + "source": [ + "@pipeline\n", + "def training(\n", + " train_dataset_id: Optional[UUID] = None,\n", + " test_dataset_id: Optional[UUID] = None,\n", + " model_type: str = \"sgd\",\n", + " min_train_accuracy: float = 0.0,\n", + " min_test_accuracy: float = 0.0,\n", + "):\n", + " \"\"\"Model training pipeline.\"\"\" \n", + " if train_dataset_id is None or test_dataset_id is None:\n", + " # If we dont pass the IDs, this will run the feature engineering pipeline \n", + " dataset_trn, dataset_tst = feature_engineering()\n", + " else:\n", + " # Load the datasets from an older pipeline\n", + " dataset_trn = ExternalArtifact(id=train_dataset_id)\n", + " dataset_tst = ExternalArtifact(id=test_dataset_id) \n", + "\n", + " trained_model = model_trainer(\n", + " dataset_trn=dataset_trn,\n", + " model_type=model_type,\n", + " )\n", + "\n", + " model_evaluator(\n", + " model=trained_model,\n", + " dataset_trn=dataset_trn,\n", + " dataset_tst=dataset_tst,\n", + " min_train_accuracy=min_train_accuracy,\n", + " min_test_accuracy=min_test_accuracy,\n", + " )" + ] + }, + { + "cell_type": "markdown", + "id": "88b70fd3", + "metadata": {}, + "source": [ + "The end goal of this quick baseline evaluation is to understand which of the two\n", + "models performs better. We'll use the `evaluator` step to compare the two\n", + "models. This step takes in the model from the trainer step, and computes its score\n", + "over the testing set." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "c64885ac", + "metadata": {}, + "outputs": [], + "source": [ + "# Use a random forest model with the chosen datasets.\n", + "# We need to pass the ID's of the datasets into the function\n", + "training(\n", + " model_type=\"rf\",\n", + " train_dataset_id=dataset_trn_artifact_version.id,\n", + " test_dataset_id=dataset_tst_artifact_version.id\n", + ")\n", + "\n", + "rf_run = client.get_pipeline(\"training\").last_run" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "4300c82f", + "metadata": {}, + "outputs": [], + "source": [ + "# Use a SGD classifier\n", + "sgd_run = training(\n", + " model_type=\"sgd\",\n", + " train_dataset_id=dataset_trn_artifact_version.id,\n", + " test_dataset_id=dataset_tst_artifact_version.id\n", + ")\n", + "\n", + "sgd_run = client.get_pipeline(\"training\").last_run" + ] + }, + { + "cell_type": "markdown", + "id": "43f1a68a", + "metadata": {}, + "source": [ + "You can see from the logs already how our model training went: the\n", + "`RandomForestClassifier` performed considerably better than the `SGDClassifier`.\n", + "We can use the ZenML `Client` to verify this:" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "d95810b1", + "metadata": {}, + "outputs": [], + "source": [ + "# The evaluator returns a float value with the accuracy\n", + "rf_run.steps[\"model_evaluator\"].output.load() > sgd_run.steps[\"model_evaluator\"].output.load()" + ] + }, + { + "cell_type": "markdown", + "id": "e256d145", + "metadata": {}, + "source": [ + "# πŸ’― Step 3: Associating a model with your pipeline" + ] + }, + { + "cell_type": "markdown", + "id": "927978f3", + "metadata": {}, + "source": [ + "You can see it is relatively easy to train ML models using ZenML pipelines. But it can be somewhat clunky to track\n", + "all the models produced as you develop your experiments and use-cases. 
Luckily, ZenML offers a *Model Control Plane*,\n", + "which is a central register of all your ML models.\n", + "\n", + "You can easily create a ZenML `Model` and associate it with your pipelines using the `ModelVersion` object:" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "99ca00c0", + "metadata": {}, + "outputs": [], + "source": [ + "pipeline_settings = {}\n", + "\n", + "# Lets add some metadata to the model to make it identifiable\n", + "pipeline_settings[\"model_version\"] = ModelVersion(\n", + " name=\"breast_cancer_classifier\",\n", + " license=\"Apache 2.0\",\n", + " description=\"A breast cancer classifier\",\n", + " tags=[\"breast_cancer\", \"classifier\"],\n", + ")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "0e78a520", + "metadata": {}, + "outputs": [], + "source": [ + "# Let's train the SGD model and set the version name to \"sgd\"\n", + "pipeline_settings[\"model_version\"].version = \"sgd\"\n", + "\n", + "# the `with_options` method allows us to pass in pipeline settings\n", + "# and returns a configured pipeline\n", + "training_configured = training.with_options(**pipeline_settings)\n", + "\n", + "# We can now run this as usual\n", + "training_configured(\n", + " model_type=\"sgd\",\n", + " train_dataset_id=dataset_trn_artifact_version.id,\n", + " test_dataset_id=dataset_tst_artifact_version.id\n", + ")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "9b8e0002", + "metadata": {}, + "outputs": [], + "source": [ + "# Let's train the RF model and set the version name to \"rf\"\n", + "pipeline_settings[\"model_version\"].version = \"rf\"\n", + "\n", + "# the `with_options` method allows us to pass in pipeline settings\n", + "# and returns a configured pipeline\n", + "training_configured = training.with_options(**pipeline_settings)\n", + "\n", + "# Let's run it again to make sure we have two versions\n", + "training_configured(\n", + " model_type=\"rf\",\n", + " train_dataset_id=dataset_trn_artifact_version.id,\n", + " test_dataset_id=dataset_tst_artifact_version.id\n", + ")" + ] + }, + { + "cell_type": "markdown", + "id": "09597223", + "metadata": {}, + "source": [ + "This time, running both pipelines has created two associated **model versions**.\n", + "You can list your ZenML model and their versions as follows:" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "fbb25913", + "metadata": {}, + "outputs": [], + "source": [ + "zenml_model = client.get_model(\"breast_cancer_classifier\")\n", + "print(zenml_model)\n", + "\n", + "print(f\"Model {zenml_model.name} has {len(zenml_model.versions)} versions\")\n", + "\n", + "zenml_model.versions[0].version, zenml_model.versions[1].version" + ] + }, + { + "cell_type": "markdown", + "id": "e82cfac2", + "metadata": {}, + "source": [ + "The interesting part is that ZenML went ahead and linked all artifacts produced by the\n", + "pipelines to that model version, including the two pickle files that represent our\n", + "SGD and RandomForest classifier. 
We can see all artifacts directly from the model\n",
+    "version object:"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "31211413",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Let's load the RF version\n",
+    "rf_zenml_model_version = client.get_model_version(\"breast_cancer_classifier\", \"rf\")\n",
+    "\n",
+    "# We can now load our classifier directly as well\n",
+    "random_forest_classifier = rf_zenml_model_version.get_artifact(\"model\").load()\n",
+    "\n",
+    "random_forest_classifier"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "53517a9a",
+   "metadata": {},
+   "source": [
+    "If you are a [ZenML Cloud](https://zenml.io/cloud) user, you can see all of this visualized in the dashboard:\n",
+    "\n",
+    "\"Model"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "eb645dde",
+   "metadata": {},
+   "source": [
+    "There is a lot more you can do with ZenML models, including the ability to\n",
+    "track metrics by adding metadata to them, or to persist them in a model\n",
+    "registry. These topics are covered in more depth in the\n",
+    "[ZenML docs](https://docs.zenml.io).\n",
+    "\n",
+    "For now, we will use the ZenML model control plane to promote our best\n",
+    "model to `production`. You can do this by simply setting the `stage` of\n",
+    "your chosen model version to `production`."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "26b718f8",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Set our best classifier to production\n",
+    "rf_zenml_model_version.set_stage(\"production\", force=True)"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "9fddf3d0",
+   "metadata": {},
+   "source": [
+    "Of course, normally you would only promote a model after comparing it to all other model\n",
+    "versions and running some additional tests. That is a more advanced use case: see the\n",
+    "[e2e_batch example](https://github.com/zenml-io/zenml/tree/main/examples/e2e) to get\n",
+    "more insight into that sort of flow. A minimal sketch of such a promotion guard follows below."
+   ]
+  },
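+  {
+   "cell_type": "markdown",
+   "id": "a3b2c1d0",
+   "metadata": {},
+   "source": [
+    "To give you a taste of that flow, here is a minimal, illustrative sketch of such a promotion guard. It is not part of this notebook's pipelines (the starter template ships a `model_promoter` step with the same logic), and it assumes that `test_accuracy` was logged as metadata on the `model` artifact, which is what the template's `model_evaluator` step does via `log_artifact_metadata`.\n",
+    "Note that at this point the `rf` version is already in `production`, so running it would simply keep things as they are:"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "b4d3e2f1",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Illustrative promotion guard: only promote the candidate version when its\n",
+    "# logged test accuracy beats the version currently in the `production` stage.\n",
+    "candidate = client.get_model_version(\"breast_cancer_classifier\", \"rf\")\n",
+    "candidate_acc = candidate.get_artifact(\"model\").run_metadata[\"test_accuracy\"].value\n",
+    "\n",
+    "try:\n",
+    "    prod_acc = (\n",
+    "        client.get_model_version(\"breast_cancer_classifier\", \"production\")\n",
+    "        .get_artifact(\"model\")\n",
+    "        .run_metadata[\"test_accuracy\"]\n",
+    "        .value\n",
+    "    )\n",
+    "except KeyError:\n",
+    "    # No version is in the `production` stage yet\n",
+    "    prod_acc = None\n",
+    "\n",
+    "if prod_acc is None or float(candidate_acc) > float(prod_acc):\n",
+    "    candidate.set_stage(\"production\", force=True)"
+   ]
+  },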
+  {
+   "cell_type": "markdown",
+   "id": "2ecbc8cf",
+   "metadata": {},
+   "source": [
+    "\"Model"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "8f1146db",
+   "metadata": {},
+   "source": [
+    "Once the model is promoted, we can consume the right model version in our\n",
+    "batch inference pipeline directly. Let's see how that works."
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "d6306f14",
+   "metadata": {},
+   "source": [
+    "# 🫅 Step 4: Consuming the model in production"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "b51f3108",
+   "metadata": {},
+   "source": [
+    "The batch inference pipeline simply takes the model marked as `production` and runs inference on it\n",
+    "with live data. The critical step here is the `inference_predict` step, where we load the model in memory\n",
+    "and generate predictions:\n",
+    "\n",
+    "\"Inference"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "92c4c7dc",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "@step\n",
+    "def inference_predict(dataset_inf: pd.DataFrame) -> Annotated[pd.Series, \"predictions\"]:\n",
+    "    \"\"\"Predictions step.\"\"\"\n",
+    "    # Get the model_version\n",
+    "    model_version = get_step_context().model_version\n",
+    "\n",
+    "    # run prediction from memory\n",
+    "    predictor = model_version.load_artifact(\"model\")\n",
+    "    predictions = predictor.predict(dataset_inf)\n",
+    "\n",
+    "    predictions = pd.Series(predictions, name=\"predicted\")\n",
+    "\n",
+    "    return predictions\n"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "3aeb227b",
+   "metadata": {},
+   "source": [
+    "Apart from loading the model, we must also load the preprocessing pipeline that we fitted during feature engineering,\n",
+    "so that at inference time we can apply exactly the same preprocessing steps that we applied at training time. Let's bring it all together:"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "37c409bd",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "@pipeline\n",
+    "def inference(preprocess_pipeline_id: UUID):\n",
+    "    \"\"\"Model batch inference pipeline\"\"\"\n",
+    "    # random_state = client.get_artifact_version(id=preprocess_pipeline_id).metadata[\"random_state\"].value\n",
+    "    # target = client.get_artifact_version(id=preprocess_pipeline_id).run_metadata['target'].value\n",
+    "    random_state = 42\n",
+    "    target = \"target\"\n",
+    "\n",
+    "    df_inference = data_loader(\n",
+    "        random_state=random_state, is_inference=True\n",
+    "    )\n",
+    "    df_inference = inference_preprocessor(\n",
+    "        dataset_inf=df_inference,\n",
+    "        # We use the preprocess pipeline from the feature engineering pipeline\n",
+    "        preprocess_pipeline=ExternalArtifact(id=preprocess_pipeline_id),\n",
+    "        target=target,\n",
+    "    )\n",
+    "    inference_predict(\n",
+    "        dataset_inf=df_inference,\n",
+    "    )\n"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "c7afe7be",
+   "metadata": {},
+   "source": [
+    "To load the right model, we pass the `production` stage into the `ModelVersion` config this time.\n",
+    "This ensures that we always load the production model, decoupled from all other pipelines:"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "61bf5939",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "pipeline_settings = {\"enable_cache\": False}\n",
+    "\n",
+    "# Configure the pipeline to load the model version in the `production` stage\n",
+    "pipeline_settings[\"model_version\"] = ModelVersion(\n",
+    "    name=\"breast_cancer_classifier\",\n",
+    "    version=\"production\", # We can pass in the stage name here!\n",
+    "    license=\"Apache 2.0\",\n",
+    "    description=\"A breast cancer classifier\",\n",
+    "    tags=[\"breast_cancer\", \"classifier\"],\n",
+    ")"
+   ]
+  },
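+  {
+   "cell_type": "markdown",
+   "id": "c8f7a6b5",
+   "metadata": {},
+   "source": [
+    "As an optional sanity check before launching the run, you can confirm which model version the `production` stage currently resolves to. The printed details should correspond to the `rf` version we promoted above:"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "d9e8f7a6",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Which model version does the `production` stage currently point to?\n",
+    "client.get_model_version(\"breast_cancer_classifier\", \"production\")"
+   ]
+  },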
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "ff3402f1",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# the `with_options` method allows us to pass in pipeline settings\n",
+    "# and returns a configured pipeline\n",
+    "inference_configured = inference.with_options(**pipeline_settings)\n",
+    "\n",
+    "# Let's run the batch inference pipeline\n",
+    "# We need to pass in the ID of the preprocessing done in the feature engineering pipeline\n",
+    "# in order to avoid training-serving skew\n",
+    "inference_configured(\n",
+    "    preprocess_pipeline_id=preprocessing_pipeline_artifact_version.id\n",
+    ")"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "2935d1fa",
+   "metadata": {},
+   "source": [
+    "ZenML automatically links all artifacts to the `production` model version as well, including the predictions\n",
+    "that were returned in the pipeline. This completes the MLOps loop of training to inference:"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "e191d019",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Fetch production model\n",
+    "production_model_version = client.get_model_version(\"breast_cancer_classifier\", \"production\")\n",
+    "\n",
+    "# Get the predictions artifact\n",
+    "production_model_version.get_artifact(\"predictions\").load()"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "b0a73cdf",
+   "metadata": {},
+   "source": [
+    "You can also see all predictions ever created as a complete history in the dashboard:\n",
+    "\n",
+    "\"Model"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "594ee4fc-f102-4b99-bdc3-2f1670c87679",
+   "metadata": {},
+   "source": [
+    "## Congratulations!\n",
+    "\n",
+    "You're a legit MLOps engineer now! You trained two models, evaluated them against\n",
+    "a test set, registered the best one with the ZenML model control plane,\n",
+    "and served some predictions. You also learned how to iterate on your models and\n",
+    "data by using some of the ZenML utility abstractions. You saw how to view your\n",
+    "artifacts and models via the client as well as the ZenML Dashboard.\n",
+    "\n",
+    "## Further exploration\n",
+    "\n",
+    "This was just the tip of the iceberg of what ZenML can do; check out the [**docs**](https://docs.zenml.io/) to learn more\n",
+    "about the capabilities of ZenML. For example, you might want to:\n",
+    "\n",
+    "- [Deploy ZenML](https://docs.zenml.io/user-guide/production-guide/connect-deployed-zenml) to collaborate with your colleagues.\n",
+    "- Run the same pipeline on a [cloud MLOps stack in production](https://docs.zenml.io/user-guide/production-guide/cloud-stack).\n",
+    "- Track your metrics in an experiment tracker like [MLflow](https://docs.zenml.io/stacks-and-components/component-guide/experiment-trackers/mlflow).\n",
+    "\n",
+    "## What next?\n",
+    "\n",
+    "* If you have questions or feedback... join our [**Slack Community**](https://zenml.io/slack) and become part of the ZenML family!\n",
+    "* If you want to quickly get started with ZenML, check out the [ZenML Cloud](https://zenml.io/cloud).\n",
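+    "\n",
+    "As a parting sketch (not part of the template), here is roughly how you could consume the promoted model from a plain\n",
+    "Python script, outside of any pipeline, assuming both the `model` and the `preprocess_pipeline` artifacts are linked\n",
+    "to the `production` version (which is how the project template's `run.py` consumes them):\n",
+    "\n",
+    "```python\n",
+    "from zenml.client import Client\n",
+    "\n",
+    "client = Client()\n",
+    "version = client.get_model_version(\"breast_cancer_classifier\", \"production\")\n",
+    "\n",
+    "# Load the trained classifier and the fitted preprocessing pipeline\n",
+    "classifier = version.get_artifact(\"model\").load()\n",
+    "preprocess_pipeline = version.get_artifact(\"preprocess_pipeline\").load()\n",
+    "\n",
+    "# For new raw data, mirror the `inference_preprocessor` step: add a dummy\n",
+    "# `target` column, call `preprocess_pipeline.transform(...)`, drop the column\n",
+    "# again, and finally call `classifier.predict(...)` on the result.\n",
+    "```"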
+ ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3 (ipykernel)", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.9.13" + } + }, + "nbformat": 4, + "nbformat_minor": 5 +} diff --git a/template/requirements.txt b/template/requirements.txt new file mode 100644 index 0000000..060cab4 --- /dev/null +++ b/template/requirements.txt @@ -0,0 +1,4 @@ +zenml[server]>=0.50.0 +notebook +scikit-learn<1.3 +pyarrow diff --git a/template/run.py b/template/run.py new file mode 100644 index 0000000..c3089fd --- /dev/null +++ b/template/run.py @@ -0,0 +1,221 @@ +# {% include 'template/license_header' %} + +import os +from typing import Optional + +import click +import yaml +from pipelines import ( + feature_engineering, + inference, + training, +) +from zenml.client import Client +from zenml.logger import get_logger + +logger = get_logger(__name__) + + +@click.command( + help=""" +ZenML Starter project. + +Run the ZenML starter project with basic options. + +Examples: + + \b + # Run the feature engineering pipeline + python run.py --feature-pipeline + + \b + # Run the training pipeline + python run.py --training-pipeline + + \b + # Run the training pipeline with versioned artifacts + python run.py --training-pipeline --train-dataset-version-name=1 --test-dataset-version-name=1 + + \b + # Run the inference pipeline + python run.py --inference-pipeline + +""" +) +@click.option( + "--train-dataset-name", + default="dataset_trn", + type=click.STRING, + help="The name of the train dataset produced by feature engineering.", +) +@click.option( + "--train-dataset-version-name", + default=None, + type=click.STRING, + help="Version of the train dataset produced by feature engineering. " + "If not specified, a new version will be created.", +) +@click.option( + "--test-dataset-name", + default="dataset_tst", + type=click.STRING, + help="The name of the test dataset produced by feature engineering.", +) +@click.option( + "--test-dataset-version-name", + default=None, + type=click.STRING, + help="Version of the test dataset produced by feature engineering. " + "If not specified, a new version will be created.", +) +@click.option( + "--feature-pipeline", + is_flag=True, + default=False, + help="Whether to run the pipeline that creates the dataset.", +) +@click.option( + "--training-pipeline", + is_flag=True, + default=False, + help="Whether to run the pipeline that trains the model.", +) +@click.option( + "--inference-pipeline", + is_flag=True, + default=False, + help="Whether to run the pipeline that performs inference.", +) +@click.option( + "--no-cache", + is_flag=True, + default=False, + help="Disable caching for the pipeline run.", +) +def main( + train_dataset_name: str = "dataset_trn", + train_dataset_version_name: Optional[str] = None, + test_dataset_name: str = "dataset_tst", + test_dataset_version_name: Optional[str] = None, + feature_pipeline: bool = False, + training_pipeline: bool = False, + inference_pipeline: bool = False, + no_cache: bool = False, +): + """Main entry point for the pipeline execution. 
+ + This entrypoint is where everything comes together: + + * configuring pipeline with the required parameters + (some of which may come from command line arguments, but most + of which comes from the YAML config files) + * launching the pipeline + + Args: + train_dataset_name: The name of the train dataset produced by feature engineering. + train_dataset_version_name: Version of the train dataset produced by feature engineering. + If not specified, a new version will be created. + test_dataset_name: The name of the test dataset produced by feature engineering. + test_dataset_version_name: Version of the test dataset produced by feature engineering. + If not specified, a new version will be created. + feature_pipeline: Whether to run the pipeline that creates the dataset. + training_pipeline: Whether to run the pipeline that trains the model. + inference_pipeline: Whether to run the pipeline that performs inference. + no_cache: If `True` cache will be disabled. + """ + client = Client() + + config_folder = os.path.join( + os.path.dirname(os.path.realpath(__file__)), + "configs", + ) + + # Execute Feature Engineering Pipeline + if feature_pipeline: + pipeline_args = {} + if no_cache: + pipeline_args["enable_cache"] = False + pipeline_args["config_path"] = os.path.join( + config_folder, "feature_engineering.yaml" + ) + run_args_feature = {} + feature_engineering.with_options(**pipeline_args)(**run_args_feature) + logger.info("Feature Engineering pipeline finished successfully!\n") + + train_dataset_artifact = client.get_artifact_version(train_dataset_name) + test_dataset_artifact = client.get_artifact_version(test_dataset_name) + logger.info( + "The latest feature engineering pipeline produced the following " + f"artifacts: \n\n1. Train Dataset - Name: {train_dataset_name}, " + f"Version Name: {train_dataset_artifact.version} \n2. 
Test Dataset: " + f"Name: {test_dataset_name}, Version Name: {test_dataset_artifact.version}" + ) + + # Execute Training Pipeline + if training_pipeline: + run_args_train = {} + + # If train_dataset_version_name is specified, use versioned artifacts + if train_dataset_version_name or test_dataset_version_name: + # However, both train and test dataset versions must be specified + assert ( + train_dataset_version_name is not None + and test_dataset_version_name is not None + ) + train_dataset_artifact_version = client.get_artifact_version( + train_dataset_name, train_dataset_version_name + ) + # If train dataset is specified, test dataset must be specified + test_dataset_artifact_version = client.get_artifact_version( + test_dataset_name, test_dataset_version_name + ) + # Use versioned artifacts + run_args_train["train_dataset_id"] = train_dataset_artifact_version.id + run_args_train["test_dataset_id"] = test_dataset_artifact_version.id + + # Run the SGD pipeline + pipeline_args = {} + if no_cache: + pipeline_args["enable_cache"] = False + pipeline_args["config_path"] = os.path.join(config_folder, "training_sgd.yaml") + training.with_options(**pipeline_args)(**run_args_train) + logger.info("Training pipeline with SGD finished successfully!\n\n") + + # Run the RF pipeline + pipeline_args = {} + if no_cache: + pipeline_args["enable_cache"] = False + pipeline_args["config_path"] = os.path.join(config_folder, "training_rf.yaml") + training.with_options(**pipeline_args)(**run_args_train) + logger.info("Training pipeline with RF finished successfully!\n\n") + + if inference_pipeline: + run_args_inference = {} + pipeline_args = {"enable_cache": False} + pipeline_args["config_path"] = os.path.join(config_folder, "inference.yaml") + + # Configure the pipeline + inference_configured = inference.with_options(**pipeline_args) + + # Fetch the production model + with open(pipeline_args["config_path"], "r") as f: + config = yaml.load(f, Loader=yaml.SafeLoader) + zenml_model = client.get_model_version( + config["model_version"]["name"], config["model_version"]["version"] + ) + preprocess_pipeline_artifact = zenml_model.get_artifact("preprocess_pipeline") + + # Use the metadata of feature engineering pipeline artifact + # to get the random state and target column + random_state = preprocess_pipeline_artifact.run_metadata["random_state"].value + target = preprocess_pipeline_artifact.run_metadata["target"].value + run_args_inference["random_state"] = random_state + run_args_inference["target"] = target + + # Run the pipeline + inference_configured(**run_args_inference) + logger.info("Inference pipeline finished successfully!") + + +if __name__ == "__main__": + main() diff --git a/template/steps/__init__.py b/template/steps/__init__.py new file mode 100644 index 0000000..aee0212 --- /dev/null +++ b/template/steps/__init__.py @@ -0,0 +1,26 @@ +# {% include 'template/license_header' %} + +from .data_loader import ( + data_loader, +) +from .data_preprocessor import ( + data_preprocessor, +) +from .data_splitter import ( + data_splitter, +) +from .inference_predict import ( + inference_predict, +) +from .inference_preprocessor import ( + inference_preprocessor, +) +from .model_evaluator import ( + model_evaluator, +) +from .model_promoter import ( + model_promoter, +) +from .model_trainer import ( + model_trainer, +) diff --git a/template/steps/data_loader.py b/template/steps/data_loader.py new file mode 100644 index 0000000..4df6b8a --- /dev/null +++ b/template/steps/data_loader.py @@ -0,0 +1,47 @@ +# {% include 
'template/license_header' %} + +import pandas as pd +from sklearn.datasets import load_breast_cancer +from typing_extensions import Annotated +from zenml import step +from zenml.logger import get_logger + +logger = get_logger(__name__) + + +@step +def data_loader( + random_state: int, is_inference: bool = False, target: str = "target" +) -> Annotated[pd.DataFrame, "dataset"]: + """Dataset reader step. + + This is an example of a dataset reader step that load Breast Cancer dataset. + + This step is parameterized, which allows you to configure the step + independently of the step code, before running it in a pipeline. + In this example, the step can be configured with number of rows and logic + to drop target column or not. See the documentation for more information: + + https://docs.zenml.io/user-guide/advanced-guide/configure-steps-pipelines + + Args: + random_state: Random state for sampling + is_inference: If `True` subset will be returned and target column + will be removed from dataset. + target: Name of target columns in dataset. + + Returns: + The dataset artifact as Pandas DataFrame and name of target column. + """ + dataset = load_breast_cancer(as_frame=True) + inference_size = int(len(dataset.target) * 0.05) + dataset: pd.DataFrame = dataset.frame + inference_subset = dataset.sample(inference_size, random_state=random_state) + if is_inference: + dataset = inference_subset + dataset.drop(columns=target, inplace=True) + else: + dataset.drop(inference_subset.index, inplace=True) + dataset.reset_index(drop=True, inplace=True) + logger.info(f"Dataset with {len(dataset)} records loaded!") + return dataset diff --git a/template/steps/data_preprocessor.py b/template/steps/data_preprocessor.py new file mode 100644 index 0000000..f343407 --- /dev/null +++ b/template/steps/data_preprocessor.py @@ -0,0 +1,74 @@ +# {% include 'template/license_header' %} + +from typing import List, Optional, Tuple + +import pandas as pd +from sklearn.pipeline import Pipeline +from sklearn.preprocessing import MinMaxScaler +from typing_extensions import Annotated +from utils.preprocess import ColumnsDropper, DataFrameCaster, NADropper +from zenml import log_artifact_metadata, step + + +@step +def data_preprocessor( + random_state: int, + dataset_trn: pd.DataFrame, + dataset_tst: pd.DataFrame, + drop_na: Optional[bool] = None, + normalize: Optional[bool] = None, + drop_columns: Optional[List[str]] = None, + target: Optional[str] = "target", +) -> Tuple[ + Annotated[pd.DataFrame, "dataset_trn"], + Annotated[pd.DataFrame, "dataset_tst"], + Annotated[Pipeline, "preprocess_pipeline"], +]: + """Data preprocessor step. + + This is an example of a data processor step that prepares the data so that + it is suitable for model training. It takes in a dataset as an input step + artifact and performs any necessary preprocessing steps like cleaning, + feature engineering, feature selection, etc. It then returns the processed + dataset as a step output artifact. + + This step is parameterized, which allows you to configure the step + independently of the step code, before running it in a pipeline. + In this example, the step can be configured to drop NA values, drop some + columns and normalize numerical columns. See the documentation for more + information: + + https://docs.zenml.io/user-guide/advanced-guide/configure-steps-pipelines + + Args: + random_state: Random state for sampling. + dataset_trn: The train dataset. + dataset_tst: The test dataset. + drop_na: If `True` all NA rows will be dropped. 
+ normalize: If `True` all numeric fields will be normalized. + drop_columns: List of column names to drop. + target: Name of target column in dataset. + + Returns: + The processed datasets (dataset_trn, dataset_tst) and fitted `Pipeline` object. + """ + # We use the sklearn pipeline to chain together multiple preprocessing steps + preprocess_pipeline = Pipeline([("passthrough", "passthrough")]) + if drop_na: + preprocess_pipeline.steps.append(("drop_na", NADropper())) + if drop_columns: + # Drop columns + preprocess_pipeline.steps.append(("drop_columns", ColumnsDropper(drop_columns))) + if normalize: + # Normalize the data + preprocess_pipeline.steps.append(("normalize", MinMaxScaler())) + preprocess_pipeline.steps.append(("cast", DataFrameCaster(dataset_trn.columns))) + dataset_trn = preprocess_pipeline.fit_transform(dataset_trn) + dataset_tst = preprocess_pipeline.transform(dataset_tst) + + # Log metadata so we can load it in the inference pipeline + log_artifact_metadata( + artifact_name="preprocess_pipeline", + metadata={"random_state": random_state, "target": target}, + ) + return dataset_trn, dataset_tst, preprocess_pipeline diff --git a/template/steps/data_splitter.py b/template/steps/data_splitter.py new file mode 100644 index 0000000..3b8eb57 --- /dev/null +++ b/template/steps/data_splitter.py @@ -0,0 +1,45 @@ +# {% include 'template/license_header' %} + +from typing import Tuple + +import pandas as pd +from sklearn.model_selection import train_test_split +from typing_extensions import Annotated +from zenml import step + + +@step +def data_splitter( + dataset: pd.DataFrame, test_size: float = 0.2 +) -> Tuple[ + Annotated[pd.DataFrame, "raw_dataset_trn"], + Annotated[pd.DataFrame, "raw_dataset_tst"], +]: + """Dataset splitter step. + + This is an example of a dataset splitter step that splits the data + into train and test set before passing it to ML model. + + This step is parameterized, which allows you to configure the step + independently of the step code, before running it in a pipeline. + In this example, the step can be configured to use different test + set sizes. See the documentation for more information: + + https://docs.zenml.io/user-guide/advanced-guide/configure-steps-pipelines + + Args: + dataset: Dataset read from source. + test_size: 0.0..1.0 defining portion of test set. + + Returns: + The split dataset: dataset_trn, dataset_tst. + """ + dataset_trn, dataset_tst = train_test_split( + dataset, + test_size=test_size, + random_state=42, + shuffle=True, + ) + dataset_trn = pd.DataFrame(dataset_trn, columns=dataset.columns) + dataset_tst = pd.DataFrame(dataset_tst, columns=dataset.columns) + return dataset_trn, dataset_tst diff --git a/template/steps/inference_predict.py b/template/steps/inference_predict.py new file mode 100644 index 0000000..60f9267 --- /dev/null +++ b/template/steps/inference_predict.py @@ -0,0 +1,56 @@ +# Apache Software License 2.0 +# +# Copyright (c) ZenML GmbH 2023. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+# + +from typing import Any + +import pandas as pd +from typing_extensions import Annotated +from zenml import step +from zenml.logger import get_logger + +logger = get_logger(__name__) + + +@step +def inference_predict( + model: Any, + dataset_inf: pd.DataFrame, +) -> Annotated[pd.Series, "predictions"]: + """Predictions step. + + This is an example of a predictions step that takes the data and model in + and returns predicted values. + + This step is parameterized, which allows you to configure the step + independently of the step code, before running it in a pipeline. + In this example, the step can be configured to use different input data. + See the documentation for more information: + + https://docs.zenml.io/user-guide/advanced-guide/configure-steps-pipelines + + Args: + model: Trained model. + dataset_inf: The inference dataset. + + Returns: + The predictions as pandas series + """ + # run prediction from memory + predictions = model.predict(dataset_inf) + + predictions = pd.Series(predictions, name="predicted") + return predictions diff --git a/template/steps/inference_preprocessor.py b/template/steps/inference_preprocessor.py new file mode 100644 index 0000000..d12247e --- /dev/null +++ b/template/steps/inference_preprocessor.py @@ -0,0 +1,49 @@ +# Apache Software License 2.0 +# +# Copyright (c) ZenML GmbH 2023. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# + +import pandas as pd +from sklearn.pipeline import Pipeline +from typing_extensions import Annotated +from zenml import step + + +@step +def inference_preprocessor( + dataset_inf: pd.DataFrame, + preprocess_pipeline: Pipeline, + target: str, +) -> Annotated[pd.DataFrame, "inference_dataset"]: + """Data preprocessor step. + + This is an example of a data processor step that prepares the data so that + it is suitable for model inference. It takes in a dataset as an input step + artifact and performs any necessary preprocessing steps based on pretrained + preprocessing pipeline. + + Args: + dataset_inf: The inference dataset. + preprocess_pipeline: Pretrained `Pipeline` to process dataset. + target: Name of target columns in dataset. + + Returns: + The processed dataframe: dataset_inf. 
+ """ + # artificially adding `target` column to avoid Pipeline issues + dataset_inf[target] = pd.Series([1] * dataset_inf.shape[0]) + dataset_inf = preprocess_pipeline.transform(dataset_inf) + dataset_inf.drop(columns=[target], inplace=True) + return dataset_inf diff --git a/template/steps/model_evaluator.py b/template/steps/model_evaluator.py new file mode 100644 index 0000000..426d26e --- /dev/null +++ b/template/steps/model_evaluator.py @@ -0,0 +1,86 @@ +# {% include 'template/license_header' %} + +from typing import Optional + +import pandas as pd +from sklearn.base import ClassifierMixin +from zenml import log_artifact_metadata, step +from zenml.logger import get_logger + +logger = get_logger(__name__) + + +@step +def model_evaluator( + model: ClassifierMixin, + dataset_trn: pd.DataFrame, + dataset_tst: pd.DataFrame, + min_train_accuracy: float = 0.0, + min_test_accuracy: float = 0.0, + target: Optional[str] = "target", +) -> float: + """Evaluate a trained model. + + This is an example of a model evaluation step that takes in a model artifact + previously trained by another step in your pipeline, and a training + and validation data set pair which it uses to evaluate the model's + performance. The model metrics are then returned as step output artifacts + (in this case, the model accuracy on the train and test set). + + The suggested step implementation also outputs some warnings if the model + performance does not meet some minimum criteria. This is just an example of + how you can use steps to monitor your model performance and alert you if + something goes wrong. As an alternative, you can raise an exception in the + step to force the pipeline run to fail early and all subsequent steps to + be skipped. + + This step is parameterized to configure the step independently of the step code, + before running it in a pipeline. In this example, the step can be configured + to use different values for the acceptable model performance thresholds and + to control whether the pipeline run should fail if the model performance + does not meet the minimum criteria. See the documentation for more + information: + + https://docs.zenml.io/user-guide/advanced-guide/configure-steps-pipelines + + Args: + model: The pre-trained model artifact. + dataset_trn: The train dataset. + dataset_tst: The test dataset. + min_train_accuracy: Minimal acceptable training accuracy value. + min_test_accuracy: Minimal acceptable testing accuracy value. + target: Name of target column in dataset. + + Returns: + The model accuracy on the test set. + """ + # Calculate the model accuracy on the train and test set + trn_acc = model.score( + dataset_trn.drop(columns=[target]), + dataset_trn[target], + ) + tst_acc = model.score( + dataset_tst.drop(columns=[target]), + dataset_tst[target], + ) + logger.info(f"Train accuracy={trn_acc*100:.2f}%") + logger.info(f"Test accuracy={tst_acc*100:.2f}%") + + messages = [] + if trn_acc < min_train_accuracy: + messages.append( + f"Train accuracy {trn_acc*100:.2f}% is below {min_train_accuracy*100:.2f}% !" + ) + if tst_acc < min_test_accuracy: + messages.append( + f"Test accuracy {tst_acc*100:.2f}% is below {min_test_accuracy*100:.2f}% !" 
+        )
+
+    for message in messages:
+        logger.warning(message)
+
+    log_artifact_metadata(
+        metadata={"train_accuracy": float(trn_acc), "test_accuracy": float(tst_acc)},
+        artifact_name="model",
+    )
+    return float(tst_acc)
diff --git a/template/steps/model_promoter.py b/template/steps/model_promoter.py
new file mode 100644
index 0000000..d00b773
--- /dev/null
+++ b/template/steps/model_promoter.py
@@ -0,0 +1,61 @@
+# {% include 'template/license_header' %}
+
+from zenml import get_step_context, step
+from zenml.client import Client
+from zenml.logger import get_logger
+
+logger = get_logger(__name__)
+
+
+@step
+def model_promoter(accuracy: float, stage: str = "production") -> bool:
+    """Model promoter step.
+
+    This is an example of a step that conditionally promotes a model. It takes
+    in the accuracy of the model and the stage to promote the model to. If the
+    accuracy is below 80%, the model is not promoted. If it is above 80%, the
+    model is promoted to the stage indicated in the parameters. If there is
+    already a model in the indicated stage, the model with the higher accuracy
+    is promoted.
+
+    Args:
+        accuracy: Accuracy of the model.
+        stage: Which stage to promote the model to.
+
+    Returns:
+        Whether the model was promoted or not.
+    """
+    is_promoted = False
+
+    if accuracy < 0.8:
+        logger.info(
+            f"Model accuracy {accuracy*100:.2f}% is below 80% ! Not promoting model."
+        )
+    else:
+        logger.info(f"Model promoted to {stage}!")
+        is_promoted = True
+
+        # Get the model in the current context
+        current_model_version = get_step_context().model_version
+
+        # Get the model that is in the production stage
+        client = Client()
+        try:
+            stage_model_version = client.get_model_version(
+                current_model_version.name, stage
+            )
+            # We compare their metrics
+            prod_accuracy = (
+                stage_model_version.get_artifact("model")
+                .run_metadata["test_accuracy"]
+                .value
+            )
+            if float(accuracy) > float(prod_accuracy):
+                # If current model has better metrics, we promote it
+                is_promoted = True
+                current_model_version.set_stage(stage, force=True)
+        except KeyError:
+            # If no such model exists, current one is promoted
+            is_promoted = True
+            current_model_version.set_stage(stage, force=True)
+    return is_promoted
diff --git a/template/steps/model_trainer.py b/template/steps/model_trainer.py
new file mode 100644
index 0000000..c351de6
--- /dev/null
+++ b/template/steps/model_trainer.py
@@ -0,0 +1,54 @@
+# {% include 'template/license_header' %}
+
+from typing import Optional
+
+import pandas as pd
+from sklearn.base import ClassifierMixin
+from sklearn.ensemble import RandomForestClassifier
+from sklearn.linear_model import SGDClassifier
+from typing_extensions import Annotated
+from zenml import ArtifactConfig, step
+from zenml.logger import get_logger
+
+logger = get_logger(__name__)
+
+
+@step
+def model_trainer(
+    dataset_trn: pd.DataFrame,
+    model_type: str = "sgd",
+    target: Optional[str] = "target",
+) -> Annotated[ClassifierMixin, ArtifactConfig(name="model", is_model_artifact=True)]:
+    """Configure and train a model on the training dataset.
+
+    This is an example of a model training step that takes in a dataset artifact
+    previously loaded and pre-processed by other steps in your pipeline, then
+    configures and trains a model on it. The model is then returned as a step
+    output artifact.
+
+    Args:
+        dataset_trn: The preprocessed train dataset.
+        model_type: The type of model to train.
+        target: The name of the target column in the dataset.
+
+    Returns:
+        The trained model artifact.
+ + Raises: + ValueError: If the model type is not supported. + """ + # Initialize the model with the hyperparameters indicated in the step + # parameters and train it on the training set. + if model_type == "sgd": + model = SGDClassifier() + elif model_type == "rf": + model = RandomForestClassifier() + else: + raise ValueError(f"Unknown model type {model_type}") + logger.info(f"Training model {model}...") + + model.fit( + dataset_trn.drop(columns=[target]), + dataset_trn[target], + ) + return model diff --git a/template/utils/__init__.py b/template/utils/__init__.py new file mode 100644 index 0000000..4bc11e5 --- /dev/null +++ b/template/utils/__init__.py @@ -0,0 +1 @@ +# {% include 'template/license_header' %} diff --git a/template/utils/preprocess.py b/template/utils/preprocess.py new file mode 100644 index 0000000..2dd4a85 --- /dev/null +++ b/template/utils/preprocess.py @@ -0,0 +1,41 @@ +# {% include 'template/license_header' %} + +from typing import Union + +import pandas as pd + + +class NADropper: + """Support class to drop NA values in sklearn Pipeline.""" + + def fit(self, *args, **kwargs): + return self + + def transform(self, X: Union[pd.DataFrame, pd.Series]): + return X.dropna() + + +class ColumnsDropper: + """Support class to drop specific columns in sklearn Pipeline.""" + + def __init__(self, columns): + self.columns = columns + + def fit(self, *args, **kwargs): + return self + + def transform(self, X: Union[pd.DataFrame, pd.Series]): + return X.drop(columns=self.columns) + + +class DataFrameCaster: + """Support class to cast type back to pd.DataFrame in sklearn Pipeline.""" + + def __init__(self, columns): + self.columns = columns + + def fit(self, *args, **kwargs): + return self + + def transform(self, X): + return pd.DataFrame(X, columns=self.columns) diff --git a/template/{% if open_source_license %}LICENSE{% endif %} b/template/{% if open_source_license %}LICENSE{% endif %} new file mode 100644 index 0000000..3de332b --- /dev/null +++ b/template/{% if open_source_license %}LICENSE{% endif %} @@ -0,0 +1 @@ +{% include 'template/license' %} \ No newline at end of file diff --git a/template/{{ _copier_conf.answers_file }} b/template/{{ _copier_conf.answers_file }} new file mode 100644 index 0000000..ea97bd4 --- /dev/null +++ b/template/{{ _copier_conf.answers_file }} @@ -0,0 +1,2 @@ +# Changes here will be overwritten by Copier +{{ _copier_answers|to_nice_yaml -}} \ No newline at end of file diff --git a/templates/globals.j2 b/templates/globals.j2 deleted file mode 100644 index cef7b18..0000000 --- a/templates/globals.j2 +++ /dev/null @@ -1,14 +0,0 @@ -{# -This is where global variables can be defined, which then can be used in one -or more templates. This is especially useful for variables that are generated -randomly, such as random names and passwords. 
- -For example, if the variable "foo" is defined here: - - {% set foo = "bar" %} - -it can be used in any template by including this file at the top: - - {% include "global_variables" %} - This is my foo: {{ foo }} -#} diff --git a/templates/license_header b/templates/license_header deleted file mode 100644 index e02240c..0000000 --- a/templates/license_header +++ /dev/null @@ -1,2 +0,0 @@ -{%- macro license() %}{% include 'templates/license' %}{% endmacro -%} -# {{ license() | replace('\n', '\n# ') }} \ No newline at end of file diff --git a/tests/conftest.py b/tests/conftest.py index 6234e9e..fa9049a 100644 --- a/tests/conftest.py +++ b/tests/conftest.py @@ -31,10 +31,6 @@ def configure_stack(): if stack_name == "local": components = {} for component in [ - ("mlflow_local", "mlflow", StackComponentType.EXPERIMENT_TRACKER), - ("mlflow_local", "mlflow", StackComponentType.MODEL_REGISTRY), - ("mlflow_local", "mlflow", StackComponentType.MODEL_DEPLOYER), - ("evidently", "evidently", StackComponentType.DATA_VALIDATOR), ("local", "local", StackComponentType.ORCHESTRATOR), ("local", "local", StackComponentType.ARTIFACT_STORE), ]: diff --git a/tests/test_starter_template.py b/tests/test_starter_template.py index c7ac609..a164db3 100644 --- a/tests/test_starter_template.py +++ b/tests/test_starter_template.py @@ -11,51 +11,40 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express # or implied. See the License for the specific language governing # permissions and limitations under the License. + + +import os +import pathlib +import platform import shutil import subprocess import sys from typing import Optional -from copier import Worker -import os -import pathlib -import pytest -from zenml.enums import ExecutionStatus -from zenml.post_execution import get_pipeline +import pytest +from copier import Worker from zenml.client import Client +from zenml.enums import ExecutionStatus -TEMPLATE_DIRECTORY = str(pathlib.Path(__file__).parent.parent) +TEMPLATE_DIRECTORY = str(pathlib.Path.joinpath(pathlib.Path(__file__).parent.parent)) def generate_and_run_project( tmp_path_factory: pytest.TempPathFactory, open_source_license: Optional[str] = "apache", - auto_format: bool = True, - use_step_params: bool = True, - use_custom_artifacts: bool = True, - configurable_dataset: bool = True, - configurable_model: bool = True, - sklearn_dataset_name: str = "wine", - sklearn_model_name: str = "SVC", - pipeline_name: str = "model_training_pipeline", + product_name: str = "starter_project", ): """Generate and run the starter project with different options.""" answers = { - "template": "starter", - "project_name": "Pytest Starter", + "project_name": "Pytest Templated Project", "version": "0.0.1", "open_source_license": str(open_source_license).lower(), - "email": "pytest@zenml.io", - "full_name": "Pytest", - "auto_format": auto_format, - "use_custom_artifacts": use_custom_artifacts, - "use_step_params": use_step_params, - "configurable_dataset": configurable_dataset, - "configurable_model": configurable_model, - "sklearn_dataset_name": sklearn_dataset_name, - "sklearn_model_name": sklearn_model_name, + "product_name": product_name, } + if open_source_license: + answers["email"] = "pytest@zenml.io" + answers["full_name"] = "Pytest" # generate the template in a temp path current_dir = os.getcwd() @@ -66,38 +55,50 @@ def generate_and_run_project( dst_path=str(dst_path), data=answers, unsafe=True, + vcs_ref="HEAD", ) as worker: worker.run_copy() # run the project - call = [sys.executable, "run.py"] + call = [ + 
sys.executable, + "run.py", + "--training-pipeline", + "--feature-pipeline", + "--inference-pipeline", + "--no-cache" + ] try: - subprocess.check_call( + subprocess.check_output( call, cwd=str(dst_path), env=os.environ.copy(), + stderr=subprocess.STDOUT, ) - except Exception as e: + except subprocess.CalledProcessError as e: raise RuntimeError( - f"Failed to run project generated with parameters: {answers}" + f"Failed to run project generated with parameters: {answers}\n" + f"{e.output.decode()}" ) from e # check the pipeline run is successful - pipeline = get_pipeline(pipeline_name) - assert pipeline - runs = pipeline.runs - assert len(runs) == 1 - assert runs[0].status == ExecutionStatus.COMPLETED + for pipeline_name in ["training", "inference", "feature_engineering"]: + pipeline = Client().get_pipeline(pipeline_name) + assert pipeline + runs = pipeline.runs + assert len(runs) == 1 + assert runs[0].status == ExecutionStatus.COMPLETED - # clean up - Client().delete_pipeline(pipeline_name) + # clean up + Client().delete_pipeline(pipeline_name) + Client().delete_model("breast_cancer_classifier") os.chdir(current_dir) shutil.rmtree(dst_path) -@pytest.mark.parametrize("open_source_license", ["mit", None]) +@pytest.mark.parametrize("open_source_license", ["mit", None], ids=["oss", "css"]) def test_generate_license( clean_zenml_client, tmp_path_factory: pytest.TempPathFactory, @@ -111,75 +112,13 @@ def test_generate_license( ) -def test_no_auto_format( - clean_zenml_client, - tmp_path_factory: pytest.TempPathFactory, -): - """Test turning off code auto-format.""" - - generate_and_run_project( - tmp_path_factory=tmp_path_factory, - auto_format=False, - ) - - -@pytest.mark.parametrize("use_custom_artifacts", [True, False]) -@pytest.mark.parametrize("sklearn_dataset_name", ["wine", "iris"]) -@pytest.mark.parametrize( - "sklearn_model_name", - [ - "SGDClassifier", - "DecisionTreeClassifier", - ], -) -def test_step_params_disabled( - clean_zenml_client, - tmp_path_factory: pytest.TempPathFactory, - use_custom_artifacts: bool, - sklearn_dataset_name: str, - sklearn_model_name: str, -): - """Test generating the starter template with step parameters disabled .""" - - generate_and_run_project( - tmp_path_factory=tmp_path_factory, - use_step_params=False, - use_custom_artifacts=use_custom_artifacts, - configurable_dataset=False, - configurable_model=False, - sklearn_dataset_name=sklearn_dataset_name, - sklearn_model_name=sklearn_model_name, - ) - - -@pytest.mark.parametrize("use_custom_artifacts", [True, False]) -@pytest.mark.parametrize("configurable_dataset", [True, False]) -@pytest.mark.parametrize("configurable_model", [True, False]) -@pytest.mark.parametrize("sklearn_dataset_name", ["iris", "breast_cancer"]) -@pytest.mark.parametrize( - "sklearn_model_name", - [ - "RandomForestClassifier", - "KNeighborsClassifier", - ], -) -def test_step_params_enabled( +def test_custom_product_name( clean_zenml_client, tmp_path_factory: pytest.TempPathFactory, - use_custom_artifacts: bool, - configurable_dataset: bool, - configurable_model: bool, - sklearn_dataset_name: str, - sklearn_model_name: str, ): - """Test generating the starter template with step parameters enabled .""" + """Test using custom pipeline name.""" generate_and_run_project( tmp_path_factory=tmp_path_factory, - use_step_params=False, - use_custom_artifacts=use_custom_artifacts, - configurable_dataset=configurable_dataset, - configurable_model=configurable_model, - sklearn_dataset_name=sklearn_dataset_name, - 
sklearn_model_name=sklearn_model_name, + product_name="custom_product_name", )