diff --git a/examples/sklearn-pipeline.ipynb b/examples/sklearn-pipeline.ipynb
new file mode 100644
index 0000000..0cbc5ee
--- /dev/null
+++ b/examples/sklearn-pipeline.ipynb
@@ -0,0 +1,416 @@
+{
+ "cells": [
+  {
+   "cell_type": "code",
+   "execution_count": 1,
+   "id": "dc877003",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "#import pip\n",
+    "#pip.main(['install', 'skl2onnx', 'pydot', 'ebm2onnx', 'onnxruntime'])"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 2,
+   "id": "0c514309",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "import onnx\n",
+    "import numpy as np\n",
+    "import pandas as pd\n",
+    "import matplotlib.pyplot as plt\n",
+    "\n",
+    "from interpret import glassbox\n",
+    "from sklearn import compose, impute, pipeline, preprocessing\n",
+    "\n",
+    "from onnx.tools.net_drawer import GetOpNodeProducer, GetPydotGraph\n",
+    "from skl2onnx import convert_sklearn, update_registered_converter\n",
+    "from skl2onnx.common.shape_calculator import (\n",
+    "    calculate_linear_classifier_output_shapes,\n",
+    ")\n",
+    "\n",
+    "from skl2onnx.common.data_types import Int64TensorType, FloatTensorType, StringTensorType\n",
+    "from skl2onnx.algebra.onnx_operator import OnnxSubEstimator\n",
+    "\n",
+    "import ebm2onnx\n",
+    "import ebm2onnx.operators as ops\n",
+    "\n",
+    "import onnxruntime as rt"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "55dc0909",
+   "metadata": {},
+   "source": [
+    "# Train a model with feature engineering\n",
+    "\n",
+    "We use a scikit-learn pipeline for the feature-engineering part. The whole pipeline is serialized into the final ONNX graph, so the graph contains both the model and the feature-engineering transforms.\n",
+    "\n",
+    "We use a small synthetic dataset as an example."
+   ]
+  },
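+  {
+   "cell_type": "markdown",
+   "id": "1a2b3c4d",
+   "metadata": {},
+   "source": [
+    "Before the EBM-specific part, here is a minimal, self-contained sketch of the same idea on a plain scikit-learn pipeline (the names and toy data are purely illustrative, not part of the EBM example): `convert_sklearn` serializes every step of the pipeline into a single ONNX graph, which `onnxruntime` can then execute without scikit-learn."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "5e6f7a8b",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Minimal sketch: serialize a whole pipeline (scaler + model) to ONNX.\n",
+    "# The EBM pipeline below additionally needs a custom converter on top of this.\n",
+    "from sklearn.linear_model import LogisticRegression\n",
+    "\n",
+    "demo_pipeline = pipeline.Pipeline(\n",
+    "    [\n",
+    "        (\"scaler\", preprocessing.StandardScaler()),\n",
+    "        (\"model\", LogisticRegression()),\n",
+    "    ]\n",
+    ")\n",
+    "X_demo = np.array([[0, 1], [1, 0], [2, 3], [3, 2]], dtype=np.float32)\n",
+    "y_demo = np.array([0, 0, 1, 1])\n",
+    "demo_pipeline.fit(X_demo, y_demo)\n",
+    "\n",
+    "# Both the scaler and the classifier end up in one ONNX graph.\n",
+    "demo_onnx = convert_sklearn(\n",
+    "    demo_pipeline,\n",
+    "    initial_types=[(\"input\", FloatTensorType([None, 2]))],\n",
+    ")\n",
+    "sess = rt.InferenceSession(demo_onnx.SerializeToString(), providers=[\"CPUExecutionProvider\"])\n",
+    "print(sess.run(None, {\"input\": X_demo}))"
+   ]
+  },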
(\n", + " \"num_mean\",\n", + " numeric_mean_transformer,\n", + " [\"feature_a\", \"feature_b\"],\n", + " ),\n", + " (\n", + " \"num_median\",\n", + " numeric_median_transformer,\n", + " [\"feature_c\", \"feature_d\"],\n", + " ),\n", + " (\"cat\", categorical_transformer, [\"feature_e\", \"feature_f\", \"feature_g\"]),\n", + " \n", + " ]\n", + ")\n", + "\n", + "my_pipeline = pipeline.Pipeline(\n", + " [\n", + " (\"preprocessor\", preprocessor),\n", + " (\n", + " \"model\",\n", + " glassbox.ExplainableBoostingClassifier(\n", + " max_bins=8,\n", + " min_samples_leaf=2,\n", + " max_leaves=2,\n", + " learning_rate=0.5,\n", + " validation_size=0.5,\n", + " early_stopping_rounds=5,\n", + " interactions=0,\n", + " random_state=42,\n", + " ),\n", + " ),\n", + " ]\n", + ")\n", + "\n" + ] + }, + { + "cell_type": "code", + "execution_count": 7, + "id": "789aa119", + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
+  {
+   "cell_type": "code",
+   "execution_count": 7,
+   "id": "789aa119",
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "text/html": [
+       "<pre>Pipeline(steps=[('preprocessor',\n",
+       "                 ColumnTransformer(transformers=[('num_mean',\n",
+       "                                                  Pipeline(steps=[('imputer', SimpleImputer()),\n",
+       "                                                                  ('scaler', StandardScaler())]),\n",
+       "                                                  ['feature_a', 'feature_b']),\n",
+       "                                                 ('num_median',\n",
+       "                                                  Pipeline(steps=[('imputer', SimpleImputer(strategy='median')),\n",
+       "                                                                  ('scaler', StandardScaler())]),\n",
+       "                                                  ['feature_c', 'feature_d']),\n",
+       "                                                 ('cat',\n",
+       "                                                  Pipeline(steps=[('onehot', OneHotEncoder(drop=[0, 0, 'can_not_determine'],\n",
+       "                                                                                           handle_unknown='ignore'))]),\n",
+       "                                                  ['feature_e', 'feature_f', 'feature_g'])])),\n",
+       "                ('model',\n",
+       "                 ExplainableBoostingClassifier(early_stopping_rounds=5, interactions=0,\n",
+       "                                               learning_rate=0.5, max_bins=8, max_leaves=2,\n",
+       "                                               validation_size=0.5))])</pre>"