diff --git a/examples/sklearn-pipeline.ipynb b/examples/sklearn-pipeline.ipynb
new file mode 100644
index 0000000..0cbc5ee
--- /dev/null
+++ b/examples/sklearn-pipeline.ipynb
@@ -0,0 +1,416 @@
+{
+ "cells": [
+  {
+   "cell_type": "code",
+   "execution_count": 1,
+   "id": "dc877003",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Uncomment the next line to install the dependencies into the kernel's environment.\n",
+    "# (The old `pip.main([...])` entry point was removed in pip 10; use the %pip magic instead.)\n",
+    "#%pip install skl2onnx pydot ebm2onnx onnxruntime"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 2,
+   "id": "0c514309",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "import onnx\n",
+    "import ebm2onnx\n",
+    "import numpy as np\n",
+    "import pandas as pd\n",
+    "import matplotlib.pyplot as plt\n",
+    "\n",
+    "from interpret import glassbox\n",
+    "from sklearn import compose, impute, pipeline, preprocessing\n",
+    "\n",
+    "from onnx.tools.net_drawer import GetOpNodeProducer, GetPydotGraph\n",
+    "from skl2onnx import convert_sklearn, update_registered_converter\n",
+    "from skl2onnx.common.shape_calculator import (\n",
+    "    calculate_linear_classifier_output_shapes,\n",
+    ")\n",
+    "\n",
+    "from skl2onnx.common.data_types import Int64TensorType, FloatTensorType, StringTensorType\n",
+    "from skl2onnx.algebra.onnx_operator import OnnxSubEstimator\n",
+    "\n",
+    "import ebm2onnx.operators as ops\n",
+    "\n",
+    "import onnxruntime as rt"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "55dc0909",
+   "metadata": {},
+   "source": [
+    "# Train a model with feature engineering\n",
+    "\n",
+    "We use a scikit-learn pipeline for the feature-engineering part. The whole pipeline is serialized into the final ONNX graph, so the graph contains both the feature-engineering transforms and the model.\n",
+    "\n",
+    "We use a small synthetic dataset here as an example."
+ ] + }, + { + "cell_type": "code", + "execution_count": 3, + "id": "ca66831f", + "metadata": {}, + "outputs": [], + "source": [ + "features = [\n", + " \"feature_a\",\n", + " \"feature_b\",\n", + " \"feature_c\",\n", + " \"feature_d\",\n", + " \"feature_e\",\n", + " \"feature_f\",\n", + " \"feature_g\",\n", + "]\n", + "\n", + "df_train = pd.DataFrame(\n", + " {\n", + " \"feature_a\": [0, 0.5, 2, 5],\n", + " \"feature_b\": [0, 0.5, 2, 5],\n", + " \"feature_c\": [0, 0.5, 2, 5],\n", + " \"feature_d\": [0, 0.5, 2, 5],\n", + " \"feature_e\": [0, 1, 0, 1],\n", + " \"feature_f\": [1, 0, 1, 0],\n", + " \"feature_g\": [\"a\", \"b\", \"can_not_determine\", \"can_not_determine\"],\n", + " \"target\": [1, 1, 0, 0],\n", + " }\n", + ")" + ] + }, + { + "cell_type": "code", + "execution_count": 6, + "id": "a4a0e64a", + "metadata": {}, + "outputs": [], + "source": [ + "numeric_mean_transformer = pipeline.Pipeline(\n", + " steps=[\n", + " (\"imputer\", impute.SimpleImputer(strategy=\"mean\")),\n", + " (\"scaler\", preprocessing.StandardScaler()),\n", + " ]\n", + ")\n", + "\n", + "numeric_median_transformer = pipeline.Pipeline(\n", + " steps=[\n", + " (\"imputer\", impute.SimpleImputer(strategy=\"median\")),\n", + " (\"scaler\", preprocessing.StandardScaler()),\n", + " ]\n", + ")\n", + "\n", + "categorical_transformer = pipeline.Pipeline(\n", + " steps=[\n", + " (\n", + " \"onehot\",\n", + " preprocessing.OneHotEncoder(\n", + " sparse=True,\n", + " # Assumes I have 2 bool and 1 cat feature, and I'm specifying what\n", + " # values I want to drop when one hot encoding.\n", + " drop=list([0, 0, \"can_not_determine\"]),\n", + " handle_unknown=\"ignore\",\n", + " ),\n", + " )\n", + " ]\n", + ")\n", + "\n", + "preprocessor = compose.ColumnTransformer(\n", + " transformers=[\n", + " (\n", + " \"num_mean\",\n", + " numeric_mean_transformer,\n", + " [\"feature_a\", \"feature_b\"],\n", + " ),\n", + " (\n", + " \"num_median\",\n", + " numeric_median_transformer,\n", + " [\"feature_c\", 
\"feature_d\"],\n", + " ),\n", + " (\"cat\", categorical_transformer, [\"feature_e\", \"feature_f\", \"feature_g\"]),\n", + " \n", + " ]\n", + ")\n", + "\n", + "my_pipeline = pipeline.Pipeline(\n", + " [\n", + " (\"preprocessor\", preprocessor),\n", + " (\n", + " \"model\",\n", + " glassbox.ExplainableBoostingClassifier(\n", + " max_bins=8,\n", + " min_samples_leaf=2,\n", + " max_leaves=2,\n", + " learning_rate=0.5,\n", + " validation_size=0.5,\n", + " early_stopping_rounds=5,\n", + " interactions=0,\n", + " random_state=42,\n", + " ),\n", + " ),\n", + " ]\n", + ")\n", + "\n" + ] + }, + { + "cell_type": "code", + "execution_count": 7, + "id": "789aa119", + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
Pipeline(steps=[('preprocessor',\n",
+       "                 ColumnTransformer(transformers=[('num_mean',\n",
+       "                                                  Pipeline(steps=[('imputer',\n",
+       "                                                                   SimpleImputer()),\n",
+       "                                                                  ('scaler',\n",
+       "                                                                   StandardScaler())]),\n",
+       "                                                  ['feature_a', 'feature_b']),\n",
+       "                                                 ('num_median',\n",
+       "                                                  Pipeline(steps=[('imputer',\n",
+       "                                                                   SimpleImputer(strategy='median')),\n",
+       "                                                                  ('scaler',\n",
+       "                                                                   StandardScaler())]),\n",
+       "                                                  ['feature_c', 'feature_d']),\n",
+       "                                                 ('cat',\n",
+       "                                                  Pipeline(steps=[('onehot',\n",
+       "                                                                   OneHotEncoder(drop=[0,\n",
+       "                                                                                       0,\n",
+       "                                                                                       'can_not_determine'],\n",
+       "                                                                                 handle_unknown='ignore'))]),\n",
+       "                                                  ['feature_e', 'feature_f',\n",
+       "                                                   'feature_g'])])),\n",
+       "                ('model',\n",
+       "                 ExplainableBoostingClassifier(early_stopping_rounds=5,\n",
+       "                                               interactions=0,\n",
+       "                                               learning_rate=0.5, max_bins=8,\n",
+       "                                               max_leaves=2,\n",
+       "                                               validation_size=0.5))])
In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
"
+      ],
+      "text/plain": [
+       "Pipeline(steps=[('preprocessor',\n",
+       " ColumnTransformer(transformers=[('num_mean',\n",
+       " Pipeline(steps=[('imputer',\n",
+       " SimpleImputer()),\n",
+       " ('scaler',\n",
+       " StandardScaler())]),\n",
+       " ['feature_a', 'feature_b']),\n",
+       " ('num_median',\n",
+       " Pipeline(steps=[('imputer',\n",
+       " SimpleImputer(strategy='median')),\n",
+       " ('scaler',\n",
+       " StandardScaler())]),\n",
+       " ['feature_c', 'feature_d']),\n",
+       " ('cat',\n",
+       " Pipeline(steps=[('onehot',\n",
+       " OneHotEncoder(drop=[0,\n",
+       " 0,\n",
+       " 'can_not_determine'],\n",
+       " handle_unknown='ignore'))]),\n",
+       " ['feature_e', 'feature_f',\n",
+       " 'feature_g'])])),\n",
+       " ('model',\n",
+       " ExplainableBoostingClassifier(early_stopping_rounds=5,\n",
+       " interactions=0,\n",
+       " learning_rate=0.5, max_bins=8,\n",
+       " max_leaves=2,\n",
+       " validation_size=0.5))])"
+      ]
+     },
+     "execution_count": 7,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "my_pipeline.fit(df_train[features], df_train[\"target\"])"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "b9a44354",
+   "metadata": {},
+   "source": [
+    "# Convert the pipeline to ONNX\n",
+    "\n",
+    "We register ebm2onnx's shape calculator and converter for `ExplainableBoostingClassifier` with skl2onnx. This allows the EBM to be serialized as part of the whole-pipeline conversion to ONNX."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 8,
+   "id": "cd639aab",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "update_registered_converter(\n",
+    "    glassbox.ExplainableBoostingClassifier,\n",
+    "    \"ExplainableBoostingClassifier\",\n",
+    "    ebm2onnx.sklearn.ebm_output_shape_calculator,\n",
+    "    ebm2onnx.sklearn.convert_ebm_classifier,\n",
+    "    options={\"nocl\": [True, False], \"zipmap\": [True, False, \"columns\"]},\n",
+    ")\n",
+    "\n",
+    "model_onnx = convert_sklearn(\n",
+    "    my_pipeline,\n",
+    "    \"pipeline_ebm\",\n",
+    "    [\n",
+    "        (\"feature_a\", FloatTensorType([None, 1])),\n",
+    "        (\"feature_b\", FloatTensorType([None, 1])),\n",
+    "        (\"feature_c\", FloatTensorType([None, 1])),\n",
+    "        (\"feature_d\", FloatTensorType([None, 1])),\n",
+    "        (\"feature_e\", Int64TensorType([None, 1])),\n",
+    "        (\"feature_f\", Int64TensorType([None, 1])),\n",
+    "        (\"feature_g\", StringTensorType([None, 1])),\n",
+    "    ],\n",
+    "    target_opset={\"\": 21, \"ai.onnx.ml\": 3},\n",
+    "    options={id(my_pipeline): {\"zipmap\": False}}\n",
+    ")"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "f5ce4f30",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "pydot_graph = GetPydotGraph(\n",
+    "    model_onnx.graph, name=model_onnx.graph.name, rankdir=\"TB\", node_producer=GetOpNodeProducer(\"docstring\")\n",
+    ")\n",
+    "pydot_graph.write_png(\"model.png\")\n",
+    "fig, ax = plt.subplots(figsize=(80, 30), dpi=300)\n",
+    "ax.imshow(plt.imread(\"model.png\"), interpolation=\"nearest\")\n",
+    "ax.set_axis_off()  # the image is a graph drawing, so pixel-coordinate ticks carry no information"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "22e0d3c6",
+   "metadata": {},
+   "source": [
+    "# Execute the pipeline with ONNX-Runtime"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 12,
+   "id": "798fe23c",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "with open(\"model.onnx\", \"wb\") as f:\n",
+    "    f.write(model_onnx.SerializeToString())"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 13,
+   "id": "95f9e081",
+   "metadata": {},
+   "outputs": [
+    {
+     "name":
"stdout",
+     "output_type": "stream",
+     "text": [
+      "label: [1 1 0 0]\n",
+      "probabilities: [[0.00554699 0.994453 ]\n",
+      " [0.26364148 0.7363585 ]\n",
+      " [0.990155 0.00984505]\n",
+      " [0.994453 0.00554699]]\n"
+     ]
+    }
+   ],
+   "source": [
+    "sess = rt.InferenceSession(\"model.onnx\", providers=[\"CPUExecutionProvider\"])\n",
+    "\n",
+    "# Every graph input was declared as a [None, 1] tensor, so each feature is fed as a column vector.\n",
+    "# The four numeric inputs were declared as FloatTensorType and therefore need a float32 cast.\n",
+    "float_inputs = [\"feature_a\", \"feature_b\", \"feature_c\", \"feature_d\"]\n",
+    "other_inputs = [\"feature_e\", \"feature_f\", \"feature_g\"]\n",
+    "\n",
+    "feeds = {name: df_train[name].values.reshape([-1, 1]).astype(np.float32) for name in float_inputs}\n",
+    "feeds.update({name: df_train[name].values.reshape([-1, 1]) for name in other_inputs})\n",
+    "\n",
+    "# Passing None as the output list asks the session for every output of the graph.\n",
+    "pred_onx = sess.run(None, feeds)\n",
+    "\n",
+    "# Pair each returned value with its output metadata to print it under the output's name.\n",
+    "for output_meta, output_value in zip(sess.get_outputs(), pred_onx):\n",
+    "    print(f\"{output_meta.name}: {output_value}\")"
+   ]
+  }
+ ],
+ "metadata": {
+  "kernelspec": {
+   "display_name": "Python 3 (ipykernel)",
+   "language": "python",
+   "name": "python3"
+  },
+  "language_info": {
+   "codemirror_mode": {
+    "name": "ipython",
+    "version": 3
+   },
+   "file_extension": ".py",
+   "mimetype": "text/x-python",
+   "name": "python",
+   "nbconvert_exporter": "python",
+   "pygments_lexer": "ipython3",
+   "version": "3.10.10"
+  }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 5
+}