From c4bf7ed0d8b7c6108e32ca74b37f02d14b739449 Mon Sep 17 00:00:00 2001
From: Romain Picard <romain.picard@softathome.com>
Date: Thu, 25 Jul 2024 18:32:59 +0200
Subject: [PATCH] do not mutate the ebm model during conversion

This prevents from using the ebm model correctly after the conversion.
Fixes #16

Signed-off-by: Romain Picard <romain.picard@softathome.com>
---
 ebm2onnx/convert.py   | 35 +++++++++++++++++------------------
 tests/test_convert.py |  9 ++++++++-
 2 files changed, 25 insertions(+), 19 deletions(-)

diff --git a/ebm2onnx/convert.py b/ebm2onnx/convert.py
index a5e4faa..bcd85b0 100644
--- a/ebm2onnx/convert.py
+++ b/ebm2onnx/convert.py
@@ -1,4 +1,4 @@
-from collections import namedtuple
+from copy import deepcopy
 from .utils import get_latest_opset_version
 from ebm2onnx import graph
 from ebm2onnx import ebm
@@ -10,7 +10,7 @@
 from interpret.glassbox import ExplainableBoostingClassifier, ExplainableBoostingRegressor
 
 
-onnx_type_for={
+onnx_type_for = {
     'bool': onnx.TensorProto.BOOL,
     'float': onnx.TensorProto.FLOAT,
     'double': onnx.TensorProto.DOUBLE,
@@ -23,6 +23,7 @@
     'True': '1',
 }
 
+
 def infer_features_dtype(dtype, feature_name):
     feature_dtype = onnx.TensorProto.DOUBLE
     if dtype is not None:
@@ -94,7 +95,6 @@ def to_onnx(model, dtype, name="ebm",
     target_opset = target_opset or get_latest_opset_version()
     root = graph.create_graph()
 
-    class_index=0
     inputs = [None for _ in model.feature_names_in_]
     parts = []
 
@@ -103,14 +103,16 @@ def to_onnx(model, dtype, name="ebm",
     for _ in range(interaction_count):
         feature_types.append('interaction')
 
+    model_bins = deepcopy(model.bins_)
+
     # first compute the score of each feature
     for feature_index in range(len(model.term_names_)):
-        feature_name=model.term_names_[feature_index]
-        feature_type=feature_types[feature_index]
-        feature_group=model.term_features_[feature_index]
+        feature_name = model.term_names_[feature_index]
+        feature_type = feature_types[feature_index]
+        feature_group = model.term_features_[feature_index]
 
         if feature_type == 'continuous':
-            bins = [-np.inf, -np.inf] + list(model.bins_[feature_group[0]][0])
+            bins = [-np.inf, -np.inf] + list(model_bins[feature_group[0]][0])
             additive_terms = model.term_scores_[feature_index]
 
             feature_dtype = infer_features_dtype(dtype, feature_name)
@@ -122,7 +124,7 @@ def to_onnx(model, dtype, name="ebm",
             parts.append(part)
 
         elif feature_type in ['nominal', 'ordinal']:
-            col_mapping = model.bins_[feature_group[0]][0]
+            col_mapping = model_bins[feature_group[0]][0]
             additive_terms = model.term_scores_[feature_index]
 
             feature_dtype = infer_features_dtype(dtype, feature_name)
@@ -133,8 +135,7 @@ def to_onnx(model, dtype, name="ebm",
                     bool_remap[k]: v
                     for k, v in col_mapping.items()
                 }
-                # replace inplace to re-use it in interactions
-                model.bins_[feature_group[0]][0] = col_mapping
+                model_bins[feature_group[0]][0] = col_mapping
             if feature_dtype != onnx.TensorProto.STRING:
                 part = ops.cast(onnx.TensorProto.STRING)(part)
             part = ops.flatten()(part)
@@ -156,13 +157,13 @@ def to_onnx(model, dtype, name="ebm",
                     # There may be one binning per interaction way or not.
                     # the rule is to use bins_ index if there is one binning available for the way count.
                     # otherwise, use the last binning for the feature
-                    bin_index = -1 if way_count > len(model.bins_[i_feature_index]) else way_count - 1
-                    bins = [-np.inf, -np.inf] + list(model.bins_[i_feature_index][bin_index])
+                    bin_index = -1 if way_count > len(model_bins[i_feature_index]) else way_count - 1
+                    bins = [-np.inf, -np.inf] + list(model_bins[i_feature_index][bin_index])
                     input = graph.strip_to_transients(inputs[i_feature_index])
                     i_parts.append(ebm.get_bin_index_on_continuous_value(bins)(input))
 
                 elif i_feature_type in ['nominal', 'ordinal']:
-                    col_mapping = model.bins_[i_feature_index][0]
+                    col_mapping = model_bins[i_feature_index][0]
                     input = graph.strip_to_transients(inputs[i_feature_index])
                     i_parts.append(ebm.get_bin_index_on_categorical_value(col_mapping)(input))
 
@@ -181,13 +182,13 @@ def to_onnx(model, dtype, name="ebm",
     g = graph.merge(*parts)
     if type(model) is ExplainableBoostingClassifier:
         class_type = onnx.TensorProto.STRING if model.classes_.dtype.type is np.str_ else onnx.TensorProto.INT64
-        classes=model.classes_
+        classes = model.classes_
         if class_type == onnx.TensorProto.STRING:
-            classes=[ c.encode("utf-8") for c in classes]
+            classes = [c.encode("utf-8") for c in classes]
 
         g, scores_output_name = ebm.compute_class_score(model.intercept_, explain_name)(g)
         g_scores = graph.strip_to_transients(g)
-        if len(model.classes_) == 2: # binary classification            
+        if len(model.classes_) == 2:  # binary classification
             g = ebm.predict_class(
                 classes=classes, class_type=class_type,
                 binary=True, prediction_name=prediction_name
@@ -221,7 +222,5 @@ def to_onnx(model, dtype, name="ebm",
     else:
         raise NotImplementedError("{} models are not supported".format(type(model)))
 
-
-
     model = graph.compile(g, target_opset, name=name)
     return model
diff --git a/tests/test_convert.py b/tests/test_convert.py
index 55ffc59..6d68302 100644
--- a/tests/test_convert.py
+++ b/tests/test_convert.py
@@ -168,7 +168,7 @@ def test_predict_regression_without_interactions(interactions, explain):
 
 @pytest.mark.parametrize("explain", [False, True])
 @pytest.mark.parametrize("interactions", [0, 2, [(0, 1, 2)], [(0, 1, 2, 3)]])
-@pytest.mark.parametrize("old_th", [65, 0])
+@pytest.mark.parametrize("old_th", [65, 35, 0])
 def test_predict_binary_classification_with_categorical(interactions, explain, old_th):
     model_ebm, x_test, y_test = train_titanic_binary_classification(
         interactions=interactions,
@@ -199,6 +199,13 @@ def test_predict_binary_classification_with_categorical(interactions, explain, o
 
     if explain is True:
         assert len(pred_onnx) == 2
+        local_explain = model_ebm.explain_local(x_test, y_test)
+        for i in range(len(x_test)):
+            assert np.allclose(
+                local_explain.data(i)['scores'],
+                pred_onnx[1][i][:, 0]
+            )
+
     assert np.allclose(pred_ebm, pred_onnx[0])