
Commit

using scikit-learn model instead of scikit-learn pipeline
maggiemhanna committed Jun 3, 2022
1 parent 1217954 commit b938f9c
Showing 2 changed files with 26 additions and 17 deletions.
9 changes: 4 additions & 5 deletions classical/aml-cli-v2/data-science/src/evaluate.py
@@ -205,18 +205,17 @@ def main():


# -------------------- Explainability ------------------- #
-tree_explainer = TabularExplainer(model.steps[-1][1],
+tabular_explainer = TabularExplainer(model,
initialization_examples=X_train,
-                  features=X_train.columns,
-                  transformations=model.steps[0][1])
+                  features=X_train.columns)

# save explainer
-#joblib.dump(tree_explainer, os.path.join(tree_explainer, "explainer"))
+#joblib.dump(tabular_explainer, os.path.join(tabular_explainer, "explainer"))

# find global explanations for feature importance
# you can use the training data or the test data here,
# but test data would allow you to use Explanation Exploration
-global_explanation = tree_explainer.explain_global(X_test)
+global_explanation = tabular_explainer.explain_global(X_test)

# sorted feature importance values and feature names
sorted_global_importance_values = global_explanation.get_ranked_global_values()
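In short, the explainer in evaluate.py now wraps the estimator directly rather than the pipeline's final step, so the `transformations=` argument is dropped. Below is a minimal, self-contained sketch of that usage pattern; the synthetic data, variable names, and the `interpret_community` import path are illustrative assumptions, not part of this commit.

```python
# Sketch: explain a plain scikit-learn model directly (no pipeline step to unwrap).
import pandas as pd
from sklearn.datasets import make_regression
from sklearn.ensemble import RandomForestRegressor
from interpret_community import TabularExplainer  # assumed import path (interpret-community package)

# Illustrative stand-ins for X_train / X_test and the trained model.
X, y = make_regression(n_samples=200, n_features=4, random_state=0)
X = pd.DataFrame(X, columns=[f"f{i}" for i in range(4)])
model = RandomForestRegressor(random_state=0).fit(X, y)

# The model itself is passed, so no `transformations=` argument is needed.
tabular_explainer = TabularExplainer(model,
                                     initialization_examples=X,
                                     features=list(X.columns))

# Global feature importance, ranked by importance value.
global_explanation = tabular_explainer.explain_global(X)
print(global_explanation.get_ranked_global_names())
print(global_explanation.get_ranked_global_values())
```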
34 changes: 22 additions & 12 deletions classical/aml-cli-v2/data-science/src/train.py
@@ -131,15 +131,25 @@ def main():

# append regressor to preprocessing pipeline.
# now we have a full prediction pipeline.
-pipeline = Pipeline(steps=[('preprocessor', preprocessor),
-                           ('regressor', RandomForestRegressor(
-                               n_estimators = args.regressor__n_estimators,
-                               bootstrap = args.regressor__bootstrap,
-                               max_depth = args.regressor__max_depth,
-                               max_features = args.regressor__max_features,
-                               min_samples_leaf = args.regressor__min_samples_leaf,
-                               min_samples_split = args.regressor__min_samples_split,
-                               random_state=0))])

+#model = Pipeline(steps=[('preprocessor', preprocessor),
+#                  ('regressor', RandomForestRegressor(
+#                      n_estimators = args.regressor__n_estimators,
+#                      bootstrap = args.regressor__bootstrap,
+#                      max_depth = args.regressor__max_depth,
+#                      max_features = args.regressor__max_features,
+#                      min_samples_leaf = args.regressor__min_samples_leaf,
+#                      min_samples_split = args.regressor__min_samples_split,
+#                      random_state=0))])


+model = RandomForestRegressor(n_estimators = args.regressor__n_estimators,
+                              bootstrap = args.regressor__bootstrap,
+                              max_depth = args.regressor__max_depth,
+                              max_features = args.regressor__max_features,
+                              min_samples_leaf = args.regressor__min_samples_leaf,
+                              min_samples_split = args.regressor__min_samples_split,
+                              random_state=0)

mlflow.log_param("model", "RandomForestRegressor")
mlflow.log_param("n_estimators", args.regressor__n_estimators)
@@ -149,10 +159,10 @@ def main():
mlflow.log_param("min_samples_leaf", args.regressor__min_samples_leaf)
mlflow.log_param("min_samples_split", args.regressor__min_samples_split)

-pipeline.fit(X_train, y_train)
+model.fit(X_train, y_train)

# Predict using the Regression Model
-yhat_train = pipeline.predict(X_train)
+yhat_train = model.predict(X_train)

# Evaluate Regression performance with the train set
r2 = r2_score(y_train, yhat_train)
@@ -174,7 +184,7 @@ def main():
mlflow.log_artifact("regression_results.png")

# Save the model
-pickle.dump(pipeline, open((Path(args.model_output) / "model.pkl"), "wb"))
+pickle.dump(model, open((Path(args.model_output) / "model.pkl"), "wb"))

if __name__ == "__main__":
main()
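For reference, a minimal sketch of what the reworked train.py path amounts to: the regressor is fit and pickled directly, which means `X_train` has to arrive already numeric/preprocessed, since no preprocessor is bundled into the saved artifact. The synthetic data, placeholder hyperparameter values, and output path below are illustrative assumptions, not part of this commit.

```python
# Sketch: plain RandomForestRegressor instead of Pipeline(preprocessor, regressor).
import pickle
from pathlib import Path

import numpy as np
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import r2_score

rng = np.random.default_rng(0)
X_train = rng.normal(size=(200, 5))        # stand-in for already-preprocessed features
y_train = X_train @ rng.normal(size=5) + rng.normal(scale=0.1, size=200)

# Placeholder values standing in for the script's args.regressor__* CLI arguments.
model = RandomForestRegressor(n_estimators=100,
                              bootstrap=True,
                              max_depth=None,
                              max_features=1.0,
                              min_samples_leaf=1,
                              min_samples_split=2,
                              random_state=0)
model.fit(X_train, y_train)
print("train R2:", r2_score(y_train, model.predict(X_train)))

# Saved the same way the script does: a pickled estimator rather than a pickled pipeline.
model_output = Path(".")                   # hypothetical stand-in for args.model_output
pickle.dump(model, open(model_output / "model.pkl", "wb"))
```

One consequence of this design choice is that whatever produces `X_train` (and, later, scoring inputs) must apply the same preprocessing separately, since it is no longer captured inside the serialized model.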
