From b938f9cc5b556795de397d999247667ef713c9ab Mon Sep 17 00:00:00 2001
From: Maggie Mhanna
Date: Fri, 3 Jun 2022 12:44:00 +0000
Subject: [PATCH] using scikit-learn model instead of scikit-learn pipeline

---
 .../aml-cli-v2/data-science/src/evaluate.py |  9 +++--
 .../aml-cli-v2/data-science/src/train.py    | 34 ++++++++++++-------
 2 files changed, 26 insertions(+), 17 deletions(-)

diff --git a/classical/aml-cli-v2/data-science/src/evaluate.py b/classical/aml-cli-v2/data-science/src/evaluate.py
index af772fd..e979f1a 100644
--- a/classical/aml-cli-v2/data-science/src/evaluate.py
+++ b/classical/aml-cli-v2/data-science/src/evaluate.py
@@ -205,18 +205,17 @@ def main():
 
     # -------------------- Explainability ------------------- #
 
-    tree_explainer = TabularExplainer(model.steps[-1][1],
+    tabular_explainer = TabularExplainer(model,
                                        initialization_examples=X_train,
-                                       features=X_train.columns,
-                                       transformations=model.steps[0][1])
+                                       features=X_train.columns)
 
     # save explainer
-    #joblib.dump(tree_explainer, os.path.join(tree_explainer, "explainer"))
+    #joblib.dump(tabular_explainer, os.path.join(tabular_explainer, "explainer"))
 
     # find global explanations for feature importance
     # you can use the training data or the test data here,
     # but test data would allow you to use Explanation Exploration
-    global_explanation = tree_explainer.explain_global(X_test)
+    global_explanation = tabular_explainer.explain_global(X_test)
 
     # sorted feature importance values and feature names
     sorted_global_importance_values = global_explanation.get_ranked_global_values()
diff --git a/classical/aml-cli-v2/data-science/src/train.py b/classical/aml-cli-v2/data-science/src/train.py
index ee82304..391517a 100644
--- a/classical/aml-cli-v2/data-science/src/train.py
+++ b/classical/aml-cli-v2/data-science/src/train.py
@@ -131,15 +131,25 @@ def main():
 
     # append regressor to preprocessing pipeline.
     # now we have a full prediction pipeline.
-    pipeline = Pipeline(steps=[('preprocessor', preprocessor),
-                               ('regressor', RandomForestRegressor(
-                                   n_estimators = args.regressor__n_estimators,
-                                   bootstrap = args.regressor__bootstrap,
-                                   max_depth = args.regressor__max_depth,
-                                   max_features = args.regressor__max_features,
-                                   min_samples_leaf = args.regressor__min_samples_leaf,
-                                   min_samples_split = args.regressor__min_samples_split,
-                                   random_state=0))])
+
+    #model = Pipeline(steps=[('preprocessor', preprocessor),
+    #                        ('regressor', RandomForestRegressor(
+    #                            n_estimators = args.regressor__n_estimators,
+    #                            bootstrap = args.regressor__bootstrap,
+    #                            max_depth = args.regressor__max_depth,
+    #                            max_features = args.regressor__max_features,
+    #                            min_samples_leaf = args.regressor__min_samples_leaf,
+    #                            min_samples_split = args.regressor__min_samples_split,
+    #                            random_state=0))])
+
+
+    model = RandomForestRegressor(n_estimators = args.regressor__n_estimators,
+                                  bootstrap = args.regressor__bootstrap,
+                                  max_depth = args.regressor__max_depth,
+                                  max_features = args.regressor__max_features,
+                                  min_samples_leaf = args.regressor__min_samples_leaf,
+                                  min_samples_split = args.regressor__min_samples_split,
+                                  random_state=0)
 
     mlflow.log_param("model", "RandomForestRegressor")
     mlflow.log_param("n_estimators", args.regressor__n_estimators)
@@ -149,10 +159,10 @@ def main():
     mlflow.log_param("min_samples_leaf", args.regressor__min_samples_leaf)
     mlflow.log_param("min_samples_split", args.regressor__min_samples_split)
 
-    pipeline.fit(X_train, y_train)
+    model.fit(X_train, y_train)
 
     # Predict using the Regression Model
-    yhat_train = pipeline.predict(X_train)
+    yhat_train = model.predict(X_train)
 
     # Evaluate Regression performance with the train set
     r2 = r2_score(y_train, yhat_train)
@@ -174,7 +184,7 @@ def main():
     mlflow.log_artifact("regression_results.png")
 
     # Save the model
-    pickle.dump(pipeline, open((Path(args.model_output) / "model.pkl"), "wb"))
+    pickle.dump(model, open((Path(args.model_output) / "model.pkl"), "wb"))
 
 if __name__ == "__main__":
     main()
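Reviewer note on the train.py hunk: with the Pipeline wrapper removed, model.fit(X_train, y_train) now receives the features as-is, so the patch implicitly assumes X_train is already numeric/preprocessed upstream. A minimal standalone sketch of the resulting flow; the synthetic data, hard-coded hyperparameters, and the model_output path are illustrative stand-ins, not from the patch:

    import pickle
    from pathlib import Path

    import numpy as np
    from sklearn.ensemble import RandomForestRegressor
    from sklearn.metrics import r2_score

    # synthetic stand-in for an already-preprocessed training set
    rng = np.random.default_rng(0)
    X_train = rng.normal(size=(200, 5))
    y_train = X_train @ rng.normal(size=5) + rng.normal(scale=0.1, size=200)

    # bare estimator, mirroring the patched train.py (hyperparameters
    # hard-coded here instead of coming from argparse)
    model = RandomForestRegressor(n_estimators=100,
                                  bootstrap=True,
                                  max_depth=None,
                                  max_features=1.0,
                                  min_samples_leaf=1,
                                  min_samples_split=2,
                                  random_state=0)
    model.fit(X_train, y_train)

    # predict and score on the train set, as the script does
    yhat_train = model.predict(X_train)
    print("train R2:", r2_score(y_train, yhat_train))

    # persist the fitted estimator the same way the script does
    model_output = Path(".")  # stand-in for args.model_output
    pickle.dump(model, open(model_output / "model.pkl", "wb"))

One consequence of this change worth noting: the pickled model.pkl no longer carries the preprocessor, so whatever transformation produced X_train must be applied to inference inputs separately.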
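Reviewer note on the evaluate.py hunk: with a bare estimator there is no model.steps to unpack, so TabularExplainer receives the model directly and the transformations= argument is dropped. A minimal sketch of the patched usage; the hunk does not show where TabularExplainer is imported from, so the interpret_community import path below is an assumption, and the data is synthetic:

    import numpy as np
    import pandas as pd
    from interpret_community import TabularExplainer  # assumed import path
    from sklearn.ensemble import RandomForestRegressor

    # synthetic stand-ins for the preprocessed train/test splits
    rng = np.random.default_rng(0)
    cols = [f"f{i}" for i in range(5)]
    X_train = pd.DataFrame(rng.normal(size=(200, 5)), columns=cols)
    X_test = pd.DataFrame(rng.normal(size=(50, 5)), columns=cols)
    y_train = X_train.to_numpy() @ rng.normal(size=5)

    model = RandomForestRegressor(n_estimators=50, random_state=0)
    model.fit(X_train, y_train)

    # explain the bare model directly; no transformations= is needed because
    # the features fed to the explainer are already the model's input space
    tabular_explainer = TabularExplainer(model,
                                         initialization_examples=X_train,
                                         features=X_train.columns)

    # global feature importance, computed on the test set as in evaluate.py
    global_explanation = tabular_explainer.explain_global(X_test)
    print(global_explanation.get_ranked_global_names())
    print(global_explanation.get_ranked_global_values())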