diff --git a/examples/model_selection/RF_vs_SPORF_random_search.ipynb b/examples/model_selection/RF_vs_SPORF_random_search.ipynb new file mode 100644 index 0000000000000..e27c3c20b64fa --- /dev/null +++ b/examples/model_selection/RF_vs_SPORF_random_search.ipynb @@ -0,0 +1,326 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Demonstration of randomized search to compare classifier performance\n", + "============================================================================\n", + "An important step in classifier performance comparison is hyperparameter \n", + "optimization. Here, we specify the classifier models we want to tune and a \n", + "dictionary of hyperparameter ranges (preferably similar for fairness in \n", + "comparison) for each classifier. Then, we find the optimal hyperparameters \n", + "through a function that uses RandomizedSearchCV and refit the optimized \n", + "models to obtain accuracies. We can see clearly in the plot that the \n", + "optimized models perform better than or similar to the default parameter models. On the \n", + "dataset we use in this example, the car dataset from OpenML-CC18, SPORF also \n", + "performs better than RF overall. 
\n" + ] + }, + { + "cell_type": "code", + "execution_count": 1, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Automatically created module for IPython interactive environment\n" + ] + } + ], + "source": [ + "print(__doc__)\n", + "\n", + "from sklearn.model_selection import RandomizedSearchCV\n", + "\n", + "import pandas as pd\n", + "import numpy as np\n", + "import math\n", + "from rerf.rerfClassifier import rerfClassifier\n", + "from sklearn.ensemble import RandomForestClassifier\n", + "from sklearn.datasets import fetch_openml\n", + "from sklearn.model_selection import train_test_split\n", + "from sklearn import metrics\n", + "\n", + "from warnings import simplefilter\n", + "simplefilter(action=\"ignore\", category=FutureWarning)\n", + "from warnings import simplefilter\n", + "simplefilter(action=\"ignore\", category=FutureWarning)\n", + "\n", + "import matplotlib\n", + "import matplotlib.pyplot as plt" + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "metadata": {}, + "outputs": [], + "source": [ + "def hyperparameter_optimization_random(X, y, *argv):\n", + " \"\"\"\n", + " Given a classifier and a dictionary of hyperparameters, find optimal hyperparameters using RandomizedSearchCV.\n", + "\n", + " Parameters\n", + " ----------\n", + " X : numpy.ndarray\n", + " Input data, shape (n_samples, n_features)\n", + " y : numpy.ndarray\n", + " Output data, shape (n_samples, n_outputs)\n", + " *argv : list of tuples (classifier, hyperparameters)\n", + " List of (classifier, hyperparameters) tuples:\n", + "\n", + " classifier : sklearn-compliant classifier\n", + " For example sklearn.ensemble.RandomForestRegressor, rerf.rerfClassifier, etc\n", + " hyperparameters : dictionary of hyperparameter ranges\n", + " See https://scikit-learn.org/stable/modules/generated/sklearn.model_selection.RandomizedSearchCV.html.\n", + "\n", + " Returns\n", + " -------\n", + " clf_best_params : dictionary\n", + " Dictionary of 
best hyperparameters\n", + " \"\"\"\n", + "\n", + " clf_best_params = {}\n", + "\n", + " # Iterate over all (classifier, hyperparameters) pairs\n", + " for clf, params in argv:\n", + "\n", + " # Run randomized search\n", + " n_iter_search = 10\n", + " random_search = RandomizedSearchCV(\n", + " clf, param_distributions=params, n_iter=n_iter_search, cv=10, iid=False\n", + " )\n", + " random_search.fit(X, y)\n", + "\n", + " # Save results\n", + " clf_best_params[clf] = random_search.best_params_\n", + "\n", + " return clf_best_params" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Building classifiers and specifying parameter ranges to sample from\n" + ] + }, + { + "cell_type": "code", + "execution_count": 3, + "metadata": {}, + "outputs": [], + "source": [ + "# get some data\n", + "X, y = fetch_openml(data_id=40975, return_X_y=True, as_frame=True) #car dataset\n", + "y = pd.factorize(y)[0]\n", + "X = X.apply(lambda x: pd.factorize(x)[0])\n", + "n_features = np.shape(X)[1]\n", + "n_samples = np.shape(X)[0]\n", + "\n", + "# build a classifier\n", + "rerf = rerfClassifier()\n", + "\n", + "# specify max_depth and min_sample_splits ranges\n", + "max_depth_array_rerf = (np.unique(np.round((np.linspace(2, n_samples, 10))))).astype(\n", + " int\n", + ")\n", + "max_depth_range_rerf = np.append(max_depth_array_rerf, None)\n", + "\n", + "min_sample_splits_range_rerf = (\n", + " np.unique(\n", + " np.round((np.arange(1, math.log(n_samples), (math.log(n_samples) - 2) / 10)))\n", + " )\n", + ").astype(int)\n", + "\n", + "# specify parameters and distributions to sample from\n", + "rerf_param_dict = {\n", + " \"n_estimators\": np.arange(50, 550, 50),\n", + " \"max_depth\": max_depth_range_rerf,\n", + " \"min_samples_split\": min_sample_splits_range_rerf,\n", + " \"feature_combinations\": [1, 2, 3, 4, 5],\n", + " \"max_features\": [\"sqrt\", \"log2\", None, n_features ** 2],\n", + "}\n", + "\n", + "# build another classifier\n", + "rf = 
RandomForestClassifier()\n", + "\n", + "# specify max_depth and min_sample_splits ranges\n", + "max_depth_array_rf = (np.unique(np.round((np.linspace(2, n_samples, 10))))).astype(int)\n", + "max_depth_range_rf = np.append(max_depth_array_rf, None)\n", + "\n", + "min_sample_splits_range_rf = (\n", + " np.unique(\n", + " np.round((np.arange(2, math.log(n_samples), (math.log(n_samples) - 2) / 10)))\n", + " )\n", + ").astype(int)\n", + "\n", + "# specify parameters and distributions to sample from\n", + "rf_param_dict = {\n", + " \"n_estimators\": np.arange(50, 550, 50),\n", + " \"max_depth\": max_depth_range_rf,\n", + " \"min_samples_split\": min_sample_splits_range_rf,\n", + " \"max_features\": [\"sqrt\", \"log2\", None],\n", + "}" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Obtaining best parameters dictionary and refitting" + ] + }, + { + "cell_type": "code", + "execution_count": 4, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "{rerfClassifier(feature_combinations=1.5, image_height=None, image_width=None,\n", + " max_depth=None, max_features='auto', min_samples_split=1,\n", + " n_estimators=500, n_jobs=None, oob_score=False,\n", + " patch_height_max=None, patch_height_min=1, patch_width_max=None,\n", + " patch_width_min=1, projection_matrix='RerF', random_state=None): {'n_estimators': 300, 'min_samples_split': 4, 'max_features': 36, 'max_depth': 577, 'feature_combinations': 2}, RandomForestClassifier(bootstrap=True, ccp_alpha=0.0, class_weight=None,\n", + " criterion='gini', max_depth=None, max_features='auto',\n", + " max_leaf_nodes=None, max_samples=None,\n", + " min_impurity_decrease=0.0, min_impurity_split=None,\n", + " min_samples_leaf=1, min_samples_split=2,\n", + " min_weight_fraction_leaf=0.0, n_estimators=100,\n", + " n_jobs=None, oob_score=False, random_state=None,\n", + " verbose=0, warm_start=False): {'n_estimators': 200, 'min_samples_split': 6, 'max_features': None, 
'max_depth': 1153}}\n" + ] + } + ], + "source": [ + "best_params = hyperparameter_optimization_random(\n", + " X, y, (rerf, rerf_param_dict), (rf, rf_param_dict)\n", + ")\n", + "print(best_params)\n", + "\n", + "# extract values from dict - seperate each classifier's param dict\n", + "keys, values = zip(*best_params.items())\n", + "\n", + "# train test split\n", + "X_train, X_test, y_train, y_test = train_test_split(\n", + " X, y, test_size=0.33, random_state=42\n", + ")\n", + "\n", + "# get accuracies of optimized and default models\n", + "rerf_opti = rerfClassifier(**values[0])\n", + "rerf_opti.fit(X_train, y_train)\n", + "rerf_pred_opti = rerf_opti.predict(X_test)\n", + "rerf_acc_opti = metrics.accuracy_score(y_test, rerf_pred_opti)\n", + "\n", + "rerf_default = rerfClassifier()\n", + "rerf_default.fit(X_train, y_train)\n", + "rerf_pred_default = rerf_default.predict(X_test)\n", + "rerf_acc_default = metrics.accuracy_score(y_test, rerf_pred_default)\n", + "\n", + "rf_opti = RandomForestClassifier(**values[1])\n", + "rf_opti.fit(X_train, y_train)\n", + "rf_pred_opti = rf_opti.predict(X_test)\n", + "rf_acc_opti = metrics.accuracy_score(y_test, rf_pred_opti)\n", + "\n", + "rf_default = RandomForestClassifier()\n", + "rf_default.fit(X_train, y_train)\n", + "rf_pred_default = rf_default.predict(X_test)\n", + "rf_acc_default = metrics.accuracy_score(y_test, rf_pred_default)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Plotting the result" + ] + }, + { + "cell_type": "code", + "execution_count": 5, + "metadata": {}, + "outputs": [ + { + "data": { + "image/png": "\n", + "text/plain": [ + "
" + ] + }, + "metadata": { + "needs_background": "light" + }, + "output_type": "display_data" + } + ], + "source": [ + "labels = [\"SPORF\", \"RF\"]\n", + "rerf_acc = [rerf_acc_opti, rerf_acc_default]\n", + "rf_acc = [rf_acc_opti, rf_acc_default]\n", + "\n", + "x = np.arange(len(labels))\n", + "width = 0.35\n", + "\n", + "fig, ax = plt.subplots()\n", + "rects1 = ax.bar(x - width / 2, rerf_acc, width, label=\"Optimized\")\n", + "rects2 = ax.bar(x + width / 2, rf_acc, width, label=\"Default\")\n", + "\n", + "# Add some text for labels, title and custom x-axis tick labels, etc.\n", + "ax.set_ylabel(\"Accuracy\")\n", + "ax.set_title(\"Accuracy of Optimized/Default SPORF and RF Models on car Dataset\")\n", + "ax.set_xticks(x)\n", + "ax.set_xticklabels(labels)\n", + "ax.legend()\n", + "\n", + "\n", + "def autolabel(rects):\n", + " \"\"\"Attach a text label above each bar in *rects*, displaying its height.\"\"\"\n", + " for rect in rects:\n", + " height = float(\"%.3f\" % (rect.get_height()))\n", + " ax.annotate(\n", + " \"{}\".format(height),\n", + " xy=(rect.get_x() + rect.get_width() / 2, height),\n", + " xytext=(0, 3), # 3 points vertical offset\n", + " textcoords=\"offset points\",\n", + " ha=\"center\",\n", + " va=\"bottom\",\n", + " )\n", + "\n", + "\n", + "autolabel(rects1)\n", + "autolabel(rects2)\n", + "fig.tight_layout()\n", + "plt.ylim((0.9, 1))\n", + "\n", + "plt.show()" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.7.5" + } + }, + "nbformat": 4, + "nbformat_minor": 2 +} diff --git a/examples/model_selection/RF_vs_SPORF_random_search.py b/examples/model_selection/RF_vs_SPORF_random_search.py new file mode 100644 index 
"""
============================================================================
Demonstration of randomized search to compare classifier performance
============================================================================
An important step in classifier performance comparison is hyperparameter
optimization. Here, we specify the classifier models we want to tune and a
dictionary of hyperparameter ranges (preferably similar for fairness in
comparison) for each classifier. Then, we find the optimal hyperparameters
on the training split through a function that uses RandomizedSearchCV and
refit the optimized models to obtain accuracies on the held-out test split.
We can see clearly in the plot that the optimized models perform better than
or similar to the default parameter models. On the dataset we use in this
example, the car dataset from OpenML-CC18, SPORF also performs better than RF
overall.
"""
print(__doc__)

from sklearn.model_selection import RandomizedSearchCV

import pandas as pd
import numpy as np
import math
from rerf.rerfClassifier import rerfClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.datasets import fetch_openml
from sklearn.model_selection import train_test_split
from sklearn import metrics
from warnings import simplefilter

# Silence FutureWarning noise so the example output stays readable.
simplefilter(action="ignore", category=FutureWarning)

import matplotlib
import matplotlib.pyplot as plt


def hyperparameter_optimization_random(X, y, *argv, n_iter=10, cv=10):
    """
    Find the best hyperparameters of each classifier with RandomizedSearchCV.

    Parameters
    ----------
    X : array-like, shape (n_samples, n_features)
        Input data.
    y : array-like, shape (n_samples,) or (n_samples, n_outputs)
        Target values.
    *argv : tuples (classifier, hyperparameters)
        One (classifier, hyperparameters) tuple per model to tune:

        classifier : sklearn-compliant classifier
            For example sklearn.ensemble.RandomForestClassifier,
            rerf.rerfClassifier, etc. It must be hashable because it is used
            as a key of the returned dictionary.
        hyperparameters : dictionary of hyperparameter ranges
            See https://scikit-learn.org/stable/modules/generated/sklearn.model_selection.RandomizedSearchCV.html.
    n_iter : int, default=10
        Number of parameter settings sampled for each classifier.
    cv : int or cross-validation generator, default=10
        Cross-validation strategy forwarded to RandomizedSearchCV.

    Returns
    -------
    clf_best_params : dictionary
        Maps each classifier object that was passed in to the
        ``best_params_`` dictionary found for it.
    """
    clf_best_params = {}

    # Iterate over all (classifier, hyperparameters) pairs
    for clf, params in argv:
        # ``iid`` is deliberately not passed: it was deprecated in
        # scikit-learn 0.22 and removed in 0.24, and the behaviour without it
        # (since 0.22) is the same as the former ``iid=False``.
        random_search = RandomizedSearchCV(
            clf, param_distributions=params, n_iter=n_iter, cv=cv
        )
        random_search.fit(X, y)

        # Save results, keyed by the (unfitted) classifier that was passed in
        clf_best_params[clf] = random_search.best_params_

    return clf_best_params


def holdout_accuracy(clf, X_train, X_test, y_train, y_test):
    """Fit *clf* on the training split and return its accuracy on the test split."""
    clf.fit(X_train, y_train)
    return metrics.accuracy_score(y_test, clf.predict(X_test))


def autolabel(rects, ax=None):
    """
    Attach a text label above each bar in *rects*, displaying its height.

    Parameters
    ----------
    rects : iterable of bar patches
        The bars returned by ``Axes.bar``.
    ax : matplotlib Axes, optional
        Axes to annotate. Defaults to the current axes.
    """
    if ax is None:
        ax = plt.gca()
    for rect in rects:
        # Round to three decimals so the labels stay short
        height = round(float(rect.get_height()), 3)
        ax.annotate(
            str(height),
            xy=(rect.get_x() + rect.get_width() / 2, height),
            xytext=(0, 3),  # 3 points vertical offset
            textcoords="offset points",
            ha="center",
            va="bottom",
        )


###############################################################################
# Building classifiers and specifying parameter ranges to sample from
# -------------------------------------------------------------------
#

# get some data
X, y = fetch_openml(data_id=40975, return_X_y=True, as_frame=True)  # car dataset
# encode the labels and every feature column as integer codes
y = pd.factorize(y)[0]
X = X.apply(lambda x: pd.factorize(x)[0])
n_samples, n_features = np.shape(X)

# build a classifier (its results are labelled "SPORF" in the plot)
rerf = rerfClassifier()

# specify max_depth and min_sample_splits ranges
max_depth_array_rerf = (np.unique(np.round((np.linspace(2, n_samples, 10))))).astype(
    int
)
# None is appended as an extra candidate value for max_depth
max_depth_range_rerf = np.append(max_depth_array_rerf, None)

min_sample_splits_range_rerf = (
    np.unique(
        np.round((np.arange(1, math.log(n_samples), (math.log(n_samples) - 2) / 10)))
    )
).astype(int)

# specify parameters and distributions to sample from
rerf_param_dict = {
    "n_estimators": np.arange(50, 550, 50),
    "max_depth": max_depth_range_rerf,
    "min_samples_split": min_sample_splits_range_rerf,
    "feature_combinations": [1, 2, 3, 4, 5],
    "max_features": ["sqrt", "log2", None, n_features ** 2],
}

# build another classifier
rf = RandomForestClassifier()

# specify max_depth and min_sample_splits ranges
max_depth_array_rf = (np.unique(np.round((np.linspace(2, n_samples, 10))))).astype(int)
max_depth_range_rf = np.append(max_depth_array_rf, None)

# starts at 2 here, whereas the range built for rerf above starts at 1
min_sample_splits_range_rf = (
    np.unique(
        np.round((np.arange(2, math.log(n_samples), (math.log(n_samples) - 2) / 10)))
    )
).astype(int)

# specify parameters and distributions to sample from
rf_param_dict = {
    "n_estimators": np.arange(50, 550, 50),
    "max_depth": max_depth_range_rf,
    "min_samples_split": min_sample_splits_range_rf,
    "max_features": ["sqrt", "log2", None],
}

###############################################################################
# Obtaining best parameters dictionary and refitting
# --------------------------------------------------
#

# train test split -- done BEFORE the search so that the test samples never
# influence which hyperparameters are selected
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.33, random_state=42
)

best_params = hyperparameter_optimization_random(
    X_train, y_train, (rerf, rerf_param_dict), (rf, rf_param_dict)
)
print(best_params)

# separate each classifier's param dict by looking it up with the classifier
# it was tuned for, instead of relying on the dictionary's ordering
rerf_best_params = best_params[rerf]
rf_best_params = best_params[rf]

# get accuracies of optimized and default models
rerf_acc_opti = holdout_accuracy(
    rerfClassifier(**rerf_best_params), X_train, X_test, y_train, y_test
)
rerf_acc_default = holdout_accuracy(
    rerfClassifier(), X_train, X_test, y_train, y_test
)

rf_acc_opti = holdout_accuracy(
    RandomForestClassifier(**rf_best_params), X_train, X_test, y_train, y_test
)
rf_acc_default = holdout_accuracy(
    RandomForestClassifier(), X_train, X_test, y_train, y_test
)

###############################################################################
# Plotting the result
# -------------------

# One x position per classifier and one bar series per configuration, so that
# the tick labels and the legend entries describe the bars they belong to.
labels = ["SPORF", "RF"]
optimized_acc = [rerf_acc_opti, rf_acc_opti]
default_acc = [rerf_acc_default, rf_acc_default]

x = np.arange(len(labels))
width = 0.35

fig, ax = plt.subplots()
rects1 = ax.bar(x - width / 2, optimized_acc, width, label="Optimized")
rects2 = ax.bar(x + width / 2, default_acc, width, label="Default")

# Add some text for labels, title and custom x-axis tick labels, etc.
ax.set_ylabel("Accuracy")
ax.set_title("Accuracy of Optimized/Default SPORF and RF Models on car Dataset")
ax.set_xticks(x)
ax.set_xticklabels(labels)
ax.legend()

autolabel(rects1, ax)
autolabel(rects2, ax)
fig.tight_layout()

# Zoom in on the top of the accuracy range, but never cut off a bar that
# falls below 0.9.
y_lower = min(0.9, min(optimized_acc + default_acc) - 0.05)
ax.set_ylim((max(y_lower, 0.0), 1))

plt.show()