From 3d3d00d61a2fcba90ceea7a0493c877ebcbfeb39 Mon Sep 17 00:00:00 2001 From: jvmead Date: Thu, 31 Oct 2019 10:59:46 +0000 Subject: [PATCH 01/54] Adding basci BDT hyperparameter tuning GridScanCV Will need to remove existing plots and give it a test run in case anything has deprecated --- advanced-python/Hyperparameter tuning | 666 ++++++++++++++++++++++++++ 1 file changed, 666 insertions(+) create mode 100644 advanced-python/Hyperparameter tuning diff --git a/advanced-python/Hyperparameter tuning b/advanced-python/Hyperparameter tuning new file mode 100644 index 00000000..d4fc1fa0 --- /dev/null +++ b/advanced-python/Hyperparameter tuning @@ -0,0 +1,666 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Hyperparameter tuning" + ] + }, + { + "cell_type": "code", + "execution_count": 1, + "metadata": {}, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "/cvmfs/sft.cern.ch/lcg/views/LCG_94python3/x86_64-centos7-gcc7-opt/lib/python3.6/site-packages/sklearn/cross_validation.py:41: DeprecationWarning: This module was deprecated in version 0.18 in favor of the model_selection module into which all the refactored classes and functions are moved. Also note that the interface of the new CV iterators are different from that of this module. This module will be removed in 0.20.\n", + " \"This module will be removed in 0.20.\", DeprecationWarning)\n", + "/cvmfs/sft.cern.ch/lcg/views/LCG_94python3/x86_64-centos7-gcc7-opt/lib/python3.6/site-packages/sklearn/grid_search.py:42: DeprecationWarning: This module was deprecated in version 0.18 in favor of the model_selection module into which all the refactored classes and functions are moved. 
This module will be removed in 0.20.\n", + " DeprecationWarning)\n" + ] + } + ], + "source": [ + "from matplotlib import pyplot as plt\n", + "import uproot\n", + "\n", + "import numpy as np\n", + "import pandas as pd\n", + "import xgboost as xgb\n", + "\n", + "from xgboost.sklearn import XGBClassifier\n", + "from sklearn.ensemble import AdaBoostClassifier, GradientBoostingClassifier\n", + "\n", + "from sklearn import cross_validation, metrics\n", + "from sklearn.metrics import roc_curve, auc\n", + "\n", + "from sklearn.model_selection import KFold, cross_validate, cross_val_score\n", + "from sklearn.grid_search import GridSearchCV\n", + "\n", + "# This gives us a special function for this lesson that lets you check how good your selection is\n", + "from python_lesson import check_truth" + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "metadata": {}, + "outputs": [], + "source": [ + "def plot_mass(df):\n", + " counts, bins, _ = plt.hist(df['Jpsi_M'], bins=100, range=[2.75, 3.5], histtype='step')\n", + " # You can also use LaTeX in the axis label\n", + " plt.xlabel('$J/\\\\psi$ mass [GeV]')\n", + " plt.xlim(bins[0], bins[-1])" + ] + }, + { + "cell_type": "code", + "execution_count": 3, + "metadata": {}, + "outputs": [], + "source": [ + "def plot_comparision(var, mc_df, bkg_df):\n", + " _, bins, _ = plt.hist(mc_df[var], bins=100, histtype='step', label='MC', density=1)\n", + " _, bins, _ = plt.hist(bkg_df[var], bins=bins, histtype='step', label='Background', density=1)\n", + " plt.xlabel(var)\n", + " plt.xlim(bins[0], bins[-1])\n", + " plt.legend(loc='best')" + ] + }, + { + "cell_type": "code", + "execution_count": 4, + "metadata": {}, + "outputs": [], + "source": [ + "def plot_roc(bdt, training_data, training_columns, label=None):\n", + " y_score = bdt.predict_proba(training_data[training_columns])[:,1]\n", + " fpr, tpr, thresholds = roc_curve(training_data['catagory'], y_score)\n", + " area = auc(fpr, tpr)\n", + "\n", + " plt.plot([0, 1], [0, 1], 
color='grey', linestyle='--')\n", + " if label:\n", + " plt.plot(fpr, tpr, label=f'{label} (area = {area:.2f})')\n", + " else:\n", + " plt.plot(fpr, tpr, label=f'ROC curve (area = {area:.2f})')\n", + " plt.xlim(0.0, 1.0)\n", + " plt.ylim(0.0, 1.0)\n", + " plt.xlabel('False Positive Rate')\n", + " plt.ylabel('True Positive Rate')\n", + " plt.legend(loc='lower right')\n", + " # We can make the plot look nicer by forcing the grid to be square\n", + " plt.gca().set_aspect('equal', adjustable='box')" + ] + }, + { + "cell_type": "code", + "execution_count": 5, + "metadata": {}, + "outputs": [], + "source": [ + "def plot_significance(bdt, training_data, training_columns, label=None):\n", + " y_score = bdt.predict_proba(training_data[training_columns])[:,1]\n", + " fpr, tpr, thresholds = roc_curve(training_data['catagory'], y_score)\n", + "\n", + " n_sig = 1200\n", + " n_bkg = 23000\n", + " S = n_sig*tpr\n", + " B = n_bkg*fpr\n", + " metric = S/np.sqrt(S+B)\n", + "\n", + " plt.plot(thresholds, metric, label=label)\n", + " plt.xlabel('BDT cut value')\n", + " plt.ylabel('$\\\\frac{S}{\\\\sqrt{S+B}}$')\n", + " plt.xlim(0, 1.0)\n", + "\n", + " optimal_cut = thresholds[np.argmax(metric)]\n", + " plt.axvline(optimal_cut, color='black', linestyle='--')" + ] + }, + { + "cell_type": "code", + "execution_count": 6, + "metadata": {}, + "outputs": [], + "source": [ + "#max_entries = 1000\n", + "data_df = uproot.open('/eos/user/l/lhcbsk/advanced-python/data/real_data.root')['DecayTree'].pandas.df()#entrystop=max_entries)\n", + "mc_df = uproot.open('/eos/user/l/lhcbsk/advanced-python/data/simulated_data.root')['DecayTree'].pandas.df()#entrystop=max_entries)\n", + "bkg_df = data_df.query('~(3.0 < Jpsi_M < 3.2)').copy()\n", + "\n", + "for df in [mc_df, data_df, bkg_df]:\n", + " df.eval('Jpsi_eta = arctanh(Jpsi_PZ/Jpsi_P)', inplace=True)\n", + " df.eval('mup_P = sqrt(mum_PX**2 + mum_PY**2 + mum_PZ**2)', inplace=True)\n", + " df.eval('mum_P = sqrt(mum_PX**2 + mum_PY**2 + mum_PZ**2)', 
inplace=True)\n", + "\n", + "bkg_df['catagory'] = 0 # Use 0 for background\n", + "mc_df['catagory'] = 1 # Use 1 for signal\n", + "training_data = pd.concat([bkg_df, mc_df], copy=True, ignore_index=True)\n", + "for df in [mc_df, bkg_df, data_df, training_data]:\n", + " df['IPdiff'] = np.abs(df['mum_PT'] - df['mup_PT'])\n", + " \n", + "training_columns = [\n", + " 'Jpsi_PT',\n", + " 'mup_PT', 'mup_eta', 'mup_ProbNNmu', 'mup_IP',\n", + " 'mum_PT', 'mum_eta', 'mum_ProbNNmu', 'mum_IP',\n", + "]" + ] + }, + { + "cell_type": "code", + "execution_count": 7, + "metadata": {}, + "outputs": [ + { + "data": { + "image/png": "iVBORw0KGgoAAAANSUhEUgAAAXwAAAEKCAYAAAARnO4WAAAABHNCSVQICAgIfAhkiAAAAAlwSFlzAAALEgAACxIB0t1+/AAAADl0RVh0U29mdHdhcmUAbWF0cGxvdGxpYiB2ZXJzaW9uIDIuMi4yLCBodHRwOi8vbWF0cGxvdGxpYi5vcmcvhp/UCwAAHWVJREFUeJzt3X+QFOW97/H312V1iRhUlpRcfrh4DicoLj9kDRBShivGq0jg3Dp4RCMGkwr+4saYREtFDdeUuWh+cgqNQmIQywN4jEaiGMsbNEbqqAEPUUTiJYq6SCliWCGwKvC9f/SwNsPMTu9sz0zP9OdVteXMdE/vQwufeeZ5nv62uTsiIlL7Dqt0A0REpDwU+CIiKaHAFxFJCQW+iEhKKPBFRFJCgS8ikhIKfBGRlFDgi4ikhAJfRCQlelTqFzc2NnpTU1Olfr2ISFVau3bte+7et5j3Vizwm5qaWLNmTaV+vYhIVTKzN4p9r4Z0RERSQoEvIpISCnwRkZSo2Bi+iCTfxx9/TGtrK+3t7ZVuSuo0NDQwYMAA6uvrYzumAl9E8mptbeWoo46iqakJM6t0c1LD3dm+fTutra0MHjw4tuNqSEdE8mpvb6dPnz4K+zIzM/r06RP7NysFvoh0SmFfGaU47wp8EZGU0Bi+iEQ2ft4qtuzYE9vx+h/dk9XXnt7pPmbGhRdeyL333gvA3r176devH2PGjOGRRx4B4LHHHuPGG29k9+7dHHHEEUycOJEf/ehHsbWzVijwJbefNkPbm8Hj3oPgqpcq2x5JhC079rB53jmxHa/p2kcL7nPkkUeyfv169uzZQ8+ePXniiSfo379/x/b169cze/ZsHn30UYYOHcq+ffu46667YmtjLdGQjuTW9ibMbQt+DgS/SIWcffbZPPpo8OGwdOlSzj///I5tt912G3PmzGHo0KEA1NXVcfnll1eknUlXMPDNrMHMnjezP5vZy2b2v3Psc4SZLTezTWb2nJk1laKxIpJO06dPZ9myZbS3t/Piiy8yZsyYjm3r169n9OjRFWxd9YgypPMhcLq77zKzeuAZM3vM3Z8N7fN14G/u/o9mNh24FTivBO2VSug9COb2Pvi5hnikjIYPH87mzZtZunQpkyZNqnRzqlbBwHd3B3ZlntZnfjxrt6nA3MzjB4AFZmaZ90q1yw73cPiLlMmUKVP47ne/y1NPPcX27ds7Xh82bBhr165lxIgRFWxddYg0hm9mdWa2DngXeMLdn8vapT/wFoC77wXagD45jjPLzNaY2Zpt27Z1r+Uikipf+9rXuOmmm2hubj7o9auvvpof/OAHvPrqqwDs37+fO++8sxJNTLxIq3TcfR8w0syOBh4ys5P
dfX1ol1xXCBzSu3f3hcBCgJaWFvX+q1V4iEfDO6nS/+iekVbWdOV4UQ0YMIArr7zykNeHDx/Oz372M84//3x2796NmXHOOfGtJKolXVqW6e47zOwp4CwgHPitwECg1cx6AL2B9+NqpJRJ9lLMfMIBr+GdVCm0Zr4Udu3adchrEyZMYMKECR3PJ0+ezOTJk8vYqupUMPDNrC/wcSbsewJnEEzKhq0Avgr8JzANWKXx+yp0YCmmiNSkKD38fsA9ZlZHMOZ/v7s/YmY3A2vcfQXwS+BeM9tE0LOfXrIWS3zCPXrovFcvIlUvyiqdF4FROV6/KfS4HTg33qZJyalHL5IqutJWRCQlFPgiIimhwBcRSQlVy0ybqEsvRXLJnujvrgjXcdTV1dHc3Iy7U1dXx4IFC/j85z/f5V81c+ZMJk+ezLRp04ptbcn06tUr5/LTuCnw00YTtdIdcf/9iXAdR8+ePVm3bh0Ajz/+ONdddx1/+MMf4mtDBHv37qVHj+qPSw3pSPccuOp2bu+g9ydSQh988AHHHHMMEFyQNXHiRE455RSam5t5+OGHO/ZbsmQJw4cPZ8SIEcyYMeOQ49x4443MnDmT/fv3s3LlSoYOHcro0aP55je/2XEB19y5c5kxYwbjx49nxowZtLe3c/HFF9Pc3MyoUaN48sknAVi8eDGzZ8/uOPbkyZN56qmngKDnPmfOHEaMGMHYsWN55513AHj99dcZN24czc3N3HDDDSU5V7lU/0eWVJauupUS27NnDyNHjqS9vZ2tW7eyatUqABoaGnjooYf49Kc/zXvvvcfYsWOZMmUKGzZs4JZbbmH16tU0Njby/vsHX/R/zTXX0NbWxq9+9Ss+/PBDLrnkEp5++mkGDx58UJ19gA0bNvDMM8/Qs2dPfvzjHwPw0ksvsXHjRs4888yO+j35/P3vf2fs2LHccsstXHPNNSxatIgbbriBK6+8kssuu4yLLrqI22+/Pcaz1Tn18EUk0Q4M6WzcuJHf/e53XHTRRbg77s7111/P8OHDOeOMM9iyZQvvvPMOq1atYtq0aTQ2NgJw7LHHdhzr+9//Pjt27OCuu+7CzNi4cSMnnHACgwcPBjgk8KdMmULPnkG9n2eeeabj28LQoUM5/vjjCwb+4Ycf3vGNYfTo0WzevBmA1atXd/yuXN9ASkU9/FqU6wpaFTiTGjBu3Djee+89tm3bxsqVK9m2bRtr166lvr6epqYm2tvbcXfMctVzhFNPPZW1a9fy/vvvc+yxx1KoAsyRRx7Z8Tjfvj169GD//v0dz9vb2zse19fXd7Slrq6OvXv3dmzL18ZSUg+/FoVvT6hbFEoN2bhxI/v27aNPnz60tbXxmc98hvr6ep588kneeOMNACZOnMj999/fUTM/PKRz1llnce2113LOOeewc+dOhg4dymuvvdbR816+fHne333aaadx3333AfDqq6/y5ptv8tnPfpampibWrVvH/v37eeutt3j++ecL/jnGjx/PsmXLADqOWQ7q4YtIdNl3P4vjeAUcGMOHoJd9zz33UFdXx1e+8hW+/OUv09zcTEtLS8c9bYcNG8acOXP44he/SF1dHaNGjWLx4sUdxzv33HPZuXMnU6ZMYeXKldxxxx2cddZZHHnkkZx66ql523H55Zdz6aWX0tzcTI8ePVi8eDFHHHEE48ePZ/DgwZx00kmceOKJnHLKKQX/TPPnz+eCCy7g1ltvZerUqQX3j4tVqqhlS0uLr1mzpiK/u+bN7X3w0rnw8+xtpfy9UvVeeeUVTjzxxEo3o6R27dpFr169cHeuuOIKhgwZwlVXXVXpZgG5z7+ZrXX3lmKOpyEdEUm1RYsWMXLkSIYNG0ZbWxuXXHJJpZtUMhrSEZFUu+qqqxLToy819fBFpFO6l1FllOK8q4efBtn3oBWJqKGhge3bt9OnT5+KLCNMK3dn+/btNDQ0xHpcBX4aaA2+FGnAgAG0traybdu2SjcldRoaGhgwYECsx1Tgi0h
e9fX1HVehSvXTGL6ISEoo8EVEUkKBLyKSEgp8EZGU0KStxCe7zoqqdIokigJf4pMd7rohikiiFAx8MxsILAGOA/YDC919ftY+E4CHgdczLz3o7jfH21TplG5OLiIFROnh7wW+4+4vmNlRwFoze8LdN2Tt90d3nxx/EyUS3ZxcRAooOGnr7lvd/YXM453AK0D/UjdMRETi1aVVOmbWBIwCnsuxeZyZ/dnMHjOzYTG0TUREYhR50tbMegG/Br7l7h9kbX4BON7dd5nZJOA3wJAcx5gFzAIYNEjjzCIi5RSph29m9QRhf5+7P5i93d0/cPddmccrgXoza8yx30J3b3H3lr59+3az6SIi0hUFA9+Cmqi/BF5x95/k2ee4zH6Y2ecyx90eZ0NFRKR7ogzpjAdmAC+Z2brMa9cDgwDc/U5gGnCZme0F9gDTXXdNKD0txRSRLigY+O7+DNDpnQ/cfQGwIK5GSURJX4qZfeMVXXUrUlG60lZKJxzwuupWpOJUPE1EJCUU+CIiKaEhnWoSnqQFTdSKSJco8KtJ0idpRSTRNKQjIpIS6uEnndbai0hMFPhJp2EcEYmJhnRERFJCgS8ikhIKfBGRlFDgi4ikhAJfRCQltEpHykOVM0UqToEv5aHKmSIVpyEdEZGUUA9fumX8vFVs2bEHgP5H92T1tadXuEUiko8CP4kSXk4hO+Q3zzsHgKZrH61ks0SkAAV+EiW8nMKWHXs6Ql5EqofG8EVEUkKBLyKSEgp8EZGUUOCLiKSEAl9EJCW0SicJdHNyESmDgoFvZgOBJcBxwH5gobvPz9rHgPnAJGA3MNPdX4i/uTUq4cswo+p/dM+D1uLrQiyRZInSw98LfMfdXzCzo4C1ZvaEu28I7XM2MCTzMwb4eea/kiLZ4a4LsUSSpeAYvrtvPdBbd/edwCtA/6zdpgJLPPAscLSZ9Yu9tSIiUrQujeGbWRMwCngua1N/4K3Q89bMa1uz3j8LmAUwaJDGqdMkXI5hc0PWxuxSEiqdLFISkQPfzHoBvwa+5e4fZG/O8RY/5AX3hcBCgJaWlkO2p0rC6+XELVyOofV7jQwIl0juPeiTOQyVThYpmUiBb2b1BGF/n7s/mGOXVmBg6PkA4O3uN6+GVdFEbbh3DsFkbHd84cN/Uy0ekQqIskrHgF8Cr7j7T/LstgKYbWbLCCZr29x9a559pcoUWywtvGqnux8SItJ9UXr444EZwEtmti7z2vXAIAB3vxNYSbAkcxPBssyL42+qVBstyRQpTq5v1XH8eyoY+O7+DLnH6MP7OHBFt1sjIiKHfKuOa4mzrrQtF11NKyIVpsAvlyqapBWR2qTiaSIiKaEefimlbK29iCSbAr+UNIwjIgmiwO+uXJOxKg0gIgmkwO+u7F68SgOISEIp8OPWe9AnoV/F4/bhCz90laxIbVDgx61GhnOKLacgIsmlwC+GVt+UzFb60i/zDSl4vKnCLRKpHQr8Ymj1TcmMa5/f8c2in+ZDRGKlwJeyy773bfY2ESkNBb6UXbFV/7InklWNU6RrFPiSWK1+8J2xlnsjA+b9FdAN0kWKocCXxDqv56KDaoJvbrig43H2sJB6/CKFKfAlsQ4J8Ln5t6nHL1KYAl861MrFVhrrF8lNgS8dauViq/CfI2rPv1S3lBNJEgW+CKW7pZxIkijwpSaEJ3GjDkfVyhCWSFQKfKkJ+YZfOlvNUytDWCJRKfClpmk1j8gnFPhh2UXRwpUvVTCt8rJLTxdRmbSYoR+RWqHADwsXRcsu3KWCaZUXDvgiC6tp5Y0kVTnmlAoGvpndDUwG3nX3k3NsnwA8DLyeeelBd785zkaKlFv2NwF9UEiplWNOKUoPfzGwAFjSyT5/dPfJsbRIJAHCAa9xf6kVBQPf3Z82s6bSNyVhwuPFB55LKhUq56zev0RV6Qv84hrDH2dmfwbeBr7r7i/HdNzKqZFbFUr3dfYPUr1/6YpKX+AXR+C/ABzv7rvMbBLwG2BIrh3
NbBYwC2DQIPWYpRuyv4Flb9MHtsghuh347v5B6PFKM7vDzBrd/b0c+y4EFgK0tLR4d3+3pFhnga5bI4rk1O3AN7PjgHfc3c3sc8BhwPZut0ykCmg1j1STKMsylwITgEYzawW+B9QDuPudwDTgMjPbC+wBpru7eu+SClrNI9Ukyiqd8wtsX0CwbLM66QpaEUkJXWmrK2glJhrekaRT4IvERMM7knTpC/zwEA5oGEeqQqUv2JHakL7A1xBOh1whIpWV7368lb5gR2pD+gJfOugGIMlTzP14RaJS4IuUQGd32hKpFAW+SAnEcact3axF4qbAl9oTw52xkkDfCKpXvrmYSlPgS+3JvjWliqxJmYXnYsbPW5X3m1q5v8WlI/B1NW16qciaVFhnvfty9/zTEfhaiikiRai16x/SEfgiFaayC9Upe+lyeHgmrFr+nyrwRcogHAbZY7rFBIU+QCoj33mulmsmFPgiZZYv/CH6xJ3q9kgxFPgpk71cLNUSsHxTPXMpp9oNfK3MyUnlFELCAV8jK3ZqbZJR4lW7ga+VOZJCxRZZ6+xCoaReRJQm2aU6ilW7gS+SElEv3snu/WcfI9eFQtnb0jBfUMywZ67aSXEKf8jarcUfR4EvAgeP5x94XiVX4XbW487+MIgynNeV4yW5t1/s8FYxw55JPg9hCnwRODTca2RMP+4gKrTC6MD2OIaB8h0japBHHd5K030hFPgiUpTOKoJGrevf2QdDvmPEcTOY7N+bloUMCnwRqZh8RcagtD3ttK5WU+CLSEl1tsIkHOrFDj9FnVfQ/QVqKfB1c3KJUwIuyqo2+QK11BOanZWtyLdfWtVO4GvdvcSpBi/KKrUkBGoS2pBkBQPfzO4GJgPvuvvJObYbMB+YBOwGZrr7C3E3VIqncgrdpN5+xWk4Jh5ReviLgQXAkjzbzwaGZH7GAD/P/FcSIq0TVLFRb7/i1HOPR8HAd/enzaypk12mAkvc3YFnzexoM+vn7ltjamN+qpcjIhJZHGP4/YG3Qs9bM6+VPvA1bi8iEtlhMRzDcrzmOXc0m2Vma8xszbZt22L41SIiElUcgd8KDAw9HwC8nWtHd1/o7i3u3tK3b98YfrWIiEQVx5DOCmC2mS0jmKxtK8v4veSVptogZVfFRdZEoizLXApMABrNrBX4HlAP4O53AisJlmRuIliWeXGpGgtoojYCrcopoRotsibpEGWVzvkFtjtwRWwtKkQTtSIiRYljDF9ERKqAAl9EJCUU+CIiKZH84mmqgilJpjo7UkWSH/iapJUkU50dqSIa0hERSQkFvohISiR/SEekWmg8XxIumYGvq2mlGmk8XxIumYGvidou012tRKSQZAa+dJnq54hIIZq0FRFJCfXwq5iGcUSkK5IT+Jqo7TIN44hIV1Qu8N95+dAbSWiiVkSkZCoX+Ps+grl7Cu8nUo10ZyxJoOQM6YjUEt0ZSxJIq3RERFJCPXyRcsge4ulsPw39SIko8EXKIWqIa+hHSkiBX0XC6+5Ba+9FpGsU+FVE6+5FpDsU+CJJohLLUkIKfJEkCQf8T5sV/hIrBb5IUqm+vsQsUuCb2VnAfKAO+IW7z8vaPhP4IbAl89ICd/9FjO1MLRVIE0BX7kosCga+mdUBtwNfAlqBP5nZCnffkLXrcnefXYI2ppomagXQlbsSiyhX2n4O2OTur7n7R8AyYGppmyUiInGLEvj9gbdCz1szr2X7FzN70cweMLOBuQ5kZrPMbI2Zrdm224torogAnwzxzO0dTO6KRBBlDN9yvJad1r8Flrr7h2Z2KXAPcPohb3JfCCwEaPlvdUp8kWJpQleKECXwW4Fwj30A8HZ4B3ffHnq6CLi1+01LL03USpd0VqdHk7sSEiXw/wQMMbPBBKtwpgMXhHcws37uvjXzdArwSqytrHG5SiZoolYi6yzQ1fuXkIKB7+57zWw28DjBssy73f1lM7sZWOPuK4BvmtkUYC/wPjCzhG2uOVqJIyLlEGk
dvruvBFZmvXZT6PF1wHXxNq22adhGykLDPRKiK20rRL16KQsN90iIAr9MVNpYEkeF2lJHgV8m6tFL4mhpZ+oo8EVEvf2UUOCLSOe9/Z82Q9ubwWN9GFQ1Bb6IHCxXZc65bcFj1eivagp8ETlYZyGucf+qpsAvIa21F5EkUeCXkFbmiEiSKPBjpl69pIau4q06CvxuUuEzSS1dxVt1FPhFyO7FK+BFpBoo8IugsXmRAjob7sneT0M/ZaPAF5H4RQ3x8Lr+bPowiJ0CX0QqR/MAZaXAj0irb0Sk2inw89DqG5EKU0G32Cnw89DErEiF5SvjEC7mlk0fDJ1S4ItI8mX39g8Uc8um4m6dUuCHaJxeJKGiBnd4v+wVQPoAUODrIiqRGpUd7vmWgKbogyB1ga/JWJGUyhfqKVr+mbrA12SsiBwk6lXBB/at4m8DqQh8jc2LSF5dCfAqnxSOFPhmdhYwH6gDfuHu87K2HwEsAUYD24Hz3H1zvE3tGo3Ni0jsOpsUDkvoh0HBwDezOuB24EtAK/AnM1vh7htCu30d+Ju7/6OZTQduBc4rRYOj0tCNiJRUZ4Ge0G8CUXr4nwM2uftrAGa2DJgKhAN/KjA38/gBYIGZmbt7jG3tVK7JWBGRikjoN4Eogd8feCv0vBUYk28fd99rZm1AH+C97jYwO8jzNlLDNiKSRFG/CXQmpg+GKIFvOV7L7rlH2QczmwXMyjzdZWZ/ifD7I3kDsOviOlqHRmL40EoBnafCdI6i0XnKaT18uyNmP1vsUaIEfiswMPR8APB2nn1azawH0Bt4P/tA7r4QWFhcU8vPzNa4e0ul25F0Ok+F6RxFo/NUmJmtKfa9h0XY50/AEDMbbGaHA9OBFVn7rAC+mnk8DVhVzvF7EREprGAPPzMmPxt4nGBZ5t3u/rKZ3QyscfcVwC+Be81sE0HPfnopGy0iIl0XaR2+u68EVma9dlPocTtwbrxNS4SqGX6qMJ2nwnSOotF5Kqzoc2QaeRERSYcoY/giIlIDFPgEpSPM7C9mtsnMrs2x/dtmtsHMXjSz35vZ8ZVoZyUVOkeh/aaZmZtZKldaRDlPZvavmb9PL5vZv5e7jUkQ4d/cIDN70sz+K/PvblIl2llJZna3mb1rZuvzbDcz+7fMOXzRzE4peFB3T/UPwUT0X4ETgMOBPwMnZe3z34FPZR5fBiyvdLuTdo4y+x0FPA08C7RUut1JPE/AEOC/gGMyzz9T6XYn9DwtBC7LPD4J2FzpdlfgPJ0GnAKsz7N9EvAYwXVQY4HnCh1TPfxQ6Qh3/wg4UDqig7s/6e67M0+fJbgWIU0KnqOM7wO3Ae3lbFyCRDlP3wBud/e/Abj7u2VuYxJEOU8OfDrzuDeHXvtT89z9aXJczxQyFVjigWeBo82sX2fHVODnLh3Rv5P9v07wqZomBc+RmY0CBrr7I+VsWMJE+bv0T8A/mdlqM3s2U4k2baKcp7nAhWbWSrBC8H+Vp2lVpavZlY56+AVEKgsBYGYXAi3AF0vaouTp9ByZ2WHAT4GZ5WpQQkX5u9SDYFhnAsE3xT+a2cnuvqPEbUuSKOfpfGCxu//YzMYRXOdzsrvvL33zqkbk7DpAPfxopSMwszOAOcAUd/+wTG1LikLn6CjgZOApM9tMMJ64IoUTt1HLkDzs7h+7++vAXwg+ANIkynn6OnA/gLv/J9BAUGdHPhEpu8IU+BFKR2SGK+4iCPs0jrl2eo7cvc3dG929yd2bCOY5prh70TU/qlSUMiS/IVgEgJk1EgzxvFbWVlZelPP0JjARwMxOJAj8bWVtZfKtAC7KrNYZC7S5+9bO3pD6IR2PVjrih0Av4D/MDOBNd59SsUaXWcRzlHoRz9PjwJlmtgHYB1zt7tsr1+ryi3ievgMsMrOrCIYpZnpmaUpamNlSgqG/xsxcxveAegB3v5NgbmMSsAnYDVxc8JgpO4ciIqmlIR0RkZRQ4IuIpIQCX0QkJRT4IiIpocAXEUkJBb7
UBDMbaGavm9mxmefHZJ4fb2ZDzOwRM/urma3NVGE8LbPfTDPbZmbrMtUrHzCzT2W2/bOZnVRke0amscKjJJsCX2qCu78F/ByYl3lpHkHFxXeAR4GF7v4P7j6aoC7LCaG3L3f3ke4+DPgIOC/z+j8TVGosxkiCNdIiiaF1+FIzzKweWAvcTVCVchQwAzjN3b+a5z0zCUo5zzazHsCvgV8B7wKPAG2Zn3/JvOV2oC/BhS7fcPeNZnYuwUUx+zL7nkFwMUxPYAvwf9x9eex/YJEuSv2VtlI73P1jM7sa+B1wprt/ZGbDgBcKvPU8M/sC0A94Ffitu+8zsxXAI+7+AICZ/R641N3/n5mNAe4ATgduAv6Hu28xs6Mzv/cmMh8kpfnTinSdhnSk1pwNbCUo5nYIM3vIzNab2YOhl5e7+0jgOOAl4Ooc7+sFfJ6gvMY6gtpKB2qPrwYWm9k3CEoFiCSSAl9qhpmNBL5EUK3zqszNIF4muGsQAO7+PwnKOB+b/f5MrZbfEtxpKNthwI7MWP+BnxMz77sUuIGgcuFaM+sT6x9MJCYKfKkJFlS1+znwLXd/k6Dg3Y+AfwfGm1m42N2nOjnUFwhuvwewk6D0M+7+AfB6Zrz+wP1ER2Qe/4O7P+fuNxFUdBwYfq9IUmjSVmqCmc0CJrr7eZnndcDzwLcJVur8BBiaebwTuM3d/29m0vaHBJOrhxHUGJ/p7u+a2XhgEfAhMA3YT/Ch0o+gauEyd785Mzw0hOCGFL8HvgUcQ1ANsh5N2kpCKPBFRFJCQzoiIimhwBcRSQkFvohISijwRURSQoEvIpISCnwRkZRQ4IuIpIQCX0QkJf4/4R4UtsdHB4gAAAAASUVORK5CYII=\n", + "text/plain": [ + "
" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "image/png": "\n", + "text/plain": [ + "
" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "image/png": "\n", + "text/plain": [ + "
" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "xgboost_bdt = XGBClassifier() # LR=0.1 as default compared to bdt3 later\n", + "xgboost_bdt.fit(training_data[training_columns], training_data['catagory'])\n", + "for df in [mc_df, bkg_df, data_df, training_data]:\n", + " df['XGBtest'] = xgboost_bdt.predict_proba(df[training_columns])[:,1]\n", + "\n", + "plt.figure()\n", + "plot_comparision('XGBtest', mc_df, bkg_df)\n", + "\n", + "plt.figure()\n", + "plot_significance(xgboost_bdt, training_data, training_columns)\n", + "\n", + "plt.figure()\n", + "plot_roc(xgboost_bdt, training_data, training_columns)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### $k$-folding\n", + "\n", + "Let's go search for `scikit learn k-folding`.\n", + "\n", + " - https://scikit-learn.org/stable/modules/generated/sklearn.model_selection.KFold.html\n", + "\n", + "Look at the example section:\n", + "\n", + "```python\n", + ">>> from sklearn.model_selection import KFold\n", + ">>> X = np.array([[1, 2], [3, 4], [1, 2], [3, 4]])\n", + ">>> y = np.array([1, 2, 3, 4])\n", + ">>> kf = KFold(n_splits=2)\n", + ">>> kf.get_n_splits(X)\n", + "2\n", + ">>> print(kf) \n", + "KFold(n_splits=2, random_state=None, shuffle=False)\n", + ">>> for train_index, test_index in kf.split(X):\n", + "... print(\"TRAIN:\", train_index, \"TEST:\", test_index)\n", + "... X_train, X_test = X[train_index], X[test_index]\n", + "... y_train, y_test = y[train_index], y[test_index]\n", + "TRAIN: [2 3] TEST: [0 1]\n", + "TRAIN: [0 1] TEST: [2 3]\n", + "```" + ] + }, + { + "cell_type": "code", + "execution_count": 8, + "metadata": {}, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "/cvmfs/sft.cern.ch/lcg/views/LCG_94python3/x86_64-centos7-gcc7-opt/lib/python3.6/site-packages/sklearn/preprocessing/label.py:151: DeprecationWarning: The truth value of an empty array is ambiguous. 
Returning False, but in future this will result in an error. Use `array.size > 0` to check that an array is not empty.\n", + " if diff:\n", + "/cvmfs/sft.cern.ch/lcg/views/LCG_94python3/x86_64-centos7-gcc7-opt/lib/python3.6/site-packages/sklearn/preprocessing/label.py:151: DeprecationWarning: The truth value of an empty array is ambiguous. Returning False, but in future this will result in an error. Use `array.size > 0` to check that an array is not empty.\n", + " if diff:\n", + "/cvmfs/sft.cern.ch/lcg/views/LCG_94python3/x86_64-centos7-gcc7-opt/lib/python3.6/site-packages/sklearn/preprocessing/label.py:151: DeprecationWarning: The truth value of an empty array is ambiguous. Returning False, but in future this will result in an error. Use `array.size > 0` to check that an array is not empty.\n", + " if diff:\n", + "/cvmfs/sft.cern.ch/lcg/views/LCG_94python3/x86_64-centos7-gcc7-opt/lib/python3.6/site-packages/sklearn/preprocessing/label.py:151: DeprecationWarning: The truth value of an empty array is ambiguous. Returning False, but in future this will result in an error. Use `array.size > 0` to check that an array is not empty.\n", + " if diff:\n", + "/cvmfs/sft.cern.ch/lcg/views/LCG_94python3/x86_64-centos7-gcc7-opt/lib/python3.6/site-packages/sklearn/preprocessing/label.py:151: DeprecationWarning: The truth value of an empty array is ambiguous. Returning False, but in future this will result in an error. 
Use `array.size > 0` to check that an array is not empty.\n", + " if diff:\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "accuracy: [0.70586404 0.71157239 0.7035807 0.7068715 0.72565912]\n", + "-logloss: [-0.54623834 -0.54819564 -0.54917768 -0.54841566 -0.52840935]\n", + "roc_auc: [0.79658373 0.79572423 0.79352088 0.79733309 0.82042929]\n" + ] + } + ], + "source": [ + "X1, y1 = training_data[training_columns], training_data['catagory']\n", + "splits = 5\n", + "kf = KFold(splits,True)\n", + "for train, test in kf.split(X1):\n", + " X_train, X_test = X1.iloc[train], X1.iloc[test]\n", + " y_train, y_test = y1.iloc[train], y1.iloc[test]\n", + " xgboost_bdt.fit(X_train,y_train)\n", + "cv_acc_1 = cross_val_score(xgboost_bdt, X_test, y_test, cv=splits, scoring=\"accuracy\")\n", + "cv_los_1 = cross_val_score(xgboost_bdt, X_test, y_test, cv=splits, scoring=\"neg_log_loss\")\n", + "cv_auc_1 = cross_val_score(xgboost_bdt, X_test, y_test, cv=splits, scoring=\"roc_auc\")\n", + "print(\"accuracy: \",cv_acc_1)\n", + "print(\"-logloss: \",cv_los_1)\n", + "print(\"roc_auc: \",cv_auc_1)" + ] + }, + { + "cell_type": "code", + "execution_count": 9, + "metadata": {}, + "outputs": [], + "source": [ + "def modelfit(alg, metric, train, test, predictors, cv_folds=5, early_stop=10): #50):\n", + " xgb_param = alg.get_xgb_params()\n", + " xgtrain = xgb.DMatrix(train, label=test, feature_names=predictors)\n", + " cvresult = xgb.cv(xgb_param,\n", + " xgtrain,\n", + " num_boost_round=alg.get_params()['n_estimators'],\n", + " nfold=cv_folds,\n", + " metrics=metric,\n", + " early_stopping_rounds=early_stop)\n", + " alg.set_params(n_estimators=cvresult.shape[0])\n", + " #Fit the algorithm on the data \n", + " alg.fit(train, test, eval_metric=metric)\n", + " #Predict training set: \n", + " train_predictions = alg.predict(train)\n", + " #Print model report: \n", + " print(\"\\nModel Report : best iteration \"+str(cvresult.shape[0]))\n", + " print(\"Accuracy : 
\"+str(metrics.accuracy_score(test, train_predictions)))\n", + " return cvresult.shape[0]\n" + ] + }, + { + "cell_type": "code", + "execution_count": 10, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\n", + "Model Report : best iteration 733\n", + "Accuracy : 0.8002681966886428\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "/cvmfs/sft.cern.ch/lcg/views/LCG_94python3/x86_64-centos7-gcc7-opt/lib/python3.6/site-packages/sklearn/preprocessing/label.py:151: DeprecationWarning: The truth value of an empty array is ambiguous. Returning False, but in future this will result in an error. Use `array.size > 0` to check that an array is not empty.\n", + " if diff:\n" + ] + } + ], + "source": [ + "X1, y1 = training_data[training_columns], training_data['catagory']\n", + "LR = 0.2 # choosing a high learning rate to establish earlystopping limit to use during grid scan\n", + "bdt0 = XGBClassifier( learning_rate=LR, n_estimators=1000,\n", + " #max_depth=6, min_child_weight=1, #default values\n", + " #gamma=0, subsample=0.8,\n", + " #colsample_bytree=0.8, scale_pos_weight=1,\n", + " objective='binary:logistic', #'mutli:softprob', num_class=3, #or more\n", + " seed=123)\n", + "estimators = modelfit(bdt0, 'error', X1, y1, training_columns) #'merror' for multiclass" + ] + }, + { + "cell_type": "code", + "execution_count": 11, + "metadata": { + "scrolled": true + }, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "/cvmfs/sft.cern.ch/lcg/views/LCG_94python3/x86_64-centos7-gcc7-opt/lib/python3.6/site-packages/sklearn/preprocessing/label.py:151: DeprecationWarning: The truth value of an empty array is ambiguous. Returning False, but in future this will result in an error. 
Use `array.size > 0` to check that an array is not empty.\n", + " if diff:\n", + "/cvmfs/sft.cern.ch/lcg/views/LCG_94python3/x86_64-centos7-gcc7-opt/lib/python3.6/site-packages/sklearn/preprocessing/label.py:151: DeprecationWarning: The truth value of an empty array is ambiguous. Returning False, but in future this will result in an error. Use `array.size > 0` to check that an array is not empty.\n", + " if diff:\n", + "/cvmfs/sft.cern.ch/lcg/views/LCG_94python3/x86_64-centos7-gcc7-opt/lib/python3.6/site-packages/sklearn/preprocessing/label.py:151: DeprecationWarning: The truth value of an empty array is ambiguous. Returning False, but in future this will result in an error. Use `array.size > 0` to check that an array is not empty.\n", + " if diff:\n", + "/cvmfs/sft.cern.ch/lcg/views/LCG_94python3/x86_64-centos7-gcc7-opt/lib/python3.6/site-packages/sklearn/preprocessing/label.py:151: DeprecationWarning: The truth value of an empty array is ambiguous. Returning False, but in future this will result in an error. Use `array.size > 0` to check that an array is not empty.\n", + " if diff:\n", + "/cvmfs/sft.cern.ch/lcg/views/LCG_94python3/x86_64-centos7-gcc7-opt/lib/python3.6/site-packages/sklearn/preprocessing/label.py:151: DeprecationWarning: The truth value of an empty array is ambiguous. Returning False, but in future this will result in an error. Use `array.size > 0` to check that an array is not empty.\n", + " if diff:\n", + "/cvmfs/sft.cern.ch/lcg/views/LCG_94python3/x86_64-centos7-gcc7-opt/lib/python3.6/site-packages/sklearn/preprocessing/label.py:151: DeprecationWarning: The truth value of an empty array is ambiguous. Returning False, but in future this will result in an error. Use `array.size > 0` to check that an array is not empty.\n", + " if diff:\n", + "/cvmfs/sft.cern.ch/lcg/views/LCG_94python3/x86_64-centos7-gcc7-opt/lib/python3.6/site-packages/sklearn/preprocessing/label.py:151: DeprecationWarning: The truth value of an empty array is ambiguous. 
Returning False, but in future this will result in an error. Use `array.size > 0` to check that an array is not empty.\n", + " if diff:\n", + "/cvmfs/sft.cern.ch/lcg/views/LCG_94python3/x86_64-centos7-gcc7-opt/lib/python3.6/site-packages/sklearn/preprocessing/label.py:151: DeprecationWarning: The truth value of an empty array is ambiguous. Returning False, but in future this will result in an error. Use `array.size > 0` to check that an array is not empty.\n", + " if diff:\n", + "/cvmfs/sft.cern.ch/lcg/views/LCG_94python3/x86_64-centos7-gcc7-opt/lib/python3.6/site-packages/sklearn/preprocessing/label.py:151: DeprecationWarning: The truth value of an empty array is ambiguous. Returning False, but in future this will result in an error. Use `array.size > 0` to check that an array is not empty.\n", + " if diff:\n", + "/cvmfs/sft.cern.ch/lcg/views/LCG_94python3/x86_64-centos7-gcc7-opt/lib/python3.6/site-packages/sklearn/preprocessing/label.py:151: DeprecationWarning: The truth value of an empty array is ambiguous. Returning False, but in future this will result in an error. Use `array.size > 0` to check that an array is not empty.\n", + " if diff:\n", + "/cvmfs/sft.cern.ch/lcg/views/LCG_94python3/x86_64-centos7-gcc7-opt/lib/python3.6/site-packages/sklearn/preprocessing/label.py:151: DeprecationWarning: The truth value of an empty array is ambiguous. Returning False, but in future this will result in an error. Use `array.size > 0` to check that an array is not empty.\n", + " if diff:\n", + "/cvmfs/sft.cern.ch/lcg/views/LCG_94python3/x86_64-centos7-gcc7-opt/lib/python3.6/site-packages/sklearn/preprocessing/label.py:151: DeprecationWarning: The truth value of an empty array is ambiguous. Returning False, but in future this will result in an error. 
Use `array.size > 0` to check that an array is not empty.\n", + " if diff:\n", + "/cvmfs/sft.cern.ch/lcg/views/LCG_94python3/x86_64-centos7-gcc7-opt/lib/python3.6/site-packages/sklearn/preprocessing/label.py:151: DeprecationWarning: The truth value of an empty array is ambiguous. Returning False, but in future this will result in an error. Use `array.size > 0` to check that an array is not empty.\n", + " if diff:\n", + "/cvmfs/sft.cern.ch/lcg/views/LCG_94python3/x86_64-centos7-gcc7-opt/lib/python3.6/site-packages/sklearn/preprocessing/label.py:151: DeprecationWarning: The truth value of an empty array is ambiguous. Returning False, but in future this will result in an error. Use `array.size > 0` to check that an array is not empty.\n", + " if diff:\n", + "/cvmfs/sft.cern.ch/lcg/views/LCG_94python3/x86_64-centos7-gcc7-opt/lib/python3.6/site-packages/sklearn/preprocessing/label.py:151: DeprecationWarning: The truth value of an empty array is ambiguous. Returning False, but in future this will result in an error. Use `array.size > 0` to check that an array is not empty.\n", + " if diff:\n", + "/cvmfs/sft.cern.ch/lcg/views/LCG_94python3/x86_64-centos7-gcc7-opt/lib/python3.6/site-packages/sklearn/preprocessing/label.py:151: DeprecationWarning: The truth value of an empty array is ambiguous. Returning False, but in future this will result in an error. Use `array.size > 0` to check that an array is not empty.\n", + " if diff:\n", + "/cvmfs/sft.cern.ch/lcg/views/LCG_94python3/x86_64-centos7-gcc7-opt/lib/python3.6/site-packages/sklearn/preprocessing/label.py:151: DeprecationWarning: The truth value of an empty array is ambiguous. Returning False, but in future this will result in an error. Use `array.size > 0` to check that an array is not empty.\n", + " if diff:\n", + "/cvmfs/sft.cern.ch/lcg/views/LCG_94python3/x86_64-centos7-gcc7-opt/lib/python3.6/site-packages/sklearn/preprocessing/label.py:151: DeprecationWarning: The truth value of an empty array is ambiguous. 
Returning False, but in future this will result in an error. Use `array.size > 0` to check that an array is not empty.\n", + " if diff:\n", + "/cvmfs/sft.cern.ch/lcg/views/LCG_94python3/x86_64-centos7-gcc7-opt/lib/python3.6/site-packages/sklearn/preprocessing/label.py:151: DeprecationWarning: The truth value of an empty array is ambiguous. Returning False, but in future this will result in an error. Use `array.size > 0` to check that an array is not empty.\n", + " if diff:\n", + "/cvmfs/sft.cern.ch/lcg/views/LCG_94python3/x86_64-centos7-gcc7-opt/lib/python3.6/site-packages/sklearn/preprocessing/label.py:151: DeprecationWarning: The truth value of an empty array is ambiguous. Returning False, but in future this will result in an error. Use `array.size > 0` to check that an array is not empty.\n", + " if diff:\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "{'max_depth': 7, 'min_child_weight': 3}\n", + "0.8143962709007511\n" + ] + } + ], + "source": [ + "bdt1 = XGBClassifier( learning_rate=LR, n_estimators=estimators,\n", + " #max_depth=6, min_child_weight=1, #default values\n", + " #gamma=0, subsample=0.8,\n", + " #colsample_bytree=0.8, scale_pos_weight=1, \n", + " objective='binary:logistic', #'multi:softprob', num_class=3, #or more\n", + " seed=123)\n", + " \n", + "param_test1 = {\n", + " 'max_depth':np.arange( 5, 9, 2 ),\n", + " 'min_child_weight':np.arange( 1, 5, 2 ),\n", + " #'gamma':np.arange( 0.0, 1.0, 0.2 ),\n", + " #'colsample_bytree':np.arange( 0.4, 1.0, 0.2 ),\n", + " #'subsample':np.arange( 0.4, 1.0, 0.2 ),\n", + " #'scale_pos_weight':np.arange( 0.4, 1.6, 0.2 )\n", + "}\n", + "gsearch1 = GridSearchCV(estimator=bdt1,\n", + " param_grid=param_test1,\n", + " scoring='accuracy',\n", + " iid=False,\n", + " cv=5)\n", + "gsearch1.fit(X1,y1)\n", + "print(gsearch1.best_params_)\n", + "print(gsearch1.best_score_)" + ] + }, + { + "cell_type": "code", + "execution_count": 12, + "metadata": {}, + "outputs": [ + { + "name": "stderr", + 
"output_type": "stream", + "text": [ + "/cvmfs/sft.cern.ch/lcg/views/LCG_94python3/x86_64-centos7-gcc7-opt/lib/python3.6/site-packages/sklearn/preprocessing/label.py:151: DeprecationWarning: The truth value of an empty array is ambiguous. Returning False, but in future this will result in an error. Use `array.size > 0` to check that an array is not empty.\n", + " if diff:\n", + "/cvmfs/sft.cern.ch/lcg/views/LCG_94python3/x86_64-centos7-gcc7-opt/lib/python3.6/site-packages/sklearn/preprocessing/label.py:151: DeprecationWarning: The truth value of an empty array is ambiguous. Returning False, but in future this will result in an error. Use `array.size > 0` to check that an array is not empty.\n", + " if diff:\n", + "/cvmfs/sft.cern.ch/lcg/views/LCG_94python3/x86_64-centos7-gcc7-opt/lib/python3.6/site-packages/sklearn/preprocessing/label.py:151: DeprecationWarning: The truth value of an empty array is ambiguous. Returning False, but in future this will result in an error. Use `array.size > 0` to check that an array is not empty.\n", + " if diff:\n", + "/cvmfs/sft.cern.ch/lcg/views/LCG_94python3/x86_64-centos7-gcc7-opt/lib/python3.6/site-packages/sklearn/preprocessing/label.py:151: DeprecationWarning: The truth value of an empty array is ambiguous. Returning False, but in future this will result in an error. Use `array.size > 0` to check that an array is not empty.\n", + " if diff:\n", + "/cvmfs/sft.cern.ch/lcg/views/LCG_94python3/x86_64-centos7-gcc7-opt/lib/python3.6/site-packages/sklearn/preprocessing/label.py:151: DeprecationWarning: The truth value of an empty array is ambiguous. Returning False, but in future this will result in an error. Use `array.size > 0` to check that an array is not empty.\n", + " if diff:\n", + "/cvmfs/sft.cern.ch/lcg/views/LCG_94python3/x86_64-centos7-gcc7-opt/lib/python3.6/site-packages/sklearn/preprocessing/label.py:151: DeprecationWarning: The truth value of an empty array is ambiguous. 
Returning False, but in future this will result in an error. Use `array.size > 0` to check that an array is not empty.\n", + " if diff:\n", + "/cvmfs/sft.cern.ch/lcg/views/LCG_94python3/x86_64-centos7-gcc7-opt/lib/python3.6/site-packages/sklearn/preprocessing/label.py:151: DeprecationWarning: The truth value of an empty array is ambiguous. Returning False, but in future this will result in an error. Use `array.size > 0` to check that an array is not empty.\n", + " if diff:\n", + "/cvmfs/sft.cern.ch/lcg/views/LCG_94python3/x86_64-centos7-gcc7-opt/lib/python3.6/site-packages/sklearn/preprocessing/label.py:151: DeprecationWarning: The truth value of an empty array is ambiguous. Returning False, but in future this will result in an error. Use `array.size > 0` to check that an array is not empty.\n", + " if diff:\n", + "/cvmfs/sft.cern.ch/lcg/views/LCG_94python3/x86_64-centos7-gcc7-opt/lib/python3.6/site-packages/sklearn/preprocessing/label.py:151: DeprecationWarning: The truth value of an empty array is ambiguous. Returning False, but in future this will result in an error. Use `array.size > 0` to check that an array is not empty.\n", + " if diff:\n", + "/cvmfs/sft.cern.ch/lcg/views/LCG_94python3/x86_64-centos7-gcc7-opt/lib/python3.6/site-packages/sklearn/preprocessing/label.py:151: DeprecationWarning: The truth value of an empty array is ambiguous. Returning False, but in future this will result in an error. Use `array.size > 0` to check that an array is not empty.\n", + " if diff:\n", + "/cvmfs/sft.cern.ch/lcg/views/LCG_94python3/x86_64-centos7-gcc7-opt/lib/python3.6/site-packages/sklearn/preprocessing/label.py:151: DeprecationWarning: The truth value of an empty array is ambiguous. Returning False, but in future this will result in an error. 
Use `array.size > 0` to check that an array is not empty.\n", + " if diff:\n", + "/cvmfs/sft.cern.ch/lcg/views/LCG_94python3/x86_64-centos7-gcc7-opt/lib/python3.6/site-packages/sklearn/preprocessing/label.py:151: DeprecationWarning: The truth value of an empty array is ambiguous. Returning False, but in future this will result in an error. Use `array.size > 0` to check that an array is not empty.\n", + " if diff:\n", + "/cvmfs/sft.cern.ch/lcg/views/LCG_94python3/x86_64-centos7-gcc7-opt/lib/python3.6/site-packages/sklearn/preprocessing/label.py:151: DeprecationWarning: The truth value of an empty array is ambiguous. Returning False, but in future this will result in an error. Use `array.size > 0` to check that an array is not empty.\n", + " if diff:\n", + "/cvmfs/sft.cern.ch/lcg/views/LCG_94python3/x86_64-centos7-gcc7-opt/lib/python3.6/site-packages/sklearn/preprocessing/label.py:151: DeprecationWarning: The truth value of an empty array is ambiguous. Returning False, but in future this will result in an error. Use `array.size > 0` to check that an array is not empty.\n", + " if diff:\n", + "/cvmfs/sft.cern.ch/lcg/views/LCG_94python3/x86_64-centos7-gcc7-opt/lib/python3.6/site-packages/sklearn/preprocessing/label.py:151: DeprecationWarning: The truth value of an empty array is ambiguous. Returning False, but in future this will result in an error. Use `array.size > 0` to check that an array is not empty.\n", + " if diff:\n", + "/cvmfs/sft.cern.ch/lcg/views/LCG_94python3/x86_64-centos7-gcc7-opt/lib/python3.6/site-packages/sklearn/preprocessing/label.py:151: DeprecationWarning: The truth value of an empty array is ambiguous. Returning False, but in future this will result in an error. Use `array.size > 0` to check that an array is not empty.\n", + " if diff:\n", + "/cvmfs/sft.cern.ch/lcg/views/LCG_94python3/x86_64-centos7-gcc7-opt/lib/python3.6/site-packages/sklearn/preprocessing/label.py:151: DeprecationWarning: The truth value of an empty array is ambiguous. 
Returning False, but in future this will result in an error. Use `array.size > 0` to check that an array is not empty.\n", + " if diff:\n", + "/cvmfs/sft.cern.ch/lcg/views/LCG_94python3/x86_64-centos7-gcc7-opt/lib/python3.6/site-packages/sklearn/preprocessing/label.py:151: DeprecationWarning: The truth value of an empty array is ambiguous. Returning False, but in future this will result in an error. Use `array.size > 0` to check that an array is not empty.\n", + " if diff:\n", + "/cvmfs/sft.cern.ch/lcg/views/LCG_94python3/x86_64-centos7-gcc7-opt/lib/python3.6/site-packages/sklearn/preprocessing/label.py:151: DeprecationWarning: The truth value of an empty array is ambiguous. Returning False, but in future this will result in an error. Use `array.size > 0` to check that an array is not empty.\n", + " if diff:\n", + "/cvmfs/sft.cern.ch/lcg/views/LCG_94python3/x86_64-centos7-gcc7-opt/lib/python3.6/site-packages/sklearn/preprocessing/label.py:151: DeprecationWarning: The truth value of an empty array is ambiguous. Returning False, but in future this will result in an error. 
Use `array.size > 0` to check that an array is not empty.\n", + " if diff:\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "{'max_depth': 7, 'min_child_weight': 3}\n", + "0.8143962709007511\n" + ] + } + ], + "source": [ + "#second stage with decreased step size and smaller grid scan\n", + "bdt2 = XGBClassifier( learning_rate=LR, n_estimators=estimators,\n", + " #max_depth=6, min_child_weight=1, #default values\n", + " #gamma=0, subsample=0.8,\n", + " #colsample_bytree=0.8, scale_pos_weight=1,\n", + " objective='binary:logistic', #'mutli:softprob', num_class=3, #or more\n", + " seed=123)\n", + "param_test2 = {\n", + " 'max_depth':np.arange( gsearch1.best_params_['max_depth']-1 if gsearch1.best_params_['max_depth']>=4 else gsearch1.best_params_['max_depth'],\n", + " gsearch1.best_params_['max_depth']+1, 1 ),\n", + " 'min_child_weight':np.arange( gsearch1.best_params_['min_child_weight']-1 if gsearch1.best_params_['min_child_weight']>=1.1 else gsearch1.best_params_['min_child_weight'],\n", + " gsearch1.best_params_['min_child_weight']+1, 1 ), #0.5 ),\n", + " # 'gamma':np.arange( gsearch1.best_params_['gamma']-0.1 if gsearch1.best_params_['gamma']>=0.1 else gsearch1.best_params_['gamma'],\n", + " # gsearch1.best_params_['gamma']+0.1, 0.05 ),\n", + " #'colsample_bytree':np.arange( gsearch1.best_params_['colsample_bytree']-0.1 if gsearch1.best_params_['colsample_bytree']>=1.1 else gsearch1.best_params_['colsample_bytree'],\n", + " # gsearch1.best_params_['colsample_bytree']+0.1, 0.05 ),\n", + " # 'subsample':np.arange( gsearch1.best_params_['subsample']-0.1 if gsearch1.best_params_['subsample']>=1.1 else gsearch1.best_params_['subsample'],\n", + " # gsearch1.best_params_['subsample']+0.1, 0.05 ),\n", + " #'scale_pos_weight':np.arange( gsearch1.best_params_['scale_pos_weight']-0.1 if gsearch1.best_params_['scale_pos_weight']>=1.1 else gsearch1.best_params_['scale_pos_weight'],\n", + " # gsearch1.best_params_['scale_pos_weight']+0.1, 0.05 )\n", 
+ "}\n", + "gsearch2 = GridSearchCV(estimator=bdt2,\n", + " param_grid=param_test2,\n", + " scoring='accuracy',\n", + " iid=False,\n", + " cv=5)\n", + "gsearch2.fit(X1,y1)\n", + "print(gsearch2.best_params_)\n", + "print(gsearch2.best_score_)" + ] + }, + { + "cell_type": "code", + "execution_count": 13, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\n", + "Model Report : best iteration 499\n", + "Accuracy : 0.8630785326402843\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "/cvmfs/sft.cern.ch/lcg/views/LCG_94python3/x86_64-centos7-gcc7-opt/lib/python3.6/site-packages/sklearn/preprocessing/label.py:151: DeprecationWarning: The truth value of an empty array is ambiguous. Returning False, but in future this will result in an error. Use `array.size > 0` to check that an array is not empty.\n", + " if diff:\n" + ] + } + ], + "source": [ + "# now repeat with lower learning rate, early stopping monitoring using optimal hyperparameters\n", + "bdt3 = XGBClassifier( learning_rate=0.1, n_estimators=1000, # 0.1 learning rate to compare to default used in xgboost_bdt\n", + " max_depth=gsearch2.best_params_['max_depth'], min_child_weight=gsearch2.best_params_['min_child_weight'],\n", + " #gamma=gsearch2.best_params_['gamma'], subsample=gsearch2.best_params_['subsample'],\n", + " #colsample_bytree=gsearch2.best_params_['colsample_bytree'], scale_pos_weight=gsearch2.best_params_['scale_pos_weight'], \n", + " objective='binary:logistic', #'multi:softprob', num_class=3, #or more\n", + " seed=123 )\n", + "estimators = modelfit(bdt3, 'error', X1, y1, training_columns)" + ] + }, + { + "cell_type": "code", + "execution_count": 14, + "metadata": {}, + "outputs": [ + { + "data": { + "image/png": "\n", + "text/plain": [ + "
" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "image/png": "\n", + "text/plain": [ + "
" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "image/png": "\n", + "text/plain": [ + "
" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "bdt3.fit(training_data[training_columns], training_data['catagory'])\n", + "for df in [mc_df, bkg_df, data_df, training_data]:\n", + " df['XGBtune'] = bdt3.predict_proba(df[training_columns])[:,1]\n", + "\n", + "plt.figure()\n", + "plot_comparision('XGBtune', mc_df, bkg_df)\n", + "\n", + "plt.figure()\n", + "plot_significance(bdt3, training_data, training_columns)\n", + "\n", + "plt.figure()\n", + "plot_roc(bdt3, training_data, training_columns)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# now repeat with lower learning rate, early stopping monitoring using optimal hyperparameters\n", + "bdt4 = XGBClassifier( learning_rate=0.01, n_estimators=10000, # even lower learning rate to compare to default\n", + " max_depth=gsearch2.best_params_['max_depth'], min_child_weight=gsearch2.best_params_['min_child_weight'],\n", + " #gamma=gsearch2.best_params_['gamma'], subsample=gsearch2.best_params_['subsample'],\n", + " #colsample_bytree=gsearch2.best_params_['colsample_bytree'], scale_pos_weight=gsearch2.best_params_['scale_pos_weight'], \n", + " objective='binary:logistic', #'multi:softprob', num_class=3, #or more\n", + " seed=123 )\n", + "estimators = modelfit(bdt4, 'error', X1, y1, training_columns)\n", + "\n", + "bdt4.fit(training_data[training_columns], training_data['catagory'])\n", + "for df in [mc_df, bkg_df, data_df, training_data]:\n", + " df['XGBtune'] = bdt4.predict_proba(df[training_columns])[:,1]\n", + "\n", + "plt.figure()\n", + "plot_comparision('XGBtune', mc_df, bkg_df)\n", + "\n", + "plt.figure()\n", + "plot_significance(bdt4, training_data, training_columns)\n", + "\n", + "plt.figure()\n", + "plot_roc(bdt4, training_data, training_columns)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [] + } + ], + "metadata": { + "kernelspec": { + 
"display_name": "Python 3", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.6.5" + } + }, + "nbformat": 4, + "nbformat_minor": 2 +} From f9838ab95ff42594cc22f9e9c78df4b18d76ae41 Mon Sep 17 00:00:00 2001 From: jvmead Date: Thu, 31 Oct 2019 11:38:48 +0000 Subject: [PATCH 02/54] Rename Hyperparameter tuning to 4_bHyperparameterTuning.ipynb --- .../{Hyperparameter tuning => 4_bHyperparameterTuning.ipynb} | 0 1 file changed, 0 insertions(+), 0 deletions(-) rename advanced-python/{Hyperparameter tuning => 4_bHyperparameterTuning.ipynb} (100%) diff --git a/advanced-python/Hyperparameter tuning b/advanced-python/4_bHyperparameterTuning.ipynb similarity index 100% rename from advanced-python/Hyperparameter tuning rename to advanced-python/4_bHyperparameterTuning.ipynb From eef9eab093cf986b4dfd115e7bf1e137fce7066d Mon Sep 17 00:00:00 2001 From: jvmead Date: Thu, 31 Oct 2019 11:39:06 +0000 Subject: [PATCH 03/54] Rename 4_bHyperparameterTuning.ipynb to 4bHyperparameterTuning.ipynb --- ...4_bHyperparameterTuning.ipynb => 4bHyperparameterTuning.ipynb} | 0 1 file changed, 0 insertions(+), 0 deletions(-) rename advanced-python/{4_bHyperparameterTuning.ipynb => 4bHyperparameterTuning.ipynb} (100%) diff --git a/advanced-python/4_bHyperparameterTuning.ipynb b/advanced-python/4bHyperparameterTuning.ipynb similarity index 100% rename from advanced-python/4_bHyperparameterTuning.ipynb rename to advanced-python/4bHyperparameterTuning.ipynb From 1ea1bfc1d44aaf2ac98a1d0109065b294c3a1bdc Mon Sep 17 00:00:00 2001 From: jvmead Date: Thu, 31 Oct 2019 11:44:10 +0000 Subject: [PATCH 04/54] Update 4bHyperparameterTuning.ipynb --- advanced-python/4bHyperparameterTuning.ipynb | 290 ------------------- 1 file changed, 290 deletions(-) diff --git 
a/advanced-python/4bHyperparameterTuning.ipynb b/advanced-python/4bHyperparameterTuning.ipynb index d4fc1fa0..3bc99271 100644 --- a/advanced-python/4bHyperparameterTuning.ipynb +++ b/advanced-python/4bHyperparameterTuning.ipynb @@ -11,18 +11,6 @@ "cell_type": "code", "execution_count": 1, "metadata": {}, - "outputs": [ - { - "name": "stderr", - "output_type": "stream", - "text": [ - "/cvmfs/sft.cern.ch/lcg/views/LCG_94python3/x86_64-centos7-gcc7-opt/lib/python3.6/site-packages/sklearn/cross_validation.py:41: DeprecationWarning: This module was deprecated in version 0.18 in favor of the model_selection module into which all the refactored classes and functions are moved. Also note that the interface of the new CV iterators are different from that of this module. This module will be removed in 0.20.\n", - " \"This module will be removed in 0.20.\", DeprecationWarning)\n", - "/cvmfs/sft.cern.ch/lcg/views/LCG_94python3/x86_64-centos7-gcc7-opt/lib/python3.6/site-packages/sklearn/grid_search.py:42: DeprecationWarning: This module was deprecated in version 0.18 in favor of the model_selection module into which all the refactored classes and functions are moved. 
This module will be removed in 0.20.\n", - " DeprecationWarning)\n" - ] - } - ], "source": [ "from matplotlib import pyplot as plt\n", "import uproot\n", @@ -48,7 +36,6 @@ "cell_type": "code", "execution_count": 2, "metadata": {}, - "outputs": [], "source": [ "def plot_mass(df):\n", " counts, bins, _ = plt.hist(df['Jpsi_M'], bins=100, range=[2.75, 3.5], histtype='step')\n", @@ -61,7 +48,6 @@ "cell_type": "code", "execution_count": 3, "metadata": {}, - "outputs": [], "source": [ "def plot_comparision(var, mc_df, bkg_df):\n", " _, bins, _ = plt.hist(mc_df[var], bins=100, histtype='step', label='MC', density=1)\n", @@ -75,7 +61,6 @@ "cell_type": "code", "execution_count": 4, "metadata": {}, - "outputs": [], "source": [ "def plot_roc(bdt, training_data, training_columns, label=None):\n", " y_score = bdt.predict_proba(training_data[training_columns])[:,1]\n", @@ -100,7 +85,6 @@ "cell_type": "code", "execution_count": 5, "metadata": {}, - "outputs": [], "source": [ "def plot_significance(bdt, training_data, training_columns, label=None):\n", " y_score = bdt.predict_proba(training_data[training_columns])[:,1]\n", @@ -125,7 +109,6 @@ "cell_type": "code", "execution_count": 6, "metadata": {}, - "outputs": [], "source": [ "#max_entries = 1000\n", "data_df = uproot.open('/eos/user/l/lhcbsk/advanced-python/data/real_data.root')['DecayTree'].pandas.df()#entrystop=max_entries)\n", @@ -154,53 +137,6 @@ "cell_type": "code", "execution_count": 7, "metadata": {}, - "outputs": [ - { - "data": { - "image/png": 
"iVBORw0KGgoAAAANSUhEUgAAAXwAAAEKCAYAAAARnO4WAAAABHNCSVQICAgIfAhkiAAAAAlwSFlzAAALEgAACxIB0t1+/AAAADl0RVh0U29mdHdhcmUAbWF0cGxvdGxpYiB2ZXJzaW9uIDIuMi4yLCBodHRwOi8vbWF0cGxvdGxpYi5vcmcvhp/UCwAAHWVJREFUeJzt3X+QFOW97/H312V1iRhUlpRcfrh4DicoLj9kDRBShivGq0jg3Dp4RCMGkwr+4saYREtFDdeUuWh+cgqNQmIQywN4jEaiGMsbNEbqqAEPUUTiJYq6SCliWCGwKvC9f/SwNsPMTu9sz0zP9OdVteXMdE/vQwufeeZ5nv62uTsiIlL7Dqt0A0REpDwU+CIiKaHAFxFJCQW+iEhKKPBFRFJCgS8ikhIKfBGRlFDgi4ikhAJfRCQlelTqFzc2NnpTU1Olfr2ISFVau3bte+7et5j3Vizwm5qaWLNmTaV+vYhIVTKzN4p9r4Z0RERSQoEvIpISCnwRkZSo2Bi+iCTfxx9/TGtrK+3t7ZVuSuo0NDQwYMAA6uvrYzumAl9E8mptbeWoo46iqakJM6t0c1LD3dm+fTutra0MHjw4tuNqSEdE8mpvb6dPnz4K+zIzM/r06RP7NysFvoh0SmFfGaU47wp8EZGU0Bi+iEQ2ft4qtuzYE9vx+h/dk9XXnt7pPmbGhRdeyL333gvA3r176devH2PGjOGRRx4B4LHHHuPGG29k9+7dHHHEEUycOJEf/ehHsbWzVijwJbefNkPbm8Hj3oPgqpcq2x5JhC079rB53jmxHa/p2kcL7nPkkUeyfv169uzZQ8+ePXniiSfo379/x/b169cze/ZsHn30UYYOHcq+ffu46667YmtjLdGQjuTW9ibMbQt+DgS/SIWcffbZPPpo8OGwdOlSzj///I5tt912G3PmzGHo0KEA1NXVcfnll1eknUlXMPDNrMHMnjezP5vZy2b2v3Psc4SZLTezTWb2nJk1laKxIpJO06dPZ9myZbS3t/Piiy8yZsyYjm3r169n9OjRFWxd9YgypPMhcLq77zKzeuAZM3vM3Z8N7fN14G/u/o9mNh24FTivBO2VSug9COb2Pvi5hnikjIYPH87mzZtZunQpkyZNqnRzqlbBwHd3B3ZlntZnfjxrt6nA3MzjB4AFZmaZ90q1yw73cPiLlMmUKVP47ne/y1NPPcX27ds7Xh82bBhr165lxIgRFWxddYg0hm9mdWa2DngXeMLdn8vapT/wFoC77wXagD45jjPLzNaY2Zpt27Z1r+Uikipf+9rXuOmmm2hubj7o9auvvpof/OAHvPrqqwDs37+fO++8sxJNTLxIq3TcfR8w0syOBh4ys5PdfX1ol1xXCBzSu3f3hcBCgJaWFvX+q1V4iEfDO6nS/+iekVbWdOV4UQ0YMIArr7zykNeHDx/Oz372M84//3x2796NmXHOOfGtJKolXVqW6e47zOwp4CwgHPitwECg1cx6AL2B9+NqpJRJ9lLMfMIBr+GdVCm0Zr4Udu3adchrEyZMYMKECR3PJ0+ezOTJk8vYqupUMPDNrC/wcSbsewJnEEzKhq0Avgr8JzANWKXx+yp0YCmmiNSkKD38fsA9ZlZHMOZ/v7s/YmY3A2vcfQXwS+BeM9tE0LOfXrIWS3zCPXrovFcvIlUvyiqdF4FROV6/KfS4HTg33qZJyalHL5IqutJWRCQlFPgiIimhwBcRSQlVy0ybqEsvRXLJnujvrgjXcdTV1dHc3Iy7U1dXx4IFC/j85z/f5V81c+ZMJk+ezLRp04ptbcn06tUr5/LTuCnw00YTtdIdcf/9iXAdR8+ePVm3bh0Ajz/+ONdddx1/+MMf4mtDBHv37qVHj+qPSw3pSPccuOp2bu+g9ydSQh988AHHHHMMEFyQNXHiRE455RSam5t5+OGHO/ZbsmQJw4cPZ8SIEcyYMeOQ49x4443MnDmT/fv3s3LlSoYOHcro0aP
55je/2XEB19y5c5kxYwbjx49nxowZtLe3c/HFF9Pc3MyoUaN48sknAVi8eDGzZ8/uOPbkyZN56qmngKDnPmfOHEaMGMHYsWN55513AHj99dcZN24czc3N3HDDDSU5V7lU/0eWVJauupUS27NnDyNHjqS9vZ2tW7eyatUqABoaGnjooYf49Kc/zXvvvcfYsWOZMmUKGzZs4JZbbmH16tU0Njby/vsHX/R/zTXX0NbWxq9+9Ss+/PBDLrnkEp5++mkGDx58UJ19gA0bNvDMM8/Qs2dPfvzjHwPw0ksvsXHjRs4888yO+j35/P3vf2fs2LHccsstXHPNNSxatIgbbriBK6+8kssuu4yLLrqI22+/Pcaz1Tn18EUk0Q4M6WzcuJHf/e53XHTRRbg77s7111/P8OHDOeOMM9iyZQvvvPMOq1atYtq0aTQ2NgJw7LHHdhzr+9//Pjt27OCuu+7CzNi4cSMnnHACgwcPBjgk8KdMmULPnkG9n2eeeabj28LQoUM5/vjjCwb+4Ycf3vGNYfTo0WzevBmA1atXd/yuXN9ASkU9/FqU6wpaFTiTGjBu3Djee+89tm3bxsqVK9m2bRtr166lvr6epqYm2tvbcXfMctVzhFNPPZW1a9fy/vvvc+yxx1KoAsyRRx7Z8Tjfvj169GD//v0dz9vb2zse19fXd7Slrq6OvXv3dmzL18ZSUg+/FoVvT6hbFEoN2bhxI/v27aNPnz60tbXxmc98hvr6ep588kneeOMNACZOnMj999/fUTM/PKRz1llnce2113LOOeewc+dOhg4dymuvvdbR816+fHne333aaadx3333AfDqq6/y5ptv8tnPfpampibWrVvH/v37eeutt3j++ecL/jnGjx/PsmXLADqOWQ7q4YtIdNl3P4vjeAUcGMOHoJd9zz33UFdXx1e+8hW+/OUv09zcTEtLS8c9bYcNG8acOXP44he/SF1dHaNGjWLx4sUdxzv33HPZuXMnU6ZMYeXKldxxxx2cddZZHHnkkZx66ql523H55Zdz6aWX0tzcTI8ePVi8eDFHHHEE48ePZ/DgwZx00kmceOKJnHLKKQX/TPPnz+eCCy7g1ltvZerUqQX3j4tVqqhlS0uLr1mzpiK/u+bN7X3w0rnw8+xtpfy9UvVeeeUVTjzxxEo3o6R27dpFr169cHeuuOIKhgwZwlVXXVXpZgG5z7+ZrXX3lmKOpyEdEUm1RYsWMXLkSIYNG0ZbWxuXXHJJpZtUMhrSEZFUu+qqqxLToy819fBFpFO6l1FllOK8q4efBtn3oBWJqKGhge3bt9OnT5+KLCNMK3dn+/btNDQ0xHpcBX4aaA2+FGnAgAG0traybdu2SjcldRoaGhgwYECsx1Tgi0he9fX1HVehSvXTGL6ISEoo8EVEUkKBLyKSEgp8EZGU0KStxCe7zoqqdIokigJf4pMd7rohikiiFAx8MxsILAGOA/YDC919ftY+E4CHgdczLz3o7jfH21TplG5OLiIFROnh7wW+4+4vmNlRwFoze8LdN2Tt90d3nxx/EyUS3ZxcRAooOGnr7lvd/YXM453AK0D/UjdMRETi1aVVOmbWBIwCnsuxeZyZ/dnMHjOzYTG0TUREYhR50tbMegG/Br7l7h9kbX4BON7dd5nZJOA3wJAcx5gFzAIYNEjjzCIi5RSph29m9QRhf5+7P5i93d0/cPddmccrgXoza8yx30J3b3H3lr59+3az6SIi0hUFA9+Cmqi/BF5x95/k2ee4zH6Y2ecyx90eZ0NFRKR7ogzpjAdmAC+Z2brMa9cDgwDc/U5gGnCZme0F9gDTXXdNKD0txRSRLigY+O7+DNDpnQ/cfQGwIK5GSURJX4qZfeMVXXUrUlG60lZKJxzwuupWpOJUPE1EJCUU+CIiKaEhnWoSnqQFTdSKSJco8KtJ0idpRSTRNKQjIpIS6uEnndbai0hMFPhJp2EcEYmJhnRERFJCgS8ikhIKfBGRlFDgi4ikhAJfRCQ
ltEpHykOVM0UqToEv5aHKmSIVpyEdEZGUUA9fumX8vFVs2bEHgP5H92T1tadXuEUiko8CP4kSXk4hO+Q3zzsHgKZrH61ks0SkAAV+EiW8nMKWHXs6Ql5EqofG8EVEUkKBLyKSEgp8EZGUUOCLiKSEAl9EJCW0SicJdHNyESmDgoFvZgOBJcBxwH5gobvPz9rHgPnAJGA3MNPdX4i/uTUq4cswo+p/dM+D1uLrQiyRZInSw98LfMfdXzCzo4C1ZvaEu28I7XM2MCTzMwb4eea/kiLZ4a4LsUSSpeAYvrtvPdBbd/edwCtA/6zdpgJLPPAscLSZ9Yu9tSIiUrQujeGbWRMwCngua1N/4K3Q89bMa1uz3j8LmAUwaJDGqdMkXI5hc0PWxuxSEiqdLFISkQPfzHoBvwa+5e4fZG/O8RY/5AX3hcBCgJaWlkO2p0rC6+XELVyOofV7jQwIl0juPeiTOQyVThYpmUiBb2b1BGF/n7s/mGOXVmBg6PkA4O3uN6+GVdFEbbh3DsFkbHd84cN/Uy0ekQqIskrHgF8Cr7j7T/LstgKYbWbLCCZr29x9a559pcoUWywtvGqnux8SItJ9UXr444EZwEtmti7z2vXAIAB3vxNYSbAkcxPBssyL42+qVBstyRQpTq5v1XH8eyoY+O7+DLnH6MP7OHBFt1sjIiKHfKuOa4mzrrQtF11NKyIVpsAvlyqapBWR2qTiaSIiKaEefimlbK29iCSbAr+UNIwjIgmiwO+uXJOxKg0gIgmkwO+u7F68SgOISEIp8OPWe9AnoV/F4/bhCz90laxIbVDgx61GhnOKLacgIsmlwC+GVt+UzFb60i/zDSl4vKnCLRKpHQr8Ymj1TcmMa5/f8c2in+ZDRGKlwJeyy773bfY2ESkNBb6UXbFV/7InklWNU6RrFPiSWK1+8J2xlnsjA+b9FdAN0kWKocCXxDqv56KDaoJvbrig43H2sJB6/CKFKfAlsQ4J8Ln5t6nHL1KYAl861MrFVhrrF8lNgS8dauViq/CfI2rPv1S3lBNJEgW+CKW7pZxIkijwpSaEJ3GjDkfVyhCWSFQKfKkJ+YZfOlvNUytDWCJRKfClpmk1j8gnFPhh2UXRwpUvVTCt8rJLTxdRmbSYoR+RWqHADwsXRcsu3KWCaZUXDvgiC6tp5Y0kVTnmlAoGvpndDUwG3nX3k3NsnwA8DLyeeelBd785zkaKlFv2NwF9UEiplWNOKUoPfzGwAFjSyT5/dPfJsbRIJAHCAa9xf6kVBQPf3Z82s6bSNyVhwuPFB55LKhUq56zev0RV6Qv84hrDH2dmfwbeBr7r7i/HdNzKqZFbFUr3dfYPUr1/6YpKX+AXR+C/ABzv7rvMbBLwG2BIrh3NbBYwC2DQIPWYpRuyv4Flb9MHtsghuh347v5B6PFKM7vDzBrd/b0c+y4EFgK0tLR4d3+3pFhnga5bI4rk1O3AN7PjgHfc3c3sc8BhwPZut0ykCmg1j1STKMsylwITgEYzawW+B9QDuPudwDTgMjPbC+wBpru7eu+SClrNI9Ukyiqd8wtsX0CwbLM66QpaEUkJXWmrK2glJhrekaRT4IvERMM7knTpC/zwEA5oGEeqQqUv2JHakL7A1xBOh1whIpWV7368lb5gR2pD+gJfOugGIMlTzP14RaJS4IuUQGd32hKpFAW+SAnEcact3axF4qbAl9oTw52xkkDfCKpXvrmYSlPgS+3JvjWliqxJmYXnYsbPW5X3m1q5v8WlI/B1NW16qciaVFhnvfty9/zTEfhaiikiRai16x/SEfgiFaayC9Upe+lyeHgmrFr+nyrwRcogHAbZY7rFBIU+QCoj33mulmsmFPgiZZYv/CH6xJ3q9kgxFPgpk71cLNUSsHxTPXMpp9oNfK3MyUnlFELCAV8jK3ZqbZJR4lW7ga+VOZJCxRZZ6+xCoaReRJQm2aU6ilW7gS+SElEv3snu/WcfI9eFQtnb0jBfUMywZ67aSXEKf8j
arcUfR4EvAgeP5x94XiVX4XbW487+MIgynNeV4yW5t1/s8FYxw55JPg9hCnwRODTca2RMP+4gKrTC6MD2OIaB8h0japBHHd5K030hFPgiUpTOKoJGrevf2QdDvmPEcTOY7N+bloUMCnwRqZh8RcagtD3ttK5WU+CLSEl1tsIkHOrFDj9FnVfQ/QVqKfB1c3KJUwIuyqo2+QK11BOanZWtyLdfWtVO4GvdvcSpBi/KKrUkBGoS2pBkBQPfzO4GJgPvuvvJObYbMB+YBOwGZrr7C3E3VIqncgrdpN5+xWk4Jh5ReviLgQXAkjzbzwaGZH7GAD/P/FcSIq0TVLFRb7/i1HOPR8HAd/enzaypk12mAkvc3YFnzexoM+vn7ltjamN+qpcjIhJZHGP4/YG3Qs9bM6+VPvA1bi8iEtlhMRzDcrzmOXc0m2Vma8xszbZt22L41SIiElUcgd8KDAw9HwC8nWtHd1/o7i3u3tK3b98YfrWIiEQVx5DOCmC2mS0jmKxtK8v4veSVptogZVfFRdZEoizLXApMABrNrBX4HlAP4O53AisJlmRuIliWeXGpGgtoojYCrcopoRotsibpEGWVzvkFtjtwRWwtKkQTtSIiRYljDF9ERKqAAl9EJCUU+CIiKZH84mmqgilJpjo7UkWSH/iapJUkU50dqSIa0hERSQkFvohISiR/SEekWmg8XxIumYGvq2mlGmk8XxIumYGvidou012tRKSQZAa+dJnq54hIIZq0FRFJCfXwq5iGcUSkK5IT+Jqo7TIN44hIV1Qu8N95+dAbSWiiVkSkZCoX+Ps+grl7Cu8nUo10ZyxJoOQM6YjUEt0ZSxJIq3RERFJCPXyRcsge4ulsPw39SIko8EXKIWqIa+hHSkiBX0XC6+5Ba+9FpGsU+FVE6+5FpDsU+CJJohLLUkIKfJEkCQf8T5sV/hIrBb5IUqm+vsQsUuCb2VnAfKAO+IW7z8vaPhP4IbAl89ICd/9FjO1MLRVIE0BX7kosCga+mdUBtwNfAlqBP5nZCnffkLXrcnefXYI2ppomagXQlbsSiyhX2n4O2OTur7n7R8AyYGppmyUiInGLEvj9gbdCz1szr2X7FzN70cweMLOBuQ5kZrPMbI2Zrdm224torogAnwzxzO0dTO6KRBBlDN9yvJad1r8Flrr7h2Z2KXAPcPohb3JfCCwEaPlvdUp8kWJpQleKECXwW4Fwj30A8HZ4B3ffHnq6CLi1+01LL03USpd0VqdHk7sSEiXw/wQMMbPBBKtwpgMXhHcws37uvjXzdArwSqytrHG5SiZoolYi6yzQ1fuXkIKB7+57zWw28DjBssy73f1lM7sZWOPuK4BvmtkUYC/wPjCzhG2uOVqJIyLlEGkdvruvBFZmvXZT6PF1wHXxNq22adhGykLDPRKiK20rRL16KQsN90iIAr9MVNpYEkeF2lJHgV8m6tFL4mhpZ+oo8EVEvf2UUOCLSOe9/Z82Q9ubwWN9GFQ1Bb6IHCxXZc65bcFj1eivagp8ETlYZyGucf+qpsAvIa21F5EkUeCXkFbmiEiSKPBjpl69pIau4q06CvxuUuEzSS1dxVt1FPhFyO7FK+BFpBoo8IugsXmRAjob7sneT0M/ZaPAF5H4RQ3x8Lr+bPowiJ0CX0QqR/MAZaXAj0irb0Sk2inw89DqG5EKU0G32Cnw89DErEiF5SvjEC7mlk0fDJ1S4ItI8mX39g8Uc8um4m6dUuCHaJxeJKGiBnd4v+wVQPoAUODrIiqRGpUd7vmWgKbogyB1ga/JWJGUyhfqKVr+mbrA12SsiBwk6lXBB/at4m8DqQh8jc2LSF5dCfAqnxSOFPhmdhYwH6gDfuHu87K2HwEsAUYD24Hz3H1zvE3tGo3Ni0jsOpsUDkvoh0HBwDezOuB24EtAK/AnM1vh7htCu30d+Ju7/6OZTQduBc4rRYOj0tCNiJRUZ4Ge0G8CUXr4nwM2uftrAGa2DJgKhAN/KjA38/gBYIG
Zmbt7jG3tVK7JWBGRikjoN4Eogd8feCv0vBUYk28fd99rZm1AH+C97jYwO8jzNlLDNiKSRFG/CXQmpg+GKIFvOV7L7rlH2QczmwXMyjzdZWZ/ifD7I3kDsOviOlqHRmL40EoBnafCdI6i0XnKaT18uyNmP1vsUaIEfiswMPR8APB2nn1azawH0Bt4P/tA7r4QWFhcU8vPzNa4e0ul25F0Ok+F6RxFo/NUmJmtKfa9h0XY50/AEDMbbGaHA9OBFVn7rAC+mnk8DVhVzvF7EREprGAPPzMmPxt4nGBZ5t3u/rKZ3QyscfcVwC+Be81sE0HPfnopGy0iIl0XaR2+u68EVma9dlPocTtwbrxNS4SqGX6qMJ2nwnSOotF5Kqzoc2QaeRERSYcoY/giIlIDFPgEpSPM7C9mtsnMrs2x/dtmtsHMXjSz35vZ8ZVoZyUVOkeh/aaZmZtZKldaRDlPZvavmb9PL5vZv5e7jUkQ4d/cIDN70sz+K/PvblIl2llJZna3mb1rZuvzbDcz+7fMOXzRzE4peFB3T/UPwUT0X4ETgMOBPwMnZe3z34FPZR5fBiyvdLuTdo4y+x0FPA08C7RUut1JPE/AEOC/gGMyzz9T6XYn9DwtBC7LPD4J2FzpdlfgPJ0GnAKsz7N9EvAYwXVQY4HnCh1TPfxQ6Qh3/wg4UDqig7s/6e67M0+fJbgWIU0KnqOM7wO3Ae3lbFyCRDlP3wBud/e/Abj7u2VuYxJEOU8OfDrzuDeHXvtT89z9aXJczxQyFVjigWeBo82sX2fHVODnLh3Rv5P9v07wqZomBc+RmY0CBrr7I+VsWMJE+bv0T8A/mdlqM3s2U4k2baKcp7nAhWbWSrBC8H+Vp2lVpavZlY56+AVEKgsBYGYXAi3AF0vaouTp9ByZ2WHAT4GZ5WpQQkX5u9SDYFhnAsE3xT+a2cnuvqPEbUuSKOfpfGCxu//YzMYRXOdzsrvvL33zqkbk7DpAPfxopSMwszOAOcAUd/+wTG1LikLn6CjgZOApM9tMMJ64IoUTt1HLkDzs7h+7++vAXwg+ANIkynn6OnA/gLv/J9BAUGdHPhEpu8IU+BFKR2SGK+4iCPs0jrl2eo7cvc3dG929yd2bCOY5prh70TU/qlSUMiS/IVgEgJk1EgzxvFbWVlZelPP0JjARwMxOJAj8bWVtZfKtAC7KrNYZC7S5+9bO3pD6IR2PVjrih0Av4D/MDOBNd59SsUaXWcRzlHoRz9PjwJlmtgHYB1zt7tsr1+ryi3ievgMsMrOrCIYpZnpmaUpamNlSgqG/xsxcxveAegB3v5NgbmMSsAnYDVxc8JgpO4ciIqmlIR0RkZRQ4IuIpIQCX0QkJRT4IiIpocAXEUkJBb7UBDMbaGavm9mxmefHZJ4fb2ZDzOwRM/urma3NVGE8LbPfTDPbZmbrMtUrHzCzT2W2/bOZnVRke0amscKjJJsCX2qCu78F/ByYl3lpHkHFxXeAR4GF7v4P7j6aoC7LCaG3L3f3ke4+DPgIOC/z+j8TVGosxkiCNdIiiaF1+FIzzKweWAvcTVCVchQwAzjN3b+a5z0zCUo5zzazHsCvgV8B7wKPAG2Zn3/JvOV2oC/BhS7fcPeNZnYuwUUx+zL7nkFwMUxPYAvwf9x9eex/YJEuSv2VtlI73P1jM7sa+B1wprt/ZGbDgBcKvPU8M/sC0A94Ffitu+8zsxXAI+7+AICZ/R641N3/n5mNAe4ATgduAv6Hu28xs6Mzv/cmMh8kpfnTinSdhnSk1pwNbCUo5nYIM3vIzNab2YOhl5e7+0jgOOAl4Ooc7+sFfJ6gvMY6gtpKB2qPrwYWm9k3CEoFiCSSAl9qhpmNBL5EUK3zqszNIF4muGsQAO7+PwnKOB+b/f5MrZbfEtxpKNthwI7MWP+BnxMz77sUuIGgcuFaM+sT6x9MJCYKfKkJFlS1+znwLXd/k6Dg3Y+AfwfGm1m42N2nOjnUFwhuvwewk6D0M+7+AfB
6Zrz+wP1ER2Qe/4O7P+fuNxFUdBwYfq9IUmjSVmqCmc0CJrr7eZnndcDzwLcJVur8BBiaebwTuM3d/29m0vaHBJOrhxHUGJ/p7u+a2XhgEfAhMA3YT/Ch0o+gauEyd785Mzw0hOCGFL8HvgUcQ1ANsh5N2kpCKPBFRFJCQzoiIimhwBcRSQkFvohISijwRURSQoEvIpISCnwRkZRQ4IuIpIQCX0QkJf4/4R4UtsdHB4gAAAAASUVORK5CYII=\n", - "text/plain": [ - "
" - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "data": { - "image/png": "\n", - "text/plain": [ - "
" - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "data": { - "image/png": "\n", - "text/plain": [ - "
" - ] - }, - "metadata": {}, - "output_type": "display_data" - } - ], - "source": [ - "xgboost_bdt = XGBClassifier() # LR=0.1 as default compared to bdt3 later\n", - "xgboost_bdt.fit(training_data[training_columns], training_data['catagory'])\n", - "for df in [mc_df, bkg_df, data_df, training_data]:\n", - " df['XGBtest'] = xgboost_bdt.predict_proba(df[training_columns])[:,1]\n", - "\n", - "plt.figure()\n", - "plot_comparision('XGBtest', mc_df, bkg_df)\n", - "\n", - "plt.figure()\n", - "plot_significance(xgboost_bdt, training_data, training_columns)\n", - "\n", - "plt.figure()\n", - "plot_roc(xgboost_bdt, training_data, training_columns)" - ] }, { "cell_type": "markdown", @@ -236,33 +172,6 @@ "cell_type": "code", "execution_count": 8, "metadata": {}, - "outputs": [ - { - "name": "stderr", - "output_type": "stream", - "text": [ - "/cvmfs/sft.cern.ch/lcg/views/LCG_94python3/x86_64-centos7-gcc7-opt/lib/python3.6/site-packages/sklearn/preprocessing/label.py:151: DeprecationWarning: The truth value of an empty array is ambiguous. Returning False, but in future this will result in an error. Use `array.size > 0` to check that an array is not empty.\n", - " if diff:\n", - "/cvmfs/sft.cern.ch/lcg/views/LCG_94python3/x86_64-centos7-gcc7-opt/lib/python3.6/site-packages/sklearn/preprocessing/label.py:151: DeprecationWarning: The truth value of an empty array is ambiguous. Returning False, but in future this will result in an error. Use `array.size > 0` to check that an array is not empty.\n", - " if diff:\n", - "/cvmfs/sft.cern.ch/lcg/views/LCG_94python3/x86_64-centos7-gcc7-opt/lib/python3.6/site-packages/sklearn/preprocessing/label.py:151: DeprecationWarning: The truth value of an empty array is ambiguous. Returning False, but in future this will result in an error. 
Use `array.size > 0` to check that an array is not empty.\n", - " if diff:\n", - "/cvmfs/sft.cern.ch/lcg/views/LCG_94python3/x86_64-centos7-gcc7-opt/lib/python3.6/site-packages/sklearn/preprocessing/label.py:151: DeprecationWarning: The truth value of an empty array is ambiguous. Returning False, but in future this will result in an error. Use `array.size > 0` to check that an array is not empty.\n", - " if diff:\n", - "/cvmfs/sft.cern.ch/lcg/views/LCG_94python3/x86_64-centos7-gcc7-opt/lib/python3.6/site-packages/sklearn/preprocessing/label.py:151: DeprecationWarning: The truth value of an empty array is ambiguous. Returning False, but in future this will result in an error. Use `array.size > 0` to check that an array is not empty.\n", - " if diff:\n" - ] - }, - { - "name": "stdout", - "output_type": "stream", - "text": [ - "accuracy: [0.70586404 0.71157239 0.7035807 0.7068715 0.72565912]\n", - "-logloss: [-0.54623834 -0.54819564 -0.54917768 -0.54841566 -0.52840935]\n", - "roc_auc: [0.79658373 0.79572423 0.79352088 0.79733309 0.82042929]\n" - ] - } - ], "source": [ "X1, y1 = training_data[training_columns], training_data['catagory']\n", "splits = 5\n", @@ -283,7 +192,6 @@ "cell_type": "code", "execution_count": 9, "metadata": {}, - "outputs": [], "source": [ "def modelfit(alg, metric, train, test, predictors, cv_folds=5, early_stop=10): #50):\n", " xgb_param = alg.get_xgb_params()\n", @@ -309,25 +217,6 @@ "cell_type": "code", "execution_count": 10, "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "\n", - "Model Report : best iteration 733\n", - "Accuracy : 0.8002681966886428\n" - ] - }, - { - "name": "stderr", - "output_type": "stream", - "text": [ - "/cvmfs/sft.cern.ch/lcg/views/LCG_94python3/x86_64-centos7-gcc7-opt/lib/python3.6/site-packages/sklearn/preprocessing/label.py:151: DeprecationWarning: The truth value of an empty array is ambiguous. Returning False, but in future this will result in an error. 
Use `array.size > 0` to check that an array is not empty.\n", - " if diff:\n" - ] - } - ], "source": [ "X1, y1 = training_data[training_columns], training_data['catagory']\n", "LR = 0.2 # choosing a high learning rate to establish earlystopping limit to use during grid scan\n", @@ -346,62 +235,6 @@ "metadata": { "scrolled": true }, - "outputs": [ - { - "name": "stderr", - "output_type": "stream", - "text": [ - "/cvmfs/sft.cern.ch/lcg/views/LCG_94python3/x86_64-centos7-gcc7-opt/lib/python3.6/site-packages/sklearn/preprocessing/label.py:151: DeprecationWarning: The truth value of an empty array is ambiguous. Returning False, but in future this will result in an error. Use `array.size > 0` to check that an array is not empty.\n", - " if diff:\n", - "/cvmfs/sft.cern.ch/lcg/views/LCG_94python3/x86_64-centos7-gcc7-opt/lib/python3.6/site-packages/sklearn/preprocessing/label.py:151: DeprecationWarning: The truth value of an empty array is ambiguous. Returning False, but in future this will result in an error. Use `array.size > 0` to check that an array is not empty.\n", - " if diff:\n", - "/cvmfs/sft.cern.ch/lcg/views/LCG_94python3/x86_64-centos7-gcc7-opt/lib/python3.6/site-packages/sklearn/preprocessing/label.py:151: DeprecationWarning: The truth value of an empty array is ambiguous. Returning False, but in future this will result in an error. Use `array.size > 0` to check that an array is not empty.\n", - " if diff:\n", - "/cvmfs/sft.cern.ch/lcg/views/LCG_94python3/x86_64-centos7-gcc7-opt/lib/python3.6/site-packages/sklearn/preprocessing/label.py:151: DeprecationWarning: The truth value of an empty array is ambiguous. Returning False, but in future this will result in an error. Use `array.size > 0` to check that an array is not empty.\n", - " if diff:\n", - "/cvmfs/sft.cern.ch/lcg/views/LCG_94python3/x86_64-centos7-gcc7-opt/lib/python3.6/site-packages/sklearn/preprocessing/label.py:151: DeprecationWarning: The truth value of an empty array is ambiguous. 
Returning False, but in future this will result in an error. Use `array.size > 0` to check that an array is not empty.\n", - " if diff:\n", - "/cvmfs/sft.cern.ch/lcg/views/LCG_94python3/x86_64-centos7-gcc7-opt/lib/python3.6/site-packages/sklearn/preprocessing/label.py:151: DeprecationWarning: The truth value of an empty array is ambiguous. Returning False, but in future this will result in an error. Use `array.size > 0` to check that an array is not empty.\n", - " if diff:\n", - "/cvmfs/sft.cern.ch/lcg/views/LCG_94python3/x86_64-centos7-gcc7-opt/lib/python3.6/site-packages/sklearn/preprocessing/label.py:151: DeprecationWarning: The truth value of an empty array is ambiguous. Returning False, but in future this will result in an error. Use `array.size > 0` to check that an array is not empty.\n", - " if diff:\n", - "/cvmfs/sft.cern.ch/lcg/views/LCG_94python3/x86_64-centos7-gcc7-opt/lib/python3.6/site-packages/sklearn/preprocessing/label.py:151: DeprecationWarning: The truth value of an empty array is ambiguous. Returning False, but in future this will result in an error. Use `array.size > 0` to check that an array is not empty.\n", - " if diff:\n", - "/cvmfs/sft.cern.ch/lcg/views/LCG_94python3/x86_64-centos7-gcc7-opt/lib/python3.6/site-packages/sklearn/preprocessing/label.py:151: DeprecationWarning: The truth value of an empty array is ambiguous. Returning False, but in future this will result in an error. Use `array.size > 0` to check that an array is not empty.\n", - " if diff:\n", - "/cvmfs/sft.cern.ch/lcg/views/LCG_94python3/x86_64-centos7-gcc7-opt/lib/python3.6/site-packages/sklearn/preprocessing/label.py:151: DeprecationWarning: The truth value of an empty array is ambiguous. Returning False, but in future this will result in an error. 
Use `array.size > 0` to check that an array is not empty.\n", - " if diff:\n", - "/cvmfs/sft.cern.ch/lcg/views/LCG_94python3/x86_64-centos7-gcc7-opt/lib/python3.6/site-packages/sklearn/preprocessing/label.py:151: DeprecationWarning: The truth value of an empty array is ambiguous. Returning False, but in future this will result in an error. Use `array.size > 0` to check that an array is not empty.\n", - " if diff:\n", - "/cvmfs/sft.cern.ch/lcg/views/LCG_94python3/x86_64-centos7-gcc7-opt/lib/python3.6/site-packages/sklearn/preprocessing/label.py:151: DeprecationWarning: The truth value of an empty array is ambiguous. Returning False, but in future this will result in an error. Use `array.size > 0` to check that an array is not empty.\n", - " if diff:\n", - "/cvmfs/sft.cern.ch/lcg/views/LCG_94python3/x86_64-centos7-gcc7-opt/lib/python3.6/site-packages/sklearn/preprocessing/label.py:151: DeprecationWarning: The truth value of an empty array is ambiguous. Returning False, but in future this will result in an error. Use `array.size > 0` to check that an array is not empty.\n", - " if diff:\n", - "/cvmfs/sft.cern.ch/lcg/views/LCG_94python3/x86_64-centos7-gcc7-opt/lib/python3.6/site-packages/sklearn/preprocessing/label.py:151: DeprecationWarning: The truth value of an empty array is ambiguous. Returning False, but in future this will result in an error. Use `array.size > 0` to check that an array is not empty.\n", - " if diff:\n", - "/cvmfs/sft.cern.ch/lcg/views/LCG_94python3/x86_64-centos7-gcc7-opt/lib/python3.6/site-packages/sklearn/preprocessing/label.py:151: DeprecationWarning: The truth value of an empty array is ambiguous. Returning False, but in future this will result in an error. Use `array.size > 0` to check that an array is not empty.\n", - " if diff:\n", - "/cvmfs/sft.cern.ch/lcg/views/LCG_94python3/x86_64-centos7-gcc7-opt/lib/python3.6/site-packages/sklearn/preprocessing/label.py:151: DeprecationWarning: The truth value of an empty array is ambiguous. 
Returning False, but in future this will result in an error. Use `array.size > 0` to check that an array is not empty.\n", - " if diff:\n", - "/cvmfs/sft.cern.ch/lcg/views/LCG_94python3/x86_64-centos7-gcc7-opt/lib/python3.6/site-packages/sklearn/preprocessing/label.py:151: DeprecationWarning: The truth value of an empty array is ambiguous. Returning False, but in future this will result in an error. Use `array.size > 0` to check that an array is not empty.\n", - " if diff:\n", - "/cvmfs/sft.cern.ch/lcg/views/LCG_94python3/x86_64-centos7-gcc7-opt/lib/python3.6/site-packages/sklearn/preprocessing/label.py:151: DeprecationWarning: The truth value of an empty array is ambiguous. Returning False, but in future this will result in an error. Use `array.size > 0` to check that an array is not empty.\n", - " if diff:\n", - "/cvmfs/sft.cern.ch/lcg/views/LCG_94python3/x86_64-centos7-gcc7-opt/lib/python3.6/site-packages/sklearn/preprocessing/label.py:151: DeprecationWarning: The truth value of an empty array is ambiguous. Returning False, but in future this will result in an error. Use `array.size > 0` to check that an array is not empty.\n", - " if diff:\n", - "/cvmfs/sft.cern.ch/lcg/views/LCG_94python3/x86_64-centos7-gcc7-opt/lib/python3.6/site-packages/sklearn/preprocessing/label.py:151: DeprecationWarning: The truth value of an empty array is ambiguous. Returning False, but in future this will result in an error. 
Use `array.size > 0` to check that an array is not empty.\n", - " if diff:\n" - ] - }, - { - "name": "stdout", - "output_type": "stream", - "text": [ - "{'max_depth': 7, 'min_child_weight': 3}\n", - "0.8143962709007511\n" - ] - } - ], "source": [ "bdt1 = XGBClassifier( learning_rate=LR, n_estimators=estimators,\n", " #max_depth=6, min_child_weight=1, #default values\n", @@ -432,62 +265,6 @@ "cell_type": "code", "execution_count": 12, "metadata": {}, - "outputs": [ - { - "name": "stderr", - "output_type": "stream", - "text": [ - "/cvmfs/sft.cern.ch/lcg/views/LCG_94python3/x86_64-centos7-gcc7-opt/lib/python3.6/site-packages/sklearn/preprocessing/label.py:151: DeprecationWarning: The truth value of an empty array is ambiguous. Returning False, but in future this will result in an error. Use `array.size > 0` to check that an array is not empty.\n", - " if diff:\n", - "/cvmfs/sft.cern.ch/lcg/views/LCG_94python3/x86_64-centos7-gcc7-opt/lib/python3.6/site-packages/sklearn/preprocessing/label.py:151: DeprecationWarning: The truth value of an empty array is ambiguous. Returning False, but in future this will result in an error. Use `array.size > 0` to check that an array is not empty.\n", - " if diff:\n", - "/cvmfs/sft.cern.ch/lcg/views/LCG_94python3/x86_64-centos7-gcc7-opt/lib/python3.6/site-packages/sklearn/preprocessing/label.py:151: DeprecationWarning: The truth value of an empty array is ambiguous. Returning False, but in future this will result in an error. Use `array.size > 0` to check that an array is not empty.\n", - " if diff:\n", - "/cvmfs/sft.cern.ch/lcg/views/LCG_94python3/x86_64-centos7-gcc7-opt/lib/python3.6/site-packages/sklearn/preprocessing/label.py:151: DeprecationWarning: The truth value of an empty array is ambiguous. Returning False, but in future this will result in an error. 
Use `array.size > 0` to check that an array is not empty.\n", - " if diff:\n", - "/cvmfs/sft.cern.ch/lcg/views/LCG_94python3/x86_64-centos7-gcc7-opt/lib/python3.6/site-packages/sklearn/preprocessing/label.py:151: DeprecationWarning: The truth value of an empty array is ambiguous. Returning False, but in future this will result in an error. Use `array.size > 0` to check that an array is not empty.\n", - " if diff:\n", - "/cvmfs/sft.cern.ch/lcg/views/LCG_94python3/x86_64-centos7-gcc7-opt/lib/python3.6/site-packages/sklearn/preprocessing/label.py:151: DeprecationWarning: The truth value of an empty array is ambiguous. Returning False, but in future this will result in an error. Use `array.size > 0` to check that an array is not empty.\n", - " if diff:\n", - "/cvmfs/sft.cern.ch/lcg/views/LCG_94python3/x86_64-centos7-gcc7-opt/lib/python3.6/site-packages/sklearn/preprocessing/label.py:151: DeprecationWarning: The truth value of an empty array is ambiguous. Returning False, but in future this will result in an error. Use `array.size > 0` to check that an array is not empty.\n", - " if diff:\n", - "/cvmfs/sft.cern.ch/lcg/views/LCG_94python3/x86_64-centos7-gcc7-opt/lib/python3.6/site-packages/sklearn/preprocessing/label.py:151: DeprecationWarning: The truth value of an empty array is ambiguous. Returning False, but in future this will result in an error. Use `array.size > 0` to check that an array is not empty.\n", - " if diff:\n", - "/cvmfs/sft.cern.ch/lcg/views/LCG_94python3/x86_64-centos7-gcc7-opt/lib/python3.6/site-packages/sklearn/preprocessing/label.py:151: DeprecationWarning: The truth value of an empty array is ambiguous. Returning False, but in future this will result in an error. Use `array.size > 0` to check that an array is not empty.\n", - " if diff:\n", - "/cvmfs/sft.cern.ch/lcg/views/LCG_94python3/x86_64-centos7-gcc7-opt/lib/python3.6/site-packages/sklearn/preprocessing/label.py:151: DeprecationWarning: The truth value of an empty array is ambiguous. 
Returning False, but in future this will result in an error. Use `array.size > 0` to check that an array is not empty.\n", - " if diff:\n", - "/cvmfs/sft.cern.ch/lcg/views/LCG_94python3/x86_64-centos7-gcc7-opt/lib/python3.6/site-packages/sklearn/preprocessing/label.py:151: DeprecationWarning: The truth value of an empty array is ambiguous. Returning False, but in future this will result in an error. Use `array.size > 0` to check that an array is not empty.\n", - " if diff:\n", - "/cvmfs/sft.cern.ch/lcg/views/LCG_94python3/x86_64-centos7-gcc7-opt/lib/python3.6/site-packages/sklearn/preprocessing/label.py:151: DeprecationWarning: The truth value of an empty array is ambiguous. Returning False, but in future this will result in an error. Use `array.size > 0` to check that an array is not empty.\n", - " if diff:\n", - "/cvmfs/sft.cern.ch/lcg/views/LCG_94python3/x86_64-centos7-gcc7-opt/lib/python3.6/site-packages/sklearn/preprocessing/label.py:151: DeprecationWarning: The truth value of an empty array is ambiguous. Returning False, but in future this will result in an error. Use `array.size > 0` to check that an array is not empty.\n", - " if diff:\n", - "/cvmfs/sft.cern.ch/lcg/views/LCG_94python3/x86_64-centos7-gcc7-opt/lib/python3.6/site-packages/sklearn/preprocessing/label.py:151: DeprecationWarning: The truth value of an empty array is ambiguous. Returning False, but in future this will result in an error. Use `array.size > 0` to check that an array is not empty.\n", - " if diff:\n", - "/cvmfs/sft.cern.ch/lcg/views/LCG_94python3/x86_64-centos7-gcc7-opt/lib/python3.6/site-packages/sklearn/preprocessing/label.py:151: DeprecationWarning: The truth value of an empty array is ambiguous. Returning False, but in future this will result in an error. 
Use `array.size > 0` to check that an array is not empty.\n", - " if diff:\n", - "/cvmfs/sft.cern.ch/lcg/views/LCG_94python3/x86_64-centos7-gcc7-opt/lib/python3.6/site-packages/sklearn/preprocessing/label.py:151: DeprecationWarning: The truth value of an empty array is ambiguous. Returning False, but in future this will result in an error. Use `array.size > 0` to check that an array is not empty.\n", - " if diff:\n", - "/cvmfs/sft.cern.ch/lcg/views/LCG_94python3/x86_64-centos7-gcc7-opt/lib/python3.6/site-packages/sklearn/preprocessing/label.py:151: DeprecationWarning: The truth value of an empty array is ambiguous. Returning False, but in future this will result in an error. Use `array.size > 0` to check that an array is not empty.\n", - " if diff:\n", - "/cvmfs/sft.cern.ch/lcg/views/LCG_94python3/x86_64-centos7-gcc7-opt/lib/python3.6/site-packages/sklearn/preprocessing/label.py:151: DeprecationWarning: The truth value of an empty array is ambiguous. Returning False, but in future this will result in an error. Use `array.size > 0` to check that an array is not empty.\n", - " if diff:\n", - "/cvmfs/sft.cern.ch/lcg/views/LCG_94python3/x86_64-centos7-gcc7-opt/lib/python3.6/site-packages/sklearn/preprocessing/label.py:151: DeprecationWarning: The truth value of an empty array is ambiguous. Returning False, but in future this will result in an error. Use `array.size > 0` to check that an array is not empty.\n", - " if diff:\n", - "/cvmfs/sft.cern.ch/lcg/views/LCG_94python3/x86_64-centos7-gcc7-opt/lib/python3.6/site-packages/sklearn/preprocessing/label.py:151: DeprecationWarning: The truth value of an empty array is ambiguous. Returning False, but in future this will result in an error. 
Use `array.size > 0` to check that an array is not empty.\n", - " if diff:\n" - ] - }, - { - "name": "stdout", - "output_type": "stream", - "text": [ - "{'max_depth': 7, 'min_child_weight': 3}\n", - "0.8143962709007511\n" - ] - } - ], "source": [ "#second stage with decreased step size and smaller grid scan\n", "bdt2 = XGBClassifier( learning_rate=LR, n_estimators=estimators,\n", @@ -524,25 +301,6 @@ "cell_type": "code", "execution_count": 13, "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "\n", - "Model Report : best iteration 499\n", - "Accuracy : 0.8630785326402843\n" - ] - }, - { - "name": "stderr", - "output_type": "stream", - "text": [ - "/cvmfs/sft.cern.ch/lcg/views/LCG_94python3/x86_64-centos7-gcc7-opt/lib/python3.6/site-packages/sklearn/preprocessing/label.py:151: DeprecationWarning: The truth value of an empty array is ambiguous. Returning False, but in future this will result in an error. Use `array.size > 0` to check that an array is not empty.\n", - " if diff:\n" - ] - } - ], "source": [ "# now repeat with lower learning rate, early stopping monitoring using optimal hyperparameters\n", "bdt3 = XGBClassifier( learning_rate=0.1, n_estimators=1000, # 0.1 learning rate to compare to default used in xgboost_bdt\n", @@ -558,58 +316,11 @@ "cell_type": "code", "execution_count": 14, "metadata": {}, - "outputs": [ - { - "data": { - "image/png": "\n", - "text/plain": [ - "
" - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "data": { - "image/png": "\n", - "text/plain": [ - "
" - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "data": { - "image/png": "\n", - "text/plain": [ - "
" - ] - }, - "metadata": {}, - "output_type": "display_data" - } - ], - "source": [ - "bdt3.fit(training_data[training_columns], training_data['catagory'])\n", - "for df in [mc_df, bkg_df, data_df, training_data]:\n", - " df['XGBtune'] = bdt3.predict_proba(df[training_columns])[:,1]\n", - "\n", - "plt.figure()\n", - "plot_comparision('XGBtune', mc_df, bkg_df)\n", - "\n", - "plt.figure()\n", - "plot_significance(bdt3, training_data, training_columns)\n", - "\n", - "plt.figure()\n", - "plot_roc(bdt3, training_data, training_columns)" - ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, - "outputs": [], "source": [ "# now repeat with lower learning rate, early stopping monitoring using optimal hyperparameters\n", "bdt4 = XGBClassifier( learning_rate=0.01, n_estimators=10000, # even lower learning rate to compare to default\n", @@ -638,7 +349,6 @@ "cell_type": "code", "execution_count": null, "metadata": {}, - "outputs": [], "source": [] } ], From 2a3e18a54070b566a8952b7479ed176ccfb22ad1 Mon Sep 17 00:00:00 2001 From: jvmead Date: Thu, 31 Oct 2019 11:49:14 +0000 Subject: [PATCH 05/54] Update 4bHyperparameterTuning.ipynb --- advanced-python/4bHyperparameterTuning.ipynb | 47 +++++++++++++++++++- 1 file changed, 46 insertions(+), 1 deletion(-) diff --git a/advanced-python/4bHyperparameterTuning.ipynb b/advanced-python/4bHyperparameterTuning.ipynb index 3bc99271..bc2abe8f 100644 --- a/advanced-python/4bHyperparameterTuning.ipynb +++ b/advanced-python/4bHyperparameterTuning.ipynb @@ -1,4 +1,4 @@ -{ + "cells": [ { "cell_type": "markdown", @@ -11,6 +11,7 @@ "cell_type": "code", "execution_count": 1, "metadata": {}, + "outputs": [], "source": [ "from matplotlib import pyplot as plt\n", "import uproot\n", @@ -36,6 +37,7 @@ "cell_type": "code", "execution_count": 2, "metadata": {}, + "outputs": [], "source": [ "def plot_mass(df):\n", " counts, bins, _ = plt.hist(df['Jpsi_M'], bins=100, range=[2.75, 3.5], histtype='step')\n", @@ -48,6 +50,7 @@ "cell_type": 
"code", "execution_count": 3, "metadata": {}, + "outputs": [], "source": [ "def plot_comparision(var, mc_df, bkg_df):\n", " _, bins, _ = plt.hist(mc_df[var], bins=100, histtype='step', label='MC', density=1)\n", @@ -61,6 +64,7 @@ "cell_type": "code", "execution_count": 4, "metadata": {}, + "outputs": [], "source": [ "def plot_roc(bdt, training_data, training_columns, label=None):\n", " y_score = bdt.predict_proba(training_data[training_columns])[:,1]\n", @@ -85,6 +89,7 @@ "cell_type": "code", "execution_count": 5, "metadata": {}, + "outputs": [], "source": [ "def plot_significance(bdt, training_data, training_columns, label=None):\n", " y_score = bdt.predict_proba(training_data[training_columns])[:,1]\n", @@ -109,6 +114,7 @@ "cell_type": "code", "execution_count": 6, "metadata": {}, + "outputs": [], "source": [ "#max_entries = 1000\n", "data_df = uproot.open('/eos/user/l/lhcbsk/advanced-python/data/real_data.root')['DecayTree'].pandas.df()#entrystop=max_entries)\n", @@ -137,6 +143,22 @@ "cell_type": "code", "execution_count": 7, "metadata": {}, + "outputs": [], + "source": [ + "xgboost_bdt = XGBClassifier() # LR=0.1 as default compared to bdt3 later\n", + "xgboost_bdt.fit(training_data[training_columns], training_data['catagory'])\n", + "for df in [mc_df, bkg_df, data_df, training_data]:\n", + " df['XGBtest'] = xgboost_bdt.predict_proba(df[training_columns])[:,1]\n", + "\n", + "plt.figure()\n", + "plot_comparision('XGBtest', mc_df, bkg_df)\n", + "\n", + "plt.figure()\n", + "plot_significance(xgboost_bdt, training_data, training_columns)\n", + "\n", + "plt.figure()\n", + "plot_roc(xgboost_bdt, training_data, training_columns)" + ] }, { "cell_type": "markdown", @@ -172,6 +194,7 @@ "cell_type": "code", "execution_count": 8, "metadata": {}, + "outputs": [], "source": [ "X1, y1 = training_data[training_columns], training_data['catagory']\n", "splits = 5\n", @@ -192,6 +215,7 @@ "cell_type": "code", "execution_count": 9, "metadata": {}, + "outputs": [], "source": [ "def 
modelfit(alg, metric, train, test, predictors, cv_folds=5, early_stop=10): #50):\n", " xgb_param = alg.get_xgb_params()\n", @@ -217,6 +241,7 @@ "cell_type": "code", "execution_count": 10, "metadata": {}, + "outputs": [], "source": [ "X1, y1 = training_data[training_columns], training_data['catagory']\n", "LR = 0.2 # choosing a high learning rate to establish earlystopping limit to use during grid scan\n", @@ -235,6 +260,7 @@ "metadata": { "scrolled": true }, + "outputs": [], "source": [ "bdt1 = XGBClassifier( learning_rate=LR, n_estimators=estimators,\n", " #max_depth=6, min_child_weight=1, #default values\n", @@ -265,6 +291,7 @@ "cell_type": "code", "execution_count": 12, "metadata": {}, + "outputs": [], "source": [ "#second stage with decreased step size and smaller grid scan\n", "bdt2 = XGBClassifier( learning_rate=LR, n_estimators=estimators,\n", @@ -301,6 +328,7 @@ "cell_type": "code", "execution_count": 13, "metadata": {}, + "outputs": [], "source": [ "# now repeat with lower learning rate, early stopping monitoring using optimal hyperparameters\n", "bdt3 = XGBClassifier( learning_rate=0.1, n_estimators=1000, # 0.1 learning rate to compare to default used in xgboost_bdt\n", @@ -316,11 +344,27 @@ "cell_type": "code", "execution_count": 14, "metadata": {}, + "outputs": [], + "source": [ + "bdt3.fit(training_data[training_columns], training_data['catagory'])\n", + "for df in [mc_df, bkg_df, data_df, training_data]:\n", + " df['XGBtune'] = bdt3.predict_proba(df[training_columns])[:,1]\n", + "\n", + "plt.figure()\n", + "plot_comparision('XGBtune', mc_df, bkg_df)\n", + "\n", + "plt.figure()\n", + "plot_significance(bdt3, training_data, training_columns)\n", + "\n", + "plt.figure()\n", + "plot_roc(bdt3, training_data, training_columns)" + ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, + "outputs": [], "source": [ "# now repeat with lower learning rate, early stopping monitoring using optimal hyperparameters\n", "bdt4 = XGBClassifier( 
learning_rate=0.01, n_estimators=10000, # even lower learning rate to compare to default\n", @@ -349,6 +393,7 @@ "cell_type": "code", "execution_count": null, "metadata": {}, + "outputs": [], "source": [] } ], From 110fe1ae373af8d946c6957d379febd833c4f5ed Mon Sep 17 00:00:00 2001 From: jvmead Date: Thu, 31 Oct 2019 11:49:52 +0000 Subject: [PATCH 06/54] Update 4bHyperparameterTuning.ipynb --- advanced-python/4bHyperparameterTuning.ipynb | 261 ++++++++++++++++++- 1 file changed, 253 insertions(+), 8 deletions(-) diff --git a/advanced-python/4bHyperparameterTuning.ipynb b/advanced-python/4bHyperparameterTuning.ipynb index bc2abe8f..5419ebb0 100644 --- a/advanced-python/4bHyperparameterTuning.ipynb +++ b/advanced-python/4bHyperparameterTuning.ipynb @@ -11,7 +11,18 @@ "cell_type": "code", "execution_count": 1, "metadata": {}, - "outputs": [], + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "/cvmfs/sft.cern.ch/lcg/views/LCG_94python3/x86_64-centos7-gcc7-opt/lib/python3.6/site-packages/sklearn/cross_validation.py:41: DeprecationWarning: This module was deprecated in version 0.18 in favor of the model_selection module into which all the refactored classes and functions are moved. Also note that the interface of the new CV iterators are different from that of this module. This module will be removed in 0.20.\n", + " \"This module will be removed in 0.20.\", DeprecationWarning)\n", + "/cvmfs/sft.cern.ch/lcg/views/LCG_94python3/x86_64-centos7-gcc7-opt/lib/python3.6/site-packages/sklearn/grid_search.py:42: DeprecationWarning: This module was deprecated in version 0.18 in favor of the model_selection module into which all the refactored classes and functions are moved. 
This module will be removed in 0.20.\n", + " DeprecationWarning)\n" + ] + } + ], "source": [ "from matplotlib import pyplot as plt\n", "import uproot\n", @@ -143,7 +154,38 @@ "cell_type": "code", "execution_count": 7, "metadata": {}, - "outputs": [], + "outputs": [ + { + "data": { + "image/png": "\n", + "text/plain": [ + "
" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "image/png": "\n", + "text/plain": [ + "
" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "image/png": "\n", + "text/plain": [ + "
" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], "source": [ "xgboost_bdt = XGBClassifier() # LR=0.1 as default compared to bdt3 later\n", "xgboost_bdt.fit(training_data[training_columns], training_data['catagory'])\n", @@ -194,7 +236,33 @@ "cell_type": "code", "execution_count": 8, "metadata": {}, - "outputs": [], + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "/cvmfs/sft.cern.ch/lcg/views/LCG_94python3/x86_64-centos7-gcc7-opt/lib/python3.6/site-packages/sklearn/preprocessing/label.py:151: DeprecationWarning: The truth value of an empty array is ambiguous. Returning False, but in future this will result in an error. Use `array.size > 0` to check that an array is not empty.\n", + " if diff:\n", + "/cvmfs/sft.cern.ch/lcg/views/LCG_94python3/x86_64-centos7-gcc7-opt/lib/python3.6/site-packages/sklearn/preprocessing/label.py:151: DeprecationWarning: The truth value of an empty array is ambiguous. Returning False, but in future this will result in an error. Use `array.size > 0` to check that an array is not empty.\n", + " if diff:\n", + "/cvmfs/sft.cern.ch/lcg/views/LCG_94python3/x86_64-centos7-gcc7-opt/lib/python3.6/site-packages/sklearn/preprocessing/label.py:151: DeprecationWarning: The truth value of an empty array is ambiguous. Returning False, but in future this will result in an error. Use `array.size > 0` to check that an array is not empty.\n", + " if diff:\n", + "/cvmfs/sft.cern.ch/lcg/views/LCG_94python3/x86_64-centos7-gcc7-opt/lib/python3.6/site-packages/sklearn/preprocessing/label.py:151: DeprecationWarning: The truth value of an empty array is ambiguous. Returning False, but in future this will result in an error. Use `array.size > 0` to check that an array is not empty.\n", + " if diff:\n", + "/cvmfs/sft.cern.ch/lcg/views/LCG_94python3/x86_64-centos7-gcc7-opt/lib/python3.6/site-packages/sklearn/preprocessing/label.py:151: DeprecationWarning: The truth value of an empty array is ambiguous. 
Returning False, but in future this will result in an error. Use `array.size > 0` to check that an array is not empty.\n", + " if diff:\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "accuracy: [0.70586404 0.71157239 0.7035807 0.7068715 0.72565912]\n", + "-logloss: [-0.54623834 -0.54819564 -0.54917768 -0.54841566 -0.52840935]\n", + "roc_auc: [0.79658373 0.79572423 0.79352088 0.79733309 0.82042929]\n" + ] + } + ], "source": [ "X1, y1 = training_data[training_columns], training_data['catagory']\n", "splits = 5\n", @@ -241,7 +309,25 @@ "cell_type": "code", "execution_count": 10, "metadata": {}, - "outputs": [], + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\n", + "Model Report : best iteration 733\n", + "Accuracy : 0.8002681966886428\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "/cvmfs/sft.cern.ch/lcg/views/LCG_94python3/x86_64-centos7-gcc7-opt/lib/python3.6/site-packages/sklearn/preprocessing/label.py:151: DeprecationWarning: The truth value of an empty array is ambiguous. Returning False, but in future this will result in an error. Use `array.size > 0` to check that an array is not empty.\n", + " if diff:\n" + ] + } + ], "source": [ "X1, y1 = training_data[training_columns], training_data['catagory']\n", "LR = 0.2 # choosing a high learning rate to establish earlystopping limit to use during grid scan\n", @@ -260,7 +346,62 @@ "metadata": { "scrolled": true }, - "outputs": [], + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "/cvmfs/sft.cern.ch/lcg/views/LCG_94python3/x86_64-centos7-gcc7-opt/lib/python3.6/site-packages/sklearn/preprocessing/label.py:151: DeprecationWarning: The truth value of an empty array is ambiguous. Returning False, but in future this will result in an error. 
Use `array.size > 0` to check that an array is not empty.\n", + " if diff:\n", + "/cvmfs/sft.cern.ch/lcg/views/LCG_94python3/x86_64-centos7-gcc7-opt/lib/python3.6/site-packages/sklearn/preprocessing/label.py:151: DeprecationWarning: The truth value of an empty array is ambiguous. Returning False, but in future this will result in an error. Use `array.size > 0` to check that an array is not empty.\n", + " if diff:\n", + "/cvmfs/sft.cern.ch/lcg/views/LCG_94python3/x86_64-centos7-gcc7-opt/lib/python3.6/site-packages/sklearn/preprocessing/label.py:151: DeprecationWarning: The truth value of an empty array is ambiguous. Returning False, but in future this will result in an error. Use `array.size > 0` to check that an array is not empty.\n", + " if diff:\n", + "/cvmfs/sft.cern.ch/lcg/views/LCG_94python3/x86_64-centos7-gcc7-opt/lib/python3.6/site-packages/sklearn/preprocessing/label.py:151: DeprecationWarning: The truth value of an empty array is ambiguous. Returning False, but in future this will result in an error. Use `array.size > 0` to check that an array is not empty.\n", + " if diff:\n", + "/cvmfs/sft.cern.ch/lcg/views/LCG_94python3/x86_64-centos7-gcc7-opt/lib/python3.6/site-packages/sklearn/preprocessing/label.py:151: DeprecationWarning: The truth value of an empty array is ambiguous. Returning False, but in future this will result in an error. Use `array.size > 0` to check that an array is not empty.\n", + " if diff:\n", + "/cvmfs/sft.cern.ch/lcg/views/LCG_94python3/x86_64-centos7-gcc7-opt/lib/python3.6/site-packages/sklearn/preprocessing/label.py:151: DeprecationWarning: The truth value of an empty array is ambiguous. Returning False, but in future this will result in an error. Use `array.size > 0` to check that an array is not empty.\n", + " if diff:\n", + "/cvmfs/sft.cern.ch/lcg/views/LCG_94python3/x86_64-centos7-gcc7-opt/lib/python3.6/site-packages/sklearn/preprocessing/label.py:151: DeprecationWarning: The truth value of an empty array is ambiguous. 
Returning False, but in future this will result in an error. Use `array.size > 0` to check that an array is not empty.\n", + " if diff:\n", + "/cvmfs/sft.cern.ch/lcg/views/LCG_94python3/x86_64-centos7-gcc7-opt/lib/python3.6/site-packages/sklearn/preprocessing/label.py:151: DeprecationWarning: The truth value of an empty array is ambiguous. Returning False, but in future this will result in an error. Use `array.size > 0` to check that an array is not empty.\n", + " if diff:\n", + "/cvmfs/sft.cern.ch/lcg/views/LCG_94python3/x86_64-centos7-gcc7-opt/lib/python3.6/site-packages/sklearn/preprocessing/label.py:151: DeprecationWarning: The truth value of an empty array is ambiguous. Returning False, but in future this will result in an error. Use `array.size > 0` to check that an array is not empty.\n", + " if diff:\n", + "/cvmfs/sft.cern.ch/lcg/views/LCG_94python3/x86_64-centos7-gcc7-opt/lib/python3.6/site-packages/sklearn/preprocessing/label.py:151: DeprecationWarning: The truth value of an empty array is ambiguous. Returning False, but in future this will result in an error. Use `array.size > 0` to check that an array is not empty.\n", + " if diff:\n", + "/cvmfs/sft.cern.ch/lcg/views/LCG_94python3/x86_64-centos7-gcc7-opt/lib/python3.6/site-packages/sklearn/preprocessing/label.py:151: DeprecationWarning: The truth value of an empty array is ambiguous. Returning False, but in future this will result in an error. Use `array.size > 0` to check that an array is not empty.\n", + " if diff:\n", + "/cvmfs/sft.cern.ch/lcg/views/LCG_94python3/x86_64-centos7-gcc7-opt/lib/python3.6/site-packages/sklearn/preprocessing/label.py:151: DeprecationWarning: The truth value of an empty array is ambiguous. Returning False, but in future this will result in an error. 
Use `array.size > 0` to check that an array is not empty.\n", + " if diff:\n", + "/cvmfs/sft.cern.ch/lcg/views/LCG_94python3/x86_64-centos7-gcc7-opt/lib/python3.6/site-packages/sklearn/preprocessing/label.py:151: DeprecationWarning: The truth value of an empty array is ambiguous. Returning False, but in future this will result in an error. Use `array.size > 0` to check that an array is not empty.\n", + " if diff:\n", + "/cvmfs/sft.cern.ch/lcg/views/LCG_94python3/x86_64-centos7-gcc7-opt/lib/python3.6/site-packages/sklearn/preprocessing/label.py:151: DeprecationWarning: The truth value of an empty array is ambiguous. Returning False, but in future this will result in an error. Use `array.size > 0` to check that an array is not empty.\n", + " if diff:\n", + "/cvmfs/sft.cern.ch/lcg/views/LCG_94python3/x86_64-centos7-gcc7-opt/lib/python3.6/site-packages/sklearn/preprocessing/label.py:151: DeprecationWarning: The truth value of an empty array is ambiguous. Returning False, but in future this will result in an error. Use `array.size > 0` to check that an array is not empty.\n", + " if diff:\n", + "/cvmfs/sft.cern.ch/lcg/views/LCG_94python3/x86_64-centos7-gcc7-opt/lib/python3.6/site-packages/sklearn/preprocessing/label.py:151: DeprecationWarning: The truth value of an empty array is ambiguous. Returning False, but in future this will result in an error. Use `array.size > 0` to check that an array is not empty.\n", + " if diff:\n", + "/cvmfs/sft.cern.ch/lcg/views/LCG_94python3/x86_64-centos7-gcc7-opt/lib/python3.6/site-packages/sklearn/preprocessing/label.py:151: DeprecationWarning: The truth value of an empty array is ambiguous. Returning False, but in future this will result in an error. Use `array.size > 0` to check that an array is not empty.\n", + " if diff:\n", + "/cvmfs/sft.cern.ch/lcg/views/LCG_94python3/x86_64-centos7-gcc7-opt/lib/python3.6/site-packages/sklearn/preprocessing/label.py:151: DeprecationWarning: The truth value of an empty array is ambiguous. 
Returning False, but in future this will result in an error. Use `array.size > 0` to check that an array is not empty.\n", + " if diff:\n", + "/cvmfs/sft.cern.ch/lcg/views/LCG_94python3/x86_64-centos7-gcc7-opt/lib/python3.6/site-packages/sklearn/preprocessing/label.py:151: DeprecationWarning: The truth value of an empty array is ambiguous. Returning False, but in future this will result in an error. Use `array.size > 0` to check that an array is not empty.\n", + " if diff:\n", + "/cvmfs/sft.cern.ch/lcg/views/LCG_94python3/x86_64-centos7-gcc7-opt/lib/python3.6/site-packages/sklearn/preprocessing/label.py:151: DeprecationWarning: The truth value of an empty array is ambiguous. Returning False, but in future this will result in an error. Use `array.size > 0` to check that an array is not empty.\n", + " if diff:\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "{'max_depth': 7, 'min_child_weight': 3}\n", + "0.8143962709007511\n" + ] + } + ], "source": [ "bdt1 = XGBClassifier( learning_rate=LR, n_estimators=estimators,\n", " #max_depth=6, min_child_weight=1, #default values\n", @@ -291,7 +432,62 @@ "cell_type": "code", "execution_count": 12, "metadata": {}, - "outputs": [], + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "/cvmfs/sft.cern.ch/lcg/views/LCG_94python3/x86_64-centos7-gcc7-opt/lib/python3.6/site-packages/sklearn/preprocessing/label.py:151: DeprecationWarning: The truth value of an empty array is ambiguous. Returning False, but in future this will result in an error. Use `array.size > 0` to check that an array is not empty.\n", + " if diff:\n", + "/cvmfs/sft.cern.ch/lcg/views/LCG_94python3/x86_64-centos7-gcc7-opt/lib/python3.6/site-packages/sklearn/preprocessing/label.py:151: DeprecationWarning: The truth value of an empty array is ambiguous. Returning False, but in future this will result in an error. 
Use `array.size > 0` to check that an array is not empty.\n", + " if diff:\n", + "/cvmfs/sft.cern.ch/lcg/views/LCG_94python3/x86_64-centos7-gcc7-opt/lib/python3.6/site-packages/sklearn/preprocessing/label.py:151: DeprecationWarning: The truth value of an empty array is ambiguous. Returning False, but in future this will result in an error. Use `array.size > 0` to check that an array is not empty.\n", + " if diff:\n", + "/cvmfs/sft.cern.ch/lcg/views/LCG_94python3/x86_64-centos7-gcc7-opt/lib/python3.6/site-packages/sklearn/preprocessing/label.py:151: DeprecationWarning: The truth value of an empty array is ambiguous. Returning False, but in future this will result in an error. Use `array.size > 0` to check that an array is not empty.\n", + " if diff:\n", + "/cvmfs/sft.cern.ch/lcg/views/LCG_94python3/x86_64-centos7-gcc7-opt/lib/python3.6/site-packages/sklearn/preprocessing/label.py:151: DeprecationWarning: The truth value of an empty array is ambiguous. Returning False, but in future this will result in an error. Use `array.size > 0` to check that an array is not empty.\n", + " if diff:\n", + "/cvmfs/sft.cern.ch/lcg/views/LCG_94python3/x86_64-centos7-gcc7-opt/lib/python3.6/site-packages/sklearn/preprocessing/label.py:151: DeprecationWarning: The truth value of an empty array is ambiguous. Returning False, but in future this will result in an error. Use `array.size > 0` to check that an array is not empty.\n", + " if diff:\n", + "/cvmfs/sft.cern.ch/lcg/views/LCG_94python3/x86_64-centos7-gcc7-opt/lib/python3.6/site-packages/sklearn/preprocessing/label.py:151: DeprecationWarning: The truth value of an empty array is ambiguous. Returning False, but in future this will result in an error. Use `array.size > 0` to check that an array is not empty.\n", + " if diff:\n", + "/cvmfs/sft.cern.ch/lcg/views/LCG_94python3/x86_64-centos7-gcc7-opt/lib/python3.6/site-packages/sklearn/preprocessing/label.py:151: DeprecationWarning: The truth value of an empty array is ambiguous. 
Returning False, but in future this will result in an error. Use `array.size > 0` to check that an array is not empty.\n", + " if diff:\n", + "/cvmfs/sft.cern.ch/lcg/views/LCG_94python3/x86_64-centos7-gcc7-opt/lib/python3.6/site-packages/sklearn/preprocessing/label.py:151: DeprecationWarning: The truth value of an empty array is ambiguous. Returning False, but in future this will result in an error. Use `array.size > 0` to check that an array is not empty.\n", + " if diff:\n", + "/cvmfs/sft.cern.ch/lcg/views/LCG_94python3/x86_64-centos7-gcc7-opt/lib/python3.6/site-packages/sklearn/preprocessing/label.py:151: DeprecationWarning: The truth value of an empty array is ambiguous. Returning False, but in future this will result in an error. Use `array.size > 0` to check that an array is not empty.\n", + " if diff:\n", + "/cvmfs/sft.cern.ch/lcg/views/LCG_94python3/x86_64-centos7-gcc7-opt/lib/python3.6/site-packages/sklearn/preprocessing/label.py:151: DeprecationWarning: The truth value of an empty array is ambiguous. Returning False, but in future this will result in an error. Use `array.size > 0` to check that an array is not empty.\n", + " if diff:\n", + "/cvmfs/sft.cern.ch/lcg/views/LCG_94python3/x86_64-centos7-gcc7-opt/lib/python3.6/site-packages/sklearn/preprocessing/label.py:151: DeprecationWarning: The truth value of an empty array is ambiguous. Returning False, but in future this will result in an error. Use `array.size > 0` to check that an array is not empty.\n", + " if diff:\n", + "/cvmfs/sft.cern.ch/lcg/views/LCG_94python3/x86_64-centos7-gcc7-opt/lib/python3.6/site-packages/sklearn/preprocessing/label.py:151: DeprecationWarning: The truth value of an empty array is ambiguous. Returning False, but in future this will result in an error. 
Use `array.size > 0` to check that an array is not empty.\n", + " if diff:\n", + "/cvmfs/sft.cern.ch/lcg/views/LCG_94python3/x86_64-centos7-gcc7-opt/lib/python3.6/site-packages/sklearn/preprocessing/label.py:151: DeprecationWarning: The truth value of an empty array is ambiguous. Returning False, but in future this will result in an error. Use `array.size > 0` to check that an array is not empty.\n", + " if diff:\n", + "/cvmfs/sft.cern.ch/lcg/views/LCG_94python3/x86_64-centos7-gcc7-opt/lib/python3.6/site-packages/sklearn/preprocessing/label.py:151: DeprecationWarning: The truth value of an empty array is ambiguous. Returning False, but in future this will result in an error. Use `array.size > 0` to check that an array is not empty.\n", + " if diff:\n", + "/cvmfs/sft.cern.ch/lcg/views/LCG_94python3/x86_64-centos7-gcc7-opt/lib/python3.6/site-packages/sklearn/preprocessing/label.py:151: DeprecationWarning: The truth value of an empty array is ambiguous. Returning False, but in future this will result in an error. Use `array.size > 0` to check that an array is not empty.\n", + " if diff:\n", + "/cvmfs/sft.cern.ch/lcg/views/LCG_94python3/x86_64-centos7-gcc7-opt/lib/python3.6/site-packages/sklearn/preprocessing/label.py:151: DeprecationWarning: The truth value of an empty array is ambiguous. Returning False, but in future this will result in an error. Use `array.size > 0` to check that an array is not empty.\n", + " if diff:\n", + "/cvmfs/sft.cern.ch/lcg/views/LCG_94python3/x86_64-centos7-gcc7-opt/lib/python3.6/site-packages/sklearn/preprocessing/label.py:151: DeprecationWarning: The truth value of an empty array is ambiguous. Returning False, but in future this will result in an error. Use `array.size > 0` to check that an array is not empty.\n", + " if diff:\n", + "/cvmfs/sft.cern.ch/lcg/views/LCG_94python3/x86_64-centos7-gcc7-opt/lib/python3.6/site-packages/sklearn/preprocessing/label.py:151: DeprecationWarning: The truth value of an empty array is ambiguous. 
Returning False, but in future this will result in an error. Use `array.size > 0` to check that an array is not empty.\n", + " if diff:\n", + "/cvmfs/sft.cern.ch/lcg/views/LCG_94python3/x86_64-centos7-gcc7-opt/lib/python3.6/site-packages/sklearn/preprocessing/label.py:151: DeprecationWarning: The truth value of an empty array is ambiguous. Returning False, but in future this will result in an error. Use `array.size > 0` to check that an array is not empty.\n", + " if diff:\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "{'max_depth': 7, 'min_child_weight': 3}\n", + "0.8143962709007511\n" + ] + } + ], "source": [ "#second stage with decreased step size and smaller grid scan\n", "bdt2 = XGBClassifier( learning_rate=LR, n_estimators=estimators,\n", @@ -328,7 +524,25 @@ "cell_type": "code", "execution_count": 13, "metadata": {}, - "outputs": [], + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\n", + "Model Report : best iteration 499\n", + "Accuracy : 0.8630785326402843\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "/cvmfs/sft.cern.ch/lcg/views/LCG_94python3/x86_64-centos7-gcc7-opt/lib/python3.6/site-packages/sklearn/preprocessing/label.py:151: DeprecationWarning: The truth value of an empty array is ambiguous. Returning False, but in future this will result in an error. Use `array.size > 0` to check that an array is not empty.\n", + " if diff:\n" + ] + } + ], "source": [ "# now repeat with lower learning rate, early stopping monitoring using optimal hyperparameters\n", "bdt3 = XGBClassifier( learning_rate=0.1, n_estimators=1000, # 0.1 learning rate to compare to default used in xgboost_bdt\n", @@ -344,7 +558,38 @@ "cell_type": "code", "execution_count": 14, "metadata": {}, - "outputs": [], + "outputs": [ + { + "data": { + "image/png": "\n", + "text/plain": [ + "
" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "image/png": "\n", + "text/plain": [ + "
" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "image/png": "\n", + "text/plain": [ + "
" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], "source": [ "bdt3.fit(training_data[training_columns], training_data['catagory'])\n", "for df in [mc_df, bkg_df, data_df, training_data]:\n", From 8439ee0c1a1cd9f4f9595076bb888c0f71120e22 Mon Sep 17 00:00:00 2001 From: jvmead Date: Thu, 31 Oct 2019 11:51:35 +0000 Subject: [PATCH 07/54] Update 4bHyperparameterTuning.ipynb --- advanced-python/4bHyperparameterTuning.ipynb | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/advanced-python/4bHyperparameterTuning.ipynb b/advanced-python/4bHyperparameterTuning.ipynb index 5419ebb0..d4fc1fa0 100644 --- a/advanced-python/4bHyperparameterTuning.ipynb +++ b/advanced-python/4bHyperparameterTuning.ipynb @@ -1,4 +1,4 @@ - +{ "cells": [ { "cell_type": "markdown", From 32d0d63252879283a5631001a4e2a7e9ce6723d5 Mon Sep 17 00:00:00 2001 From: jvmead Date: Thu, 31 Oct 2019 11:53:02 +0000 Subject: [PATCH 08/54] Update 4bHyperparameterTuning.ipynb --- advanced-python/4bHyperparameterTuning.ipynb | 261 +------------------ 1 file changed, 8 insertions(+), 253 deletions(-) diff --git a/advanced-python/4bHyperparameterTuning.ipynb b/advanced-python/4bHyperparameterTuning.ipynb index d4fc1fa0..c08cb821 100644 --- a/advanced-python/4bHyperparameterTuning.ipynb +++ b/advanced-python/4bHyperparameterTuning.ipynb @@ -11,18 +11,7 @@ "cell_type": "code", "execution_count": 1, "metadata": {}, - "outputs": [ - { - "name": "stderr", - "output_type": "stream", - "text": [ - "/cvmfs/sft.cern.ch/lcg/views/LCG_94python3/x86_64-centos7-gcc7-opt/lib/python3.6/site-packages/sklearn/cross_validation.py:41: DeprecationWarning: This module was deprecated in version 0.18 in favor of the model_selection module into which all the refactored classes and functions are moved. Also note that the interface of the new CV iterators are different from that of this module. 
This module will be removed in 0.20.\n", - " \"This module will be removed in 0.20.\", DeprecationWarning)\n", - "/cvmfs/sft.cern.ch/lcg/views/LCG_94python3/x86_64-centos7-gcc7-opt/lib/python3.6/site-packages/sklearn/grid_search.py:42: DeprecationWarning: This module was deprecated in version 0.18 in favor of the model_selection module into which all the refactored classes and functions are moved. This module will be removed in 0.20.\n", - " DeprecationWarning)\n" - ] - } - ], + "outputs": [], "source": [ "from matplotlib import pyplot as plt\n", "import uproot\n", @@ -154,38 +143,7 @@ "cell_type": "code", "execution_count": 7, "metadata": {}, - "outputs": [ - { - "data": { - "image/png": "\n", - "text/plain": [ - "
" - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "data": { - "image/png": "\n", - "text/plain": [ - "
" - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "data": { - "image/png": "\n", - "text/plain": [ - "
" - ] - }, - "metadata": {}, - "output_type": "display_data" - } - ], + "outputs": [], "source": [ "xgboost_bdt = XGBClassifier() # LR=0.1 as default compared to bdt3 later\n", "xgboost_bdt.fit(training_data[training_columns], training_data['catagory'])\n", @@ -236,33 +194,7 @@ "cell_type": "code", "execution_count": 8, "metadata": {}, - "outputs": [ - { - "name": "stderr", - "output_type": "stream", - "text": [ - "/cvmfs/sft.cern.ch/lcg/views/LCG_94python3/x86_64-centos7-gcc7-opt/lib/python3.6/site-packages/sklearn/preprocessing/label.py:151: DeprecationWarning: The truth value of an empty array is ambiguous. Returning False, but in future this will result in an error. Use `array.size > 0` to check that an array is not empty.\n", - " if diff:\n", - "/cvmfs/sft.cern.ch/lcg/views/LCG_94python3/x86_64-centos7-gcc7-opt/lib/python3.6/site-packages/sklearn/preprocessing/label.py:151: DeprecationWarning: The truth value of an empty array is ambiguous. Returning False, but in future this will result in an error. Use `array.size > 0` to check that an array is not empty.\n", - " if diff:\n", - "/cvmfs/sft.cern.ch/lcg/views/LCG_94python3/x86_64-centos7-gcc7-opt/lib/python3.6/site-packages/sklearn/preprocessing/label.py:151: DeprecationWarning: The truth value of an empty array is ambiguous. Returning False, but in future this will result in an error. Use `array.size > 0` to check that an array is not empty.\n", - " if diff:\n", - "/cvmfs/sft.cern.ch/lcg/views/LCG_94python3/x86_64-centos7-gcc7-opt/lib/python3.6/site-packages/sklearn/preprocessing/label.py:151: DeprecationWarning: The truth value of an empty array is ambiguous. Returning False, but in future this will result in an error. Use `array.size > 0` to check that an array is not empty.\n", - " if diff:\n", - "/cvmfs/sft.cern.ch/lcg/views/LCG_94python3/x86_64-centos7-gcc7-opt/lib/python3.6/site-packages/sklearn/preprocessing/label.py:151: DeprecationWarning: The truth value of an empty array is ambiguous. 
Returning False, but in future this will result in an error. Use `array.size > 0` to check that an array is not empty.\n", - " if diff:\n" - ] - }, - { - "name": "stdout", - "output_type": "stream", - "text": [ - "accuracy: [0.70586404 0.71157239 0.7035807 0.7068715 0.72565912]\n", - "-logloss: [-0.54623834 -0.54819564 -0.54917768 -0.54841566 -0.52840935]\n", - "roc_auc: [0.79658373 0.79572423 0.79352088 0.79733309 0.82042929]\n" - ] - } - ], + "outputs": [], "source": [ "X1, y1 = training_data[training_columns], training_data['catagory']\n", "splits = 5\n", @@ -309,25 +241,7 @@ "cell_type": "code", "execution_count": 10, "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "\n", - "Model Report : best iteration 733\n", - "Accuracy : 0.8002681966886428\n" - ] - }, - { - "name": "stderr", - "output_type": "stream", - "text": [ - "/cvmfs/sft.cern.ch/lcg/views/LCG_94python3/x86_64-centos7-gcc7-opt/lib/python3.6/site-packages/sklearn/preprocessing/label.py:151: DeprecationWarning: The truth value of an empty array is ambiguous. Returning False, but in future this will result in an error. Use `array.size > 0` to check that an array is not empty.\n", - " if diff:\n" - ] - } - ], + "outputs": [], "source": [ "X1, y1 = training_data[training_columns], training_data['catagory']\n", "LR = 0.2 # choosing a high learning rate to establish earlystopping limit to use during grid scan\n", @@ -346,62 +260,7 @@ "metadata": { "scrolled": true }, - "outputs": [ - { - "name": "stderr", - "output_type": "stream", - "text": [ - "/cvmfs/sft.cern.ch/lcg/views/LCG_94python3/x86_64-centos7-gcc7-opt/lib/python3.6/site-packages/sklearn/preprocessing/label.py:151: DeprecationWarning: The truth value of an empty array is ambiguous. Returning False, but in future this will result in an error. 
Use `array.size > 0` to check that an array is not empty.\n", - " if diff:\n", - "/cvmfs/sft.cern.ch/lcg/views/LCG_94python3/x86_64-centos7-gcc7-opt/lib/python3.6/site-packages/sklearn/preprocessing/label.py:151: DeprecationWarning: The truth value of an empty array is ambiguous. Returning False, but in future this will result in an error. Use `array.size > 0` to check that an array is not empty.\n", - " if diff:\n", - "/cvmfs/sft.cern.ch/lcg/views/LCG_94python3/x86_64-centos7-gcc7-opt/lib/python3.6/site-packages/sklearn/preprocessing/label.py:151: DeprecationWarning: The truth value of an empty array is ambiguous. Returning False, but in future this will result in an error. Use `array.size > 0` to check that an array is not empty.\n", - " if diff:\n", - "/cvmfs/sft.cern.ch/lcg/views/LCG_94python3/x86_64-centos7-gcc7-opt/lib/python3.6/site-packages/sklearn/preprocessing/label.py:151: DeprecationWarning: The truth value of an empty array is ambiguous. Returning False, but in future this will result in an error. Use `array.size > 0` to check that an array is not empty.\n", - " if diff:\n", - "/cvmfs/sft.cern.ch/lcg/views/LCG_94python3/x86_64-centos7-gcc7-opt/lib/python3.6/site-packages/sklearn/preprocessing/label.py:151: DeprecationWarning: The truth value of an empty array is ambiguous. Returning False, but in future this will result in an error. Use `array.size > 0` to check that an array is not empty.\n", - " if diff:\n", - "/cvmfs/sft.cern.ch/lcg/views/LCG_94python3/x86_64-centos7-gcc7-opt/lib/python3.6/site-packages/sklearn/preprocessing/label.py:151: DeprecationWarning: The truth value of an empty array is ambiguous. Returning False, but in future this will result in an error. Use `array.size > 0` to check that an array is not empty.\n", - " if diff:\n", - "/cvmfs/sft.cern.ch/lcg/views/LCG_94python3/x86_64-centos7-gcc7-opt/lib/python3.6/site-packages/sklearn/preprocessing/label.py:151: DeprecationWarning: The truth value of an empty array is ambiguous. 
Returning False, but in future this will result in an error. Use `array.size > 0` to check that an array is not empty.\n", - " if diff:\n", - "/cvmfs/sft.cern.ch/lcg/views/LCG_94python3/x86_64-centos7-gcc7-opt/lib/python3.6/site-packages/sklearn/preprocessing/label.py:151: DeprecationWarning: The truth value of an empty array is ambiguous. Returning False, but in future this will result in an error. Use `array.size > 0` to check that an array is not empty.\n", - " if diff:\n", - "/cvmfs/sft.cern.ch/lcg/views/LCG_94python3/x86_64-centos7-gcc7-opt/lib/python3.6/site-packages/sklearn/preprocessing/label.py:151: DeprecationWarning: The truth value of an empty array is ambiguous. Returning False, but in future this will result in an error. Use `array.size > 0` to check that an array is not empty.\n", - " if diff:\n", - "/cvmfs/sft.cern.ch/lcg/views/LCG_94python3/x86_64-centos7-gcc7-opt/lib/python3.6/site-packages/sklearn/preprocessing/label.py:151: DeprecationWarning: The truth value of an empty array is ambiguous. Returning False, but in future this will result in an error. Use `array.size > 0` to check that an array is not empty.\n", - " if diff:\n", - "/cvmfs/sft.cern.ch/lcg/views/LCG_94python3/x86_64-centos7-gcc7-opt/lib/python3.6/site-packages/sklearn/preprocessing/label.py:151: DeprecationWarning: The truth value of an empty array is ambiguous. Returning False, but in future this will result in an error. Use `array.size > 0` to check that an array is not empty.\n", - " if diff:\n", - "/cvmfs/sft.cern.ch/lcg/views/LCG_94python3/x86_64-centos7-gcc7-opt/lib/python3.6/site-packages/sklearn/preprocessing/label.py:151: DeprecationWarning: The truth value of an empty array is ambiguous. Returning False, but in future this will result in an error. 
Use `array.size > 0` to check that an array is not empty.\n", - " if diff:\n", - "/cvmfs/sft.cern.ch/lcg/views/LCG_94python3/x86_64-centos7-gcc7-opt/lib/python3.6/site-packages/sklearn/preprocessing/label.py:151: DeprecationWarning: The truth value of an empty array is ambiguous. Returning False, but in future this will result in an error. Use `array.size > 0` to check that an array is not empty.\n", - " if diff:\n", - "/cvmfs/sft.cern.ch/lcg/views/LCG_94python3/x86_64-centos7-gcc7-opt/lib/python3.6/site-packages/sklearn/preprocessing/label.py:151: DeprecationWarning: The truth value of an empty array is ambiguous. Returning False, but in future this will result in an error. Use `array.size > 0` to check that an array is not empty.\n", - " if diff:\n", - "/cvmfs/sft.cern.ch/lcg/views/LCG_94python3/x86_64-centos7-gcc7-opt/lib/python3.6/site-packages/sklearn/preprocessing/label.py:151: DeprecationWarning: The truth value of an empty array is ambiguous. Returning False, but in future this will result in an error. Use `array.size > 0` to check that an array is not empty.\n", - " if diff:\n", - "/cvmfs/sft.cern.ch/lcg/views/LCG_94python3/x86_64-centos7-gcc7-opt/lib/python3.6/site-packages/sklearn/preprocessing/label.py:151: DeprecationWarning: The truth value of an empty array is ambiguous. Returning False, but in future this will result in an error. Use `array.size > 0` to check that an array is not empty.\n", - " if diff:\n", - "/cvmfs/sft.cern.ch/lcg/views/LCG_94python3/x86_64-centos7-gcc7-opt/lib/python3.6/site-packages/sklearn/preprocessing/label.py:151: DeprecationWarning: The truth value of an empty array is ambiguous. Returning False, but in future this will result in an error. Use `array.size > 0` to check that an array is not empty.\n", - " if diff:\n", - "/cvmfs/sft.cern.ch/lcg/views/LCG_94python3/x86_64-centos7-gcc7-opt/lib/python3.6/site-packages/sklearn/preprocessing/label.py:151: DeprecationWarning: The truth value of an empty array is ambiguous. 
Returning False, but in future this will result in an error. Use `array.size > 0` to check that an array is not empty.\n", - " if diff:\n", - "/cvmfs/sft.cern.ch/lcg/views/LCG_94python3/x86_64-centos7-gcc7-opt/lib/python3.6/site-packages/sklearn/preprocessing/label.py:151: DeprecationWarning: The truth value of an empty array is ambiguous. Returning False, but in future this will result in an error. Use `array.size > 0` to check that an array is not empty.\n", - " if diff:\n", - "/cvmfs/sft.cern.ch/lcg/views/LCG_94python3/x86_64-centos7-gcc7-opt/lib/python3.6/site-packages/sklearn/preprocessing/label.py:151: DeprecationWarning: The truth value of an empty array is ambiguous. Returning False, but in future this will result in an error. Use `array.size > 0` to check that an array is not empty.\n", - " if diff:\n" - ] - }, - { - "name": "stdout", - "output_type": "stream", - "text": [ - "{'max_depth': 7, 'min_child_weight': 3}\n", - "0.8143962709007511\n" - ] - } - ], + "outputs": [], "source": [ "bdt1 = XGBClassifier( learning_rate=LR, n_estimators=estimators,\n", " #max_depth=6, min_child_weight=1, #default values\n", @@ -432,62 +291,7 @@ "cell_type": "code", "execution_count": 12, "metadata": {}, - "outputs": [ - { - "name": "stderr", - "output_type": "stream", - "text": [ - "/cvmfs/sft.cern.ch/lcg/views/LCG_94python3/x86_64-centos7-gcc7-opt/lib/python3.6/site-packages/sklearn/preprocessing/label.py:151: DeprecationWarning: The truth value of an empty array is ambiguous. Returning False, but in future this will result in an error. Use `array.size > 0` to check that an array is not empty.\n", - " if diff:\n", - "/cvmfs/sft.cern.ch/lcg/views/LCG_94python3/x86_64-centos7-gcc7-opt/lib/python3.6/site-packages/sklearn/preprocessing/label.py:151: DeprecationWarning: The truth value of an empty array is ambiguous. Returning False, but in future this will result in an error. 
Use `array.size > 0` to check that an array is not empty.\n", - " if diff:\n", - "/cvmfs/sft.cern.ch/lcg/views/LCG_94python3/x86_64-centos7-gcc7-opt/lib/python3.6/site-packages/sklearn/preprocessing/label.py:151: DeprecationWarning: The truth value of an empty array is ambiguous. Returning False, but in future this will result in an error. Use `array.size > 0` to check that an array is not empty.\n", - " if diff:\n", - "/cvmfs/sft.cern.ch/lcg/views/LCG_94python3/x86_64-centos7-gcc7-opt/lib/python3.6/site-packages/sklearn/preprocessing/label.py:151: DeprecationWarning: The truth value of an empty array is ambiguous. Returning False, but in future this will result in an error. Use `array.size > 0` to check that an array is not empty.\n", - " if diff:\n", - "/cvmfs/sft.cern.ch/lcg/views/LCG_94python3/x86_64-centos7-gcc7-opt/lib/python3.6/site-packages/sklearn/preprocessing/label.py:151: DeprecationWarning: The truth value of an empty array is ambiguous. Returning False, but in future this will result in an error. Use `array.size > 0` to check that an array is not empty.\n", - " if diff:\n", - "/cvmfs/sft.cern.ch/lcg/views/LCG_94python3/x86_64-centos7-gcc7-opt/lib/python3.6/site-packages/sklearn/preprocessing/label.py:151: DeprecationWarning: The truth value of an empty array is ambiguous. Returning False, but in future this will result in an error. Use `array.size > 0` to check that an array is not empty.\n", - " if diff:\n", - "/cvmfs/sft.cern.ch/lcg/views/LCG_94python3/x86_64-centos7-gcc7-opt/lib/python3.6/site-packages/sklearn/preprocessing/label.py:151: DeprecationWarning: The truth value of an empty array is ambiguous. Returning False, but in future this will result in an error. Use `array.size > 0` to check that an array is not empty.\n", - " if diff:\n", - "/cvmfs/sft.cern.ch/lcg/views/LCG_94python3/x86_64-centos7-gcc7-opt/lib/python3.6/site-packages/sklearn/preprocessing/label.py:151: DeprecationWarning: The truth value of an empty array is ambiguous. 
Returning False, but in future this will result in an error. Use `array.size > 0` to check that an array is not empty.\n", - " if diff:\n", - "/cvmfs/sft.cern.ch/lcg/views/LCG_94python3/x86_64-centos7-gcc7-opt/lib/python3.6/site-packages/sklearn/preprocessing/label.py:151: DeprecationWarning: The truth value of an empty array is ambiguous. Returning False, but in future this will result in an error. Use `array.size > 0` to check that an array is not empty.\n", - " if diff:\n", - "/cvmfs/sft.cern.ch/lcg/views/LCG_94python3/x86_64-centos7-gcc7-opt/lib/python3.6/site-packages/sklearn/preprocessing/label.py:151: DeprecationWarning: The truth value of an empty array is ambiguous. Returning False, but in future this will result in an error. Use `array.size > 0` to check that an array is not empty.\n", - " if diff:\n", - "/cvmfs/sft.cern.ch/lcg/views/LCG_94python3/x86_64-centos7-gcc7-opt/lib/python3.6/site-packages/sklearn/preprocessing/label.py:151: DeprecationWarning: The truth value of an empty array is ambiguous. Returning False, but in future this will result in an error. Use `array.size > 0` to check that an array is not empty.\n", - " if diff:\n", - "/cvmfs/sft.cern.ch/lcg/views/LCG_94python3/x86_64-centos7-gcc7-opt/lib/python3.6/site-packages/sklearn/preprocessing/label.py:151: DeprecationWarning: The truth value of an empty array is ambiguous. Returning False, but in future this will result in an error. Use `array.size > 0` to check that an array is not empty.\n", - " if diff:\n", - "/cvmfs/sft.cern.ch/lcg/views/LCG_94python3/x86_64-centos7-gcc7-opt/lib/python3.6/site-packages/sklearn/preprocessing/label.py:151: DeprecationWarning: The truth value of an empty array is ambiguous. Returning False, but in future this will result in an error. 
Use `array.size > 0` to check that an array is not empty.\n", - " if diff:\n", - "/cvmfs/sft.cern.ch/lcg/views/LCG_94python3/x86_64-centos7-gcc7-opt/lib/python3.6/site-packages/sklearn/preprocessing/label.py:151: DeprecationWarning: The truth value of an empty array is ambiguous. Returning False, but in future this will result in an error. Use `array.size > 0` to check that an array is not empty.\n", - " if diff:\n", - "/cvmfs/sft.cern.ch/lcg/views/LCG_94python3/x86_64-centos7-gcc7-opt/lib/python3.6/site-packages/sklearn/preprocessing/label.py:151: DeprecationWarning: The truth value of an empty array is ambiguous. Returning False, but in future this will result in an error. Use `array.size > 0` to check that an array is not empty.\n", - " if diff:\n", - "/cvmfs/sft.cern.ch/lcg/views/LCG_94python3/x86_64-centos7-gcc7-opt/lib/python3.6/site-packages/sklearn/preprocessing/label.py:151: DeprecationWarning: The truth value of an empty array is ambiguous. Returning False, but in future this will result in an error. Use `array.size > 0` to check that an array is not empty.\n", - " if diff:\n", - "/cvmfs/sft.cern.ch/lcg/views/LCG_94python3/x86_64-centos7-gcc7-opt/lib/python3.6/site-packages/sklearn/preprocessing/label.py:151: DeprecationWarning: The truth value of an empty array is ambiguous. Returning False, but in future this will result in an error. Use `array.size > 0` to check that an array is not empty.\n", - " if diff:\n", - "/cvmfs/sft.cern.ch/lcg/views/LCG_94python3/x86_64-centos7-gcc7-opt/lib/python3.6/site-packages/sklearn/preprocessing/label.py:151: DeprecationWarning: The truth value of an empty array is ambiguous. Returning False, but in future this will result in an error. Use `array.size > 0` to check that an array is not empty.\n", - " if diff:\n", - "/cvmfs/sft.cern.ch/lcg/views/LCG_94python3/x86_64-centos7-gcc7-opt/lib/python3.6/site-packages/sklearn/preprocessing/label.py:151: DeprecationWarning: The truth value of an empty array is ambiguous. 
Returning False, but in future this will result in an error. Use `array.size > 0` to check that an array is not empty.\n", - " if diff:\n", - "/cvmfs/sft.cern.ch/lcg/views/LCG_94python3/x86_64-centos7-gcc7-opt/lib/python3.6/site-packages/sklearn/preprocessing/label.py:151: DeprecationWarning: The truth value of an empty array is ambiguous. Returning False, but in future this will result in an error. Use `array.size > 0` to check that an array is not empty.\n", - " if diff:\n" - ] - }, - { - "name": "stdout", - "output_type": "stream", - "text": [ - "{'max_depth': 7, 'min_child_weight': 3}\n", - "0.8143962709007511\n" - ] - } - ], + "outputs": [], "source": [ "#second stage with decreased step size and smaller grid scan\n", "bdt2 = XGBClassifier( learning_rate=LR, n_estimators=estimators,\n", @@ -524,25 +328,7 @@ "cell_type": "code", "execution_count": 13, "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "\n", - "Model Report : best iteration 499\n", - "Accuracy : 0.8630785326402843\n" - ] - }, - { - "name": "stderr", - "output_type": "stream", - "text": [ - "/cvmfs/sft.cern.ch/lcg/views/LCG_94python3/x86_64-centos7-gcc7-opt/lib/python3.6/site-packages/sklearn/preprocessing/label.py:151: DeprecationWarning: The truth value of an empty array is ambiguous. Returning False, but in future this will result in an error. Use `array.size > 0` to check that an array is not empty.\n", - " if diff:\n" - ] - } - ], + "outputs": [], "source": [ "# now repeat with lower learning rate, early stopping monitoring using optimal hyperparameters\n", "bdt3 = XGBClassifier( learning_rate=0.1, n_estimators=1000, # 0.1 learning rate to compare to default used in xgboost_bdt\n", @@ -558,38 +344,7 @@ "cell_type": "code", "execution_count": 14, "metadata": {}, - "outputs": [ - { - "data": { - "image/png": "\n", - "text/plain": [ - "
" - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "data": { - "image/png": "\n", - "text/plain": [ - "
" - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "data": { - "image/png": "\n", - "text/plain": [ - "
" - ] - }, - "metadata": {}, - "output_type": "display_data" - } - ], + "outputs": [], "source": [ "bdt3.fit(training_data[training_columns], training_data['catagory'])\n", "for df in [mc_df, bkg_df, data_df, training_data]:\n", From 57a462804684a5692562bc3b9e2cabbc426b5836 Mon Sep 17 00:00:00 2001 From: jvmead Date: Wed, 26 Feb 2020 12:42:14 +0000 Subject: [PATCH 09/54] Update advanced-python/4bHyperparameterTuning.ipynb Co-Authored-By: Chris Burr --- advanced-python/4bHyperparameterTuning.ipynb | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/advanced-python/4bHyperparameterTuning.ipynb b/advanced-python/4bHyperparameterTuning.ipynb index c08cb821..03f85d08 100644 --- a/advanced-python/4bHyperparameterTuning.ipynb +++ b/advanced-python/4bHyperparameterTuning.ipynb @@ -23,7 +23,7 @@ "from xgboost.sklearn import XGBClassifier\n", "from sklearn.ensemble import AdaBoostClassifier, GradientBoostingClassifier\n", "\n", - "from sklearn import cross_validation, metrics\n", + "from sklearn import metrics\n", "from sklearn.metrics import roc_curve, auc\n", "\n", "from sklearn.model_selection import KFold, cross_validate, cross_val_score\n", From 743fc5551fde425781b94dec9ff2277630541970 Mon Sep 17 00:00:00 2001 From: jvmead Date: Wed, 26 Feb 2020 14:05:35 +0000 Subject: [PATCH 10/54] GridSearchCV grid_search.GridSearchCV -> model_selection.GridSearchCV --- advanced-python/4bHyperparameterTuning.ipynb | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/advanced-python/4bHyperparameterTuning.ipynb b/advanced-python/4bHyperparameterTuning.ipynb index 03f85d08..a614f533 100644 --- a/advanced-python/4bHyperparameterTuning.ipynb +++ b/advanced-python/4bHyperparameterTuning.ipynb @@ -26,8 +26,7 @@ "from sklearn import metrics\n", "from sklearn.metrics import roc_curve, auc\n", "\n", - "from sklearn.model_selection import KFold, cross_validate, cross_val_score\n", - "from sklearn.grid_search import GridSearchCV\n", + "from sklearn.model_selection 
import KFold, cross_validate, cross_val_score, GridSearchCV\n", "\n", "# This gives us a special function for this lesson that lets you check how good your selection is\n", "from python_lesson import check_truth" From 445d0456a391527a6607ebe4554f1f0b29cfb63d Mon Sep 17 00:00:00 2001 From: jvmead Date: Wed, 26 Feb 2020 14:25:36 +0000 Subject: [PATCH 11/54] Update 4bHyperparameterTuning.ipynb --- advanced-python/4bHyperparameterTuning.ipynb | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/advanced-python/4bHyperparameterTuning.ipynb b/advanced-python/4bHyperparameterTuning.ipynb index a614f533..9dfd6e77 100644 --- a/advanced-python/4bHyperparameterTuning.ipynb +++ b/advanced-python/4bHyperparameterTuning.ipynb @@ -116,8 +116,8 @@ "outputs": [], "source": [ "#max_entries = 1000\n", - "data_df = uproot.open('/eos/user/l/lhcbsk/advanced-python/data/real_data.root')['DecayTree'].pandas.df()#entrystop=max_entries)\n", - "mc_df = uproot.open('/eos/user/l/lhcbsk/advanced-python/data/simulated_data.root')['DecayTree'].pandas.df()#entrystop=max_entries)\n", + "data_df = uproot.open('root://eosuser.cern.ch//eos/user/l/lhcbsk/advanced-python/data/real_data.root')['DecayTree'].pandas.df()#entrystop=max_entries)\n", + "mc_df = uproot.open('root://eosuser.cern.ch//eos/user/l/lhcbsk/advanced-python/data/simulated_data.root')['DecayTree'].pandas.df()#entrystop=max_entries)\n", "bkg_df = data_df.query('~(3.0 < Jpsi_M < 3.2)').copy()\n", "\n", "for df in [mc_df, data_df, bkg_df]:\n", From c7a9b9f2a0f40f558bd1aaffd0e10fb12617f0c6 Mon Sep 17 00:00:00 2001 From: jvmead Date: Wed, 26 Feb 2020 14:34:01 +0000 Subject: [PATCH 12/54] EOS mount -> HTTPS --- advanced-python/4bHyperparameterTuning.ipynb | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/advanced-python/4bHyperparameterTuning.ipynb b/advanced-python/4bHyperparameterTuning.ipynb index 9dfd6e77..086efb7b 100644 --- a/advanced-python/4bHyperparameterTuning.ipynb +++ 
b/advanced-python/4bHyperparameterTuning.ipynb @@ -116,8 +116,8 @@ "outputs": [], "source": [ "#max_entries = 1000\n", - "data_df = uproot.open('root://eosuser.cern.ch//eos/user/l/lhcbsk/advanced-python/data/real_data.root')['DecayTree'].pandas.df()#entrystop=max_entries)\n", - "mc_df = uproot.open('root://eosuser.cern.ch//eos/user/l/lhcbsk/advanced-python/data/simulated_data.root')['DecayTree'].pandas.df()#entrystop=max_entries)\n", + "data_df = uproot.open('https://cern.ch/starterkit/data/advanced-python-2018/real_data.root')['DecayTree'].pandas.df()#entrystop=max_entries)\n", + "mc_df = uproot.open('https://cern.ch/starterkit/data/advanced-python-2018/simulated_data.root')['DecayTree'].pandas.df()#entrystop=max_entries)\n", "bkg_df = data_df.query('~(3.0 < Jpsi_M < 3.2)').copy()\n", "\n", "for df in [mc_df, data_df, bkg_df]:\n", From 9db151283d3f1fd4baa2cbcd4f9caf0f30104f83 Mon Sep 17 00:00:00 2001 From: jvmead Date: Fri, 28 Feb 2020 20:02:16 +0000 Subject: [PATCH 13/54] Update and rename 4bHyperparameterTuning.ipynb to 4bModelTuning.ipynb --- advanced-python/4bHyperparameterTuning.ipynb | 420 -------------- advanced-python/4bModelTuning.ipynb | 577 +++++++++++++++++++ 2 files changed, 577 insertions(+), 420 deletions(-) delete mode 100644 advanced-python/4bHyperparameterTuning.ipynb create mode 100644 advanced-python/4bModelTuning.ipynb diff --git a/advanced-python/4bHyperparameterTuning.ipynb b/advanced-python/4bHyperparameterTuning.ipynb deleted file mode 100644 index 086efb7b..00000000 --- a/advanced-python/4bHyperparameterTuning.ipynb +++ /dev/null @@ -1,420 +0,0 @@ -{ - "cells": [ - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "### Hyperparameter tuning" - ] - }, - { - "cell_type": "code", - "execution_count": 1, - "metadata": {}, - "outputs": [], - "source": [ - "from matplotlib import pyplot as plt\n", - "import uproot\n", - "\n", - "import numpy as np\n", - "import pandas as pd\n", - "import xgboost as xgb\n", - "\n", - "from 
xgboost.sklearn import XGBClassifier\n", - "from sklearn.ensemble import AdaBoostClassifier, GradientBoostingClassifier\n", - "\n", - "from sklearn import metrics\n", - "from sklearn.metrics import roc_curve, auc\n", - "\n", - "from sklearn.model_selection import KFold, cross_validate, cross_val_score, GridSearchCV\n", - "\n", - "# This gives us a special function for this lesson that lets you check how good your selection is\n", - "from python_lesson import check_truth" - ] - }, - { - "cell_type": "code", - "execution_count": 2, - "metadata": {}, - "outputs": [], - "source": [ - "def plot_mass(df):\n", - " counts, bins, _ = plt.hist(df['Jpsi_M'], bins=100, range=[2.75, 3.5], histtype='step')\n", - " # You can also use LaTeX in the axis label\n", - " plt.xlabel('$J/\\\\psi$ mass [GeV]')\n", - " plt.xlim(bins[0], bins[-1])" - ] - }, - { - "cell_type": "code", - "execution_count": 3, - "metadata": {}, - "outputs": [], - "source": [ - "def plot_comparision(var, mc_df, bkg_df):\n", - " _, bins, _ = plt.hist(mc_df[var], bins=100, histtype='step', label='MC', density=1)\n", - " _, bins, _ = plt.hist(bkg_df[var], bins=bins, histtype='step', label='Background', density=1)\n", - " plt.xlabel(var)\n", - " plt.xlim(bins[0], bins[-1])\n", - " plt.legend(loc='best')" - ] - }, - { - "cell_type": "code", - "execution_count": 4, - "metadata": {}, - "outputs": [], - "source": [ - "def plot_roc(bdt, training_data, training_columns, label=None):\n", - " y_score = bdt.predict_proba(training_data[training_columns])[:,1]\n", - " fpr, tpr, thresholds = roc_curve(training_data['catagory'], y_score)\n", - " area = auc(fpr, tpr)\n", - "\n", - " plt.plot([0, 1], [0, 1], color='grey', linestyle='--')\n", - " if label:\n", - " plt.plot(fpr, tpr, label=f'{label} (area = {area:.2f})')\n", - " else:\n", - " plt.plot(fpr, tpr, label=f'ROC curve (area = {area:.2f})')\n", - " plt.xlim(0.0, 1.0)\n", - " plt.ylim(0.0, 1.0)\n", - " plt.xlabel('False Positive Rate')\n", - " plt.ylabel('True Positive 
Rate')\n", - " plt.legend(loc='lower right')\n", - " # We can make the plot look nicer by forcing the grid to be square\n", - " plt.gca().set_aspect('equal', adjustable='box')" - ] - }, - { - "cell_type": "code", - "execution_count": 5, - "metadata": {}, - "outputs": [], - "source": [ - "def plot_significance(bdt, training_data, training_columns, label=None):\n", - " y_score = bdt.predict_proba(training_data[training_columns])[:,1]\n", - " fpr, tpr, thresholds = roc_curve(training_data['catagory'], y_score)\n", - "\n", - " n_sig = 1200\n", - " n_bkg = 23000\n", - " S = n_sig*tpr\n", - " B = n_bkg*fpr\n", - " metric = S/np.sqrt(S+B)\n", - "\n", - " plt.plot(thresholds, metric, label=label)\n", - " plt.xlabel('BDT cut value')\n", - " plt.ylabel('$\\\\frac{S}{\\\\sqrt{S+B}}$')\n", - " plt.xlim(0, 1.0)\n", - "\n", - " optimal_cut = thresholds[np.argmax(metric)]\n", - " plt.axvline(optimal_cut, color='black', linestyle='--')" - ] - }, - { - "cell_type": "code", - "execution_count": 6, - "metadata": {}, - "outputs": [], - "source": [ - "#max_entries = 1000\n", - "data_df = uproot.open('https://cern.ch/starterkit/data/advanced-python-2018/real_data.root')['DecayTree'].pandas.df()#entrystop=max_entries)\n", - "mc_df = uproot.open('https://cern.ch/starterkit/data/advanced-python-2018/simulated_data.root')['DecayTree'].pandas.df()#entrystop=max_entries)\n", - "bkg_df = data_df.query('~(3.0 < Jpsi_M < 3.2)').copy()\n", - "\n", - "for df in [mc_df, data_df, bkg_df]:\n", - " df.eval('Jpsi_eta = arctanh(Jpsi_PZ/Jpsi_P)', inplace=True)\n", - " df.eval('mup_P = sqrt(mum_PX**2 + mum_PY**2 + mum_PZ**2)', inplace=True)\n", - " df.eval('mum_P = sqrt(mum_PX**2 + mum_PY**2 + mum_PZ**2)', inplace=True)\n", - "\n", - "bkg_df['catagory'] = 0 # Use 0 for background\n", - "mc_df['catagory'] = 1 # Use 1 for signal\n", - "training_data = pd.concat([bkg_df, mc_df], copy=True, ignore_index=True)\n", - "for df in [mc_df, bkg_df, data_df, training_data]:\n", - " df['IPdiff'] = np.abs(df['mum_PT'] 
- df['mup_PT'])\n", - " \n", - "training_columns = [\n", - " 'Jpsi_PT',\n", - " 'mup_PT', 'mup_eta', 'mup_ProbNNmu', 'mup_IP',\n", - " 'mum_PT', 'mum_eta', 'mum_ProbNNmu', 'mum_IP',\n", - "]" - ] - }, - { - "cell_type": "code", - "execution_count": 7, - "metadata": {}, - "outputs": [], - "source": [ - "xgboost_bdt = XGBClassifier() # LR=0.1 as default compared to bdt3 later\n", - "xgboost_bdt.fit(training_data[training_columns], training_data['catagory'])\n", - "for df in [mc_df, bkg_df, data_df, training_data]:\n", - " df['XGBtest'] = xgboost_bdt.predict_proba(df[training_columns])[:,1]\n", - "\n", - "plt.figure()\n", - "plot_comparision('XGBtest', mc_df, bkg_df)\n", - "\n", - "plt.figure()\n", - "plot_significance(xgboost_bdt, training_data, training_columns)\n", - "\n", - "plt.figure()\n", - "plot_roc(xgboost_bdt, training_data, training_columns)" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "### $k$-folding\n", - "\n", - "Let's go search for `scikit learn k-folding`.\n", - "\n", - " - https://scikit-learn.org/stable/modules/generated/sklearn.model_selection.KFold.html\n", - "\n", - "Look at the example section:\n", - "\n", - "```python\n", - ">>> from sklearn.model_selection import KFold\n", - ">>> X = np.array([[1, 2], [3, 4], [1, 2], [3, 4]])\n", - ">>> y = np.array([1, 2, 3, 4])\n", - ">>> kf = KFold(n_splits=2)\n", - ">>> kf.get_n_splits(X)\n", - "2\n", - ">>> print(kf) \n", - "KFold(n_splits=2, random_state=None, shuffle=False)\n", - ">>> for train_index, test_index in kf.split(X):\n", - "... print(\"TRAIN:\", train_index, \"TEST:\", test_index)\n", - "... X_train, X_test = X[train_index], X[test_index]\n", - "... 
y_train, y_test = y[train_index], y[test_index]\n", - "TRAIN: [2 3] TEST: [0 1]\n", - "TRAIN: [0 1] TEST: [2 3]\n", - "```" - ] - }, - { - "cell_type": "code", - "execution_count": 8, - "metadata": {}, - "outputs": [], - "source": [ - "X1, y1 = training_data[training_columns], training_data['catagory']\n", - "splits = 5\n", - "kf = KFold(splits,True)\n", - "for train, test in kf.split(X1):\n", - " X_train, X_test = X1.iloc[train], X1.iloc[test]\n", - " y_train, y_test = y1.iloc[train], y1.iloc[test]\n", - " xgboost_bdt.fit(X_train,y_train)\n", - "cv_acc_1 = cross_val_score(xgboost_bdt, X_test, y_test, cv=splits, scoring=\"accuracy\")\n", - "cv_los_1 = cross_val_score(xgboost_bdt, X_test, y_test, cv=splits, scoring=\"neg_log_loss\")\n", - "cv_auc_1 = cross_val_score(xgboost_bdt, X_test, y_test, cv=splits, scoring=\"roc_auc\")\n", - "print(\"accuracy: \",cv_acc_1)\n", - "print(\"-logloss: \",cv_los_1)\n", - "print(\"roc_auc: \",cv_auc_1)" - ] - }, - { - "cell_type": "code", - "execution_count": 9, - "metadata": {}, - "outputs": [], - "source": [ - "def modelfit(alg, metric, train, test, predictors, cv_folds=5, early_stop=10): #50):\n", - " xgb_param = alg.get_xgb_params()\n", - " xgtrain = xgb.DMatrix(train, label=test, feature_names=predictors)\n", - " cvresult = xgb.cv(xgb_param,\n", - " xgtrain,\n", - " num_boost_round=alg.get_params()['n_estimators'],\n", - " nfold=cv_folds,\n", - " metrics=metric,\n", - " early_stopping_rounds=early_stop)\n", - " alg.set_params(n_estimators=cvresult.shape[0])\n", - " #Fit the algorithm on the data \n", - " alg.fit(train, test, eval_metric=metric)\n", - " #Predict training set: \n", - " train_predictions = alg.predict(train)\n", - " #Print model report: \n", - " print(\"\\nModel Report : best iteration \"+str(cvresult.shape[0]))\n", - " print(\"Accuracy : \"+str(metrics.accuracy_score(test, train_predictions)))\n", - " return cvresult.shape[0]\n" - ] - }, - { - "cell_type": "code", - "execution_count": 10, - "metadata": {}, - 
"outputs": [], - "source": [ - "X1, y1 = training_data[training_columns], training_data['catagory']\n", - "LR = 0.2 # choosing a high learning rate to establish earlystopping limit to use during grid scan\n", - "bdt0 = XGBClassifier( learning_rate=LR, n_estimators=1000,\n", - " #max_depth=6, min_child_weight=1, #default values\n", - " #gamma=0, subsample=0.8,\n", - " #colsample_bytree=0.8, scale_pos_weight=1,\n", - " objective='binary:logistic', #'mutli:softprob', num_class=3, #or more\n", - " seed=123)\n", - "estimators = modelfit(bdt0, 'error', X1, y1, training_columns) #'merror' for multiclass" - ] - }, - { - "cell_type": "code", - "execution_count": 11, - "metadata": { - "scrolled": true - }, - "outputs": [], - "source": [ - "bdt1 = XGBClassifier( learning_rate=LR, n_estimators=estimators,\n", - " #max_depth=6, min_child_weight=1, #default values\n", - " #gamma=0, subsample=0.8,\n", - " #colsample_bytree=0.8, scale_pos_weight=1, \n", - " objective='binary:logistic', #'mutli:softprob', num_class=3, #or more\n", - " seed=123)\n", - " \n", - "param_test1 = {\n", - " 'max_depth':np.arange( 5, 9, 2 ),\n", - " 'min_child_weight':np.arange( 1, 5, 2 ),\n", - " #'gamma':np.arange( 0.0, 1.0, 0.2 ),\n", - " #'colsample_bytree':np.arange( 0.4, 1.0, 0.2 ),\n", - " #'subsample':np.arange( 0.4, 1.0, 0.2 ),\n", - " #'scale_pos_weight':np.arange( 0.4, 1.6, 0.2 )\n", - "}\n", - "gsearch1 = GridSearchCV(estimator=bdt1,\n", - " param_grid=param_test1,\n", - " scoring='accuracy',\n", - " iid=False,\n", - " cv=5)\n", - "gsearch1.fit(X1,y1)\n", - "print(gsearch1.best_params_)\n", - "print(gsearch1.best_score_)" - ] - }, - { - "cell_type": "code", - "execution_count": 12, - "metadata": {}, - "outputs": [], - "source": [ - "#second stage with decreased step size and smaller grid scan\n", - "bdt2 = XGBClassifier( learning_rate=LR, n_estimators=estimators,\n", - " #max_depth=6, min_child_weight=1, #default values\n", - " #gamma=0, subsample=0.8,\n", - " #colsample_bytree=0.8, 
scale_pos_weight=1,\n", - " objective='binary:logistic', #'mutli:softprob', num_class=3, #or more\n", - " seed=123)\n", - "param_test2 = {\n", - " 'max_depth':np.arange( gsearch1.best_params_['max_depth']-1 if gsearch1.best_params_['max_depth']>=4 else gsearch1.best_params_['max_depth'],\n", - " gsearch1.best_params_['max_depth']+1, 1 ),\n", - " 'min_child_weight':np.arange( gsearch1.best_params_['min_child_weight']-1 if gsearch1.best_params_['min_child_weight']>=1.1 else gsearch1.best_params_['min_child_weight'],\n", - " gsearch1.best_params_['min_child_weight']+1, 1 ), #0.5 ),\n", - " # 'gamma':np.arange( gsearch1.best_params_['gamma']-0.1 if gsearch1.best_params_['gamma']>=0.1 else gsearch1.best_params_['gamma'],\n", - " # gsearch1.best_params_['gamma']+0.1, 0.05 ),\n", - " #'colsample_bytree':np.arange( gsearch1.best_params_['colsample_bytree']-0.1 if gsearch1.best_params_['colsample_bytree']>=1.1 else gsearch1.best_params_['colsample_bytree'],\n", - " # gsearch1.best_params_['colsample_bytree']+0.1, 0.05 ),\n", - " # 'subsample':np.arange( gsearch1.best_params_['subsample']-0.1 if gsearch1.best_params_['subsample']>=1.1 else gsearch1.best_params_['subsample'],\n", - " # gsearch1.best_params_['subsample']+0.1, 0.05 ),\n", - " #'scale_pos_weight':np.arange( gsearch1.best_params_['scale_pos_weight']-0.1 if gsearch1.best_params_['scale_pos_weight']>=1.1 else gsearch1.best_params_['scale_pos_weight'],\n", - " # gsearch1.best_params_['scale_pos_weight']+0.1, 0.05 )\n", - "}\n", - "gsearch2 = GridSearchCV(estimator=bdt2,\n", - " param_grid=param_test2,\n", - " scoring='accuracy',\n", - " iid=False,\n", - " cv=5)\n", - "gsearch2.fit(X1,y1)\n", - "print(gsearch2.best_params_)\n", - "print(gsearch2.best_score_)" - ] - }, - { - "cell_type": "code", - "execution_count": 13, - "metadata": {}, - "outputs": [], - "source": [ - "# now repeat with lower learning rate, early stopping monitoring using optimal hyperparameters\n", - "bdt3 = XGBClassifier( learning_rate=0.1, 
n_estimators=1000, # 0.1 learning rate to compare to default used in xgboost_bdt\n", - " max_depth=gsearch2.best_params_['max_depth'], min_child_weight=gsearch2.best_params_['min_child_weight'],\n", - " #gamma=gsearch2.best_params_['gamma'], subsample=gsearch2.best_params_['subsample'],\n", - " #colsample_bytree=gsearch2.best_params_['colsample_bytree'], scale_pos_weight=gsearch2.best_params_['scale_pos_weight'], \n", - " objective='binary:logistic', #'multi:softprob', num_class=3, #or more\n", - " seed=123 )\n", - "estimators = modelfit(bdt3, 'error', X1, y1, training_columns)" - ] - }, - { - "cell_type": "code", - "execution_count": 14, - "metadata": {}, - "outputs": [], - "source": [ - "bdt3.fit(training_data[training_columns], training_data['catagory'])\n", - "for df in [mc_df, bkg_df, data_df, training_data]:\n", - " df['XGBtune'] = bdt3.predict_proba(df[training_columns])[:,1]\n", - "\n", - "plt.figure()\n", - "plot_comparision('XGBtune', mc_df, bkg_df)\n", - "\n", - "plt.figure()\n", - "plot_significance(bdt3, training_data, training_columns)\n", - "\n", - "plt.figure()\n", - "plot_roc(bdt3, training_data, training_columns)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "# now repeat with lower learning rate, early stopping monitoring using optimal hyperparameters\n", - "bdt4 = XGBClassifier( learning_rate=0.01, n_estimators=10000, # even lower learning rate to compare to default\n", - " max_depth=gsearch2.best_params_['max_depth'], min_child_weight=gsearch2.best_params_['min_child_weight'],\n", - " #gamma=gsearch2.best_params_['gamma'], subsample=gsearch2.best_params_['subsample'],\n", - " #colsample_bytree=gsearch2.best_params_['colsample_bytree'], scale_pos_weight=gsearch2.best_params_['scale_pos_weight'], \n", - " objective='binary:logistic', #'multi:softprob', num_class=3, #or more\n", - " seed=123 )\n", - "estimators = modelfit(bdt4, 'error', X1, y1, training_columns)\n", - "\n", - 
"bdt4.fit(training_data[training_columns], training_data['catagory'])\n", - "for df in [mc_df, bkg_df, data_df, training_data]:\n", - " df['XGBtune'] = bdt4.predict_proba(df[training_columns])[:,1]\n", - "\n", - "plt.figure()\n", - "plot_comparision('XGBtune', mc_df, bkg_df)\n", - "\n", - "plt.figure()\n", - "plot_significance(bdt4, training_data, training_columns)\n", - "\n", - "plt.figure()\n", - "plot_roc(bdt4, training_data, training_columns)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [] - } - ], - "metadata": { - "kernelspec": { - "display_name": "Python 3", - "language": "python", - "name": "python3" - }, - "language_info": { - "codemirror_mode": { - "name": "ipython", - "version": 3 - }, - "file_extension": ".py", - "mimetype": "text/x-python", - "name": "python", - "nbconvert_exporter": "python", - "pygments_lexer": "ipython3", - "version": "3.6.5" - } - }, - "nbformat": 4, - "nbformat_minor": 2 -} diff --git a/advanced-python/4bModelTuning.ipynb b/advanced-python/4bModelTuning.ipynb new file mode 100644 index 00000000..74a7a780 --- /dev/null +++ b/advanced-python/4bModelTuning.ipynb @@ -0,0 +1,577 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Model tuning setup" + ] + }, + { + "cell_type": "code", + "metadata": {}, + "source": [ + "#%store -r bkg_df\n", + "#%store -r mc_df\n", + "#%store -r data_df" + ], + "execution_count": null, + "outputs": [] + }, + { + "cell_type": "code", + "metadata": {}, + "source": [ + "#@title\n", + "!pip install uproot\n", + "!pip install sklearn\n", + "\n", + "import time\n", + "from matplotlib import pyplot as plt\n", + "import uproot\n", + "\n", + "import numpy as np\n", + "import pandas as pd\n", + "import xgboost as xgb\n", + "\n", + "from xgboost.sklearn import XGBClassifier\n", + "from sklearn.ensemble import AdaBoostClassifier, GradientBoostingClassifier\n", + "\n", + "from sklearn import metrics\n", + "from 
sklearn.metrics import roc_curve, auc\n", + "\n", + "from sklearn.model_selection import KFold, train_test_split, cross_validate, cross_val_score, GridSearchCV" + ], + "execution_count": null, + "outputs": [] + }, + { + "cell_type": "code", + "metadata": {}, + "source": [ + "def plot_mass(df):\n", + " counts, bins, _ = plt.hist(df['Jpsi_M'], bins=100, range=[2.75, 3.5], histtype='step')\n", + " # You can also use LaTeX in the axis label\n", + " plt.xlabel('$J/\\\\psi$ mass [GeV]')\n", + " plt.xlim(bins[0], bins[-1])" + ], + "execution_count": null, + "outputs": [] + }, + { + "cell_type": "code", + "metadata": {}, + "source": [ + "def plot_comparision(var, mc_df, bkg_df):\n", + " _, bins, _ = plt.hist(mc_df[var], bins=100, histtype='step', label='MC', density=1)\n", + " _, bins, _ = plt.hist(bkg_df[var], bins=bins, histtype='step', label='Background', density=1)\n", + " plt.xlabel(var)\n", + " plt.xlim(bins[0], bins[-1])\n", + " plt.legend(loc='best')" + ], + "execution_count": null, + "outputs": [] + }, + { + "cell_type": "code", + "metadata": {}, + "source": [ + "def plot_roc(bdt, training_data, training_columns, label=None):\n", + " y_score = bdt.predict_proba(training_data[training_columns])[:,1]\n", + " fpr, tpr, thresholds = roc_curve(training_data['catagory'], y_score)\n", + " area = auc(fpr, tpr)\n", + "\n", + " plt.plot([0, 1], [0, 1], color='grey', linestyle='--')\n", + " if label:\n", + " plt.plot(fpr, tpr, label=f'{label} (area = {area:.2f})')\n", + " else:\n", + " plt.plot(fpr, tpr, label=f'ROC curve (area = {area:.2f})')\n", + " plt.xlim(0.0, 1.0)\n", + " plt.ylim(0.0, 1.0)\n", + " plt.xlabel('False Positive Rate')\n", + " plt.ylabel('True Positive Rate')\n", + " plt.legend(loc='lower right')\n", + " # We can make the plot look nicer by forcing the grid to be square\n", + " plt.gca().set_aspect('equal', adjustable='box')" + ], + "execution_count": null, + "outputs": [] + }, + { + "cell_type": "code", + "metadata": {}, + "source": [ + "def 
plot_significance(bdt, training_data, training_columns, label=None):\n", + " y_score = bdt.predict_proba(training_data[training_columns])[:,1]\n", + " fpr, tpr, thresholds = roc_curve(training_data['catagory'], y_score)\n", + "\n", + " n_sig = 1200\n", + " n_bkg = 23000\n", + " S = n_sig*tpr\n", + " B = n_bkg*fpr\n", + " metric = S/np.sqrt(S+B)\n", + "\n", + " plt.plot(thresholds, metric, label=label)\n", + " plt.xlabel('BDT cut value')\n", + " plt.ylabel('$\\\\frac{S}{\\\\sqrt{S+B}}$')\n", + " plt.xlim(0, 1.0)\n", + "\n", + " optimal_cut = thresholds[np.argmax(metric)]\n", + " plt.axvline(optimal_cut, color='black', linestyle='--')" + ], + "execution_count": null, + "outputs": [] + }, + { + "cell_type": "code", + "metadata": {}, + "source": [ + "#max_entries = 1000 # try running with low stats for bug fixing your changes quickly \n", + "data_df = uproot.open('https://cern.ch/starterkit/data/advanced-python-2018/real_data.root',\n", + " httpsource={'chunkbytes': 1024*1024, 'limitbytes': 33554432, 'parallel': 64}\n", + " )['DecayTree'].pandas.df()#(entrystop=max_entries)\n", + "mc_df = uproot.open('https://cern.ch/starterkit/data/advanced-python-2018/simulated_data.root',\n", + " httpsource={'chunkbytes': 1024*1024, 'limitbytes': 33554432, 'parallel': 64}\n", + " )['DecayTree'].pandas.df()#(entrystop=max_entries)\n", + "bkg_df = data_df.query('~(3.0 < Jpsi_M < 3.2)')\n", + "\n", + "for df in [mc_df, data_df, bkg_df]:\n", + " df.eval('Jpsi_eta = arctanh(Jpsi_PZ/Jpsi_P)', inplace=True)\n", + " df.eval('mup_P = sqrt(mum_PX**2 + mum_PY**2 + mum_PZ**2)', inplace=True)\n", + " df.eval('mum_P = sqrt(mum_PX**2 + mum_PY**2 + mum_PZ**2)', inplace=True)\n", + "\n", + "bkg_df['catagory'] = 0 # Use 0 for background\n", + "mc_df['catagory'] = 1 # Use 1 for signal\n", + "training_data = pd.concat([bkg_df, mc_df], copy=True, ignore_index=True)\n", + "for df in [mc_df, bkg_df, data_df, training_data]:\n", + " df['IPdiff'] = np.abs(df['mum_PT'] - df['mup_PT'])\n", + " \n", + 
"training_columns = [\n", + " 'Jpsi_PT',\n", + " 'mup_PT', 'mup_eta', 'mup_ProbNNmu', 'mup_IP',\n", + " 'mum_PT', 'mum_eta', 'mum_ProbNNmu', 'mum_IP',\n", + "]" + ], + "execution_count": null, + "outputs": [] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Previously we trained an XGBClassifier with the default settings, with learning rate = 0.3 and maximum iterations = 100. This cut off to the training process may be limiting the performance of our model. We can monitor the performance of our model as a function of training iteration and stop the training when the gradient approximates zero. " + ] + }, + { + "cell_type": "code", + "metadata": {}, + "source": [ + "X1, y1 = training_data[training_columns], training_data['catagory']\n", + "X_train, X_test, y_train, y_test = train_test_split(X1, y1) \n", + "# default test_size = 0.25, this can be varied to suit your data\n", + "\n", + "LR = 0.3 # the coefficient of step size decay, eta, has alias 'learning_rate' with default 0.3\n", + "\n", + "stime = time.time()\n", + "bdt = XGBClassifier(learning_rate = LR, n_estimators=100, seed=123, n_threads=-1)\n", + "bdt.fit(training_data[training_columns], training_data['catagory'])\n", + "print(\"XGBoost --- %s seconds ---\" % (time.time() - stime))\n", + "\n", + "for df in [mc_df, bkg_df, data_df, training_data]:\n", + " df['XGB'] = bdt.predict_proba(df[training_columns])[:,1]" + ], + "execution_count": null, + "outputs": [] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Cross-validation\n", + "\n", + "Splitting the data into randomised subsets for training allows you to monitor your model's performance on the fly using the statistically independent remainder of your sample - this is called cross-validation (CV). We can see below that at the 100th iteration the metrics still show a trend of improvement."
+ ] + }, + { + "cell_type": "code", + "metadata": {}, + "source": [ + "def training_monitor(alg): \n", + "\n", + " # A model trained with eval_set and eval_metric will return evals_result\n", + " results = alg.evals_result()\n", + " epochs = len(results['validation_0']['logloss'])\n", + " x_axis = range(0, epochs)\n", + "\n", + " # Plotting logLoss as a function of training iteration\n", + " fig, ax = plt.subplots()\n", + " ax.plot(x_axis, results['validation_0']['logloss'], label='Train') # for each eval_set\n", + " if results['validation_1']: ax.plot(x_axis, results['validation_1']['logloss'], label='Test') \n", + " ax.legend()\n", + " plt.ylabel('LogLoss')\n", + " plt.title('LogLoss')\n", + " plt.show()\n", + " \n", + " # Plotting classification error as a function of training iteration\n", + " fig, ax = plt.subplots()\n", + " ax.plot(x_axis, results['validation_0']['error'], label='Train') # for each eval_set\n", + " if results['validation_1']: ax.plot(x_axis, results['validation_1']['error'], label='Test')\n", + " ax.legend()\n", + " plt.ylabel('Error')\n", + " plt.title('Error')\n", + " plt.show()" + ], + "execution_count": null, + "outputs": [] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "This involves training on less data but allows us to monitor progress to check if the model is becoming over-specific to our training sample. The minimisation of loss and classification error are common metrics for model assessment. As shown below, the cost to performance is negligible. If the test sample gradient were to invert this would be considered overtraining and is why monitoring performance without CV can be a time costly pitfall." 
+ ] + }, + { + "cell_type": "code", + "metadata": {}, + "source": [ + "# Defining a model with multi-threading set to maximum\n", + "bdt_cv = XGBClassifier(learning_rate = LR, n_estimators=100, seed=123, n_threads=-1)\n", + "\n", + "# Model fitting with CV and printing out processing time\n", + "stime = time.time()\n", + "bdt_cv.fit(X_train, y_train, eval_metric=[\"logloss\",\"error\"],\n", + " eval_set=[(X_train, y_train), (X_test, y_test)], verbose=False)\n", + "print(\"\\nXGBoost cross-validation --- %s seconds ---\" % (time.time() - stime))\n", + "\n", + "# Writing model predictions out for data\n", + "training_monitor(bdt_cv)\n", + "for df in [mc_df, bkg_df, data_df, training_data]:\n", + " df['XGBcv'] = bdt_cv.predict_proba(df[training_columns])[:,1]\n", + "\n", + "# Drawing plot of model response for signal and background classes\n", + "plt.figure()\n", + "plot_comparision('XGB', mc_df, bkg_df)\n", + "plot_comparision('XGBcv', mc_df, bkg_df)\n", + "\n", + "# Drawing signal significance comparison as a function of minimum cut on model response\n", + "plt.figure()\n", + "plot_significance(bdt, training_data, training_columns)\n", + "plot_significance(bdt_cv, training_data, training_columns)\n", + "\n", + "# Drawing the signal efficiency vs background rejection curve (ROC)\n", + "plt.figure()\n", + "plot_roc(bdt, training_data, training_columns)\n", + "plot_roc(bdt_cv, training_data, training_columns)" + ], + "execution_count": null, + "outputs": [] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### $k$-folding & early stopping\n", + "\n", + "Performing CV on each of a number, k, of ways to split your data gives you k models to choose from. Some choose to average the performance across the models from each fold as any instability might imply the model will not be reliable. The results below seem stable; each fold provides a consistent performance across multiple metrics, so we'll just choose the best one."
+ ] + }, + { + "cell_type": "code", + "metadata": {}, + "source": [ + "# Defining the folds with a seed to test consistently \n", + "splits = 4 # to match 0.25 value of train_test_split default though this may not be optimal\n", + "kf = KFold(splits, True, random_state=123)\n", + "\n", + "# Printing processing time of the kfold cross-validation\n", + "stime = time.time()\n", + "for train, test in kf.split(X1):\n", + " X_train, X_test = X1.iloc[train], X1.iloc[test]\n", + " y_train, y_test = y1.iloc[train], y1.iloc[test]\n", + " bdt.fit(X_train,y_train)\n", + "print(\"\\nXGBoost k-folding --- %s seconds ---\" % (time.time() - stime))\n", + "\n", + "# Calculating scores of each fold using variety of CV-metrics\n", + "cv_acc = cross_val_score(bdt, X_test, y_test, cv=splits, scoring=\"accuracy\", n_jobs=-1)\n", + "cv_los = cross_val_score(bdt, X_test, y_test, cv=splits, scoring=\"neg_log_loss\", n_jobs=-1)\n", + "cv_auc = cross_val_score(bdt, X_test, y_test, cv=splits, scoring=\"roc_auc\", n_jobs=-1)\n", + "\n", + "# Printing results and indicating best fold\n", + "print(\"accuracy: \",cv_acc, \" -> best fold =\", np.argmax(cv_acc) )\n", + "print(\"-logloss: \",cv_los, \" -> best fold =\", np.argmax(cv_los) )\n", + "print(\"roc_auc: \",cv_auc, \" -> best fold =\", np.argmax(cv_auc) )\n", + "bestfold = np.argmax(cv_acc)" + ], + "execution_count": null, + "outputs": [] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Early stopping defines a maximum number of rounds the cross-validation metric (we'll use 'error'=1-accuracy) is allowed to not improve before training is terminated. As is standard, we will be reverting back to a 'previous best' model based on test sample score, this helps avoid overtraining. Early stopping prevents us from training too many extra models, thus saving time. Set the limit too small though and your training might be cut off prematurely."
+ ] + }, + { + "cell_type": "code", + "metadata": {}, + "source": [ + "def modelfit(alg, metric, params, label, predictors, kfold, fbest, early_stop=10):\n", + "\n", + " # Loading data split inputs providing best fold result\n", + " for k, (train, test) in enumerate(kf.split(params)):\n", + " if (k==fbest):\n", + " X_train, X_test = params.iloc[train], params.iloc[test]\n", + " y_train, y_test = label.iloc[train], label.iloc[test]\n", + "\n", + " # Defining data in terms of training variables and class label\n", + " xgb_param = alg.get_xgb_params()\n", + " data = xgb.DMatrix(params, label=label, feature_names=predictors, nthread=-1)\n", + " \n", + " # Runs timed CV on our model using early stopping based on our metric\n", + " stime = time.time()\n", + " cvresult = xgb.cv(xgb_param,\n", + " data,\n", + " num_boost_round=alg.get_params()['n_estimators'],\n", + " #nfold=cv_folds, # to use in build folding\n", + " folds=kfold, # use -> ignores nfold \n", + " metrics=metric,\n", + " early_stopping_rounds=early_stop)\n", + " alg.set_params(n_estimators=cvresult.shape[0])\n", + " print(\"\\nXGBoost early-stop folding --- %s seconds ---\" % (time.time() - stime))\n", + "\n", + " # Fitting the algorithm on the data with CV evaluation early stopping\n", + " stime = time.time()\n", + " alg.fit(X_train, y_train, eval_metric=[\"logloss\",\"error\"],\n", + " eval_set=[(X_train, y_train), (X_test, y_test)],\n", + " verbose=False, early_stopping_rounds=early_stop)\n", + " training_monitor(alg)\n", + " print(\"XGBoost early-stop limit --- %s seconds ---\" % (time.time() - stime))\n", + "\n", + " # Predicting training set:\n", + " train_predictions = alg.predict(X_train) \n", + " test_predictions = alg.predict(X_test)\n", + "\n", + " # Printing model report: \n", + " print(\"\\nModel Report : best iteration \"+str(cvresult.shape[0]))\n", + " print(\"Train Accuracy : \"+str(metrics.accuracy_score(y_train, train_predictions)))\n", + " print(\"Test Accuracy : 
\"+str(metrics.accuracy_score(y_test, test_predictions)))\n", + " return cvresult.shape[0]" + ], + "execution_count": null, + "outputs": [] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "This function incorporates the k-folding CV and early stopping, saving not only the optimal model but also the index of its training iteration. This means, in our subsequent steps, we can apply an upper limit on training for models based on the convergence of the default hyperparameters, saving us some time. " + ] + }, + { + "cell_type": "code", + "metadata": {}, + "source": [ + "# Defining model with high maximum estimators for use with early stopping\n", + "bdt_es = XGBClassifier(learning_rate = LR, n_estimators=1000,\n", + " # Default values of other hyperparamters\n", + " #max_depth=6, min_child_weight=1,\n", + " #gamma=0, subsample=0.8,\n", + " #colsample_bytree=0.8, scale_pos_weight=1,\n", + " #objective='binary:logistic', # default for binary classification\n", + " #objective='mutli:softprob', num_class=3, # for multiclassifiers\n", + " seed=123, n_threads=-1)\n", + "\n", + "# Timing the CV using early stopping\n", + "stime = time.time()\n", + "estimators = modelfit(bdt_es, \"error\", X1, y1, training_columns, kf, bestfold)\n", + "print(\"\\nmodelfit(bdt_es) --- %s seconds ---\" % (time.time() - stime))\n", + "\n", + "# Saving model predictions\n", + "for df in [mc_df, bkg_df, data_df, training_data]:\n", + " df['XGBes'] = bdt_es.predict_proba(df[training_columns])[:,1]" + ], + "execution_count": null, + "outputs": [] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "This provides us with an improved model as well as a benchmark to test against in both performance and training efficiency. 
When training using new combinations of hyperparameters, the maximum number of estimators from our model report will cut off any new models improving more slowly than our default, while, for more efficient models, the early stopping will kick in." + ] + }, + { + "cell_type": "code", + "metadata": {}, + "source": [ + "# Drawing plot to compare model response for signal and background classes\n", + "plt.figure()\n", + "plot_comparision('XGBcv', mc_df, bkg_df)\n", + "plot_comparision('XGBes', mc_df, bkg_df)\n", + "\n", + "# Drawing signal significance comparison as a function of minimum cut on model response\n", + "plt.figure()\n", + "plot_significance(bdt_cv, training_data, training_columns)\n", + "plot_significance(bdt_es, training_data, training_columns)\n", + "\n", + "# Drawing comaprison of the signal efficiency vs background rejection curve (ROC)\n", + "plt.figure()\n", + "plot_roc(bdt_cv, training_data, training_columns)\n", + "plot_roc(bdt_es, training_data, training_columns)" + ], + "execution_count": null, + "outputs": [] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Hyperameter optimisation\n", + "\n", + "Below we provide a \"grid\" of hyperparameters, defining the structure of the trees and constraints on the learning, but there are many more values to choose from and a larger parameter space to be explored. These optimsations are very problem specific and their impact will have to be weighed against the computing resources and timeframe you have at your disposal. For the sake of expedient demonstration we are comparing the default parameters to only one predetermined variation in 2 parameters. 
" + ] + }, + { + "cell_type": "code", + "metadata": {}, + "source": [ + "# Define a function that performs a gridscan of HPs\n", + "def hpgridscan(alg, metric, params, label, kfold, fbest, early_stop=10):\n", + "\n", + " # Load data fold with best performance\n", + " for k, (train, test) in enumerate(kf.split(params)):\n", + " if (k==fbest):\n", + " X_train, X_test = params.iloc[train], params.iloc[test]\n", + " y_train, y_test = label.iloc[train], label.iloc[test]\n", + "\n", + " # Define a dictionary of numpy arrays for our HPs\n", + " params = {\n", + " 'max_depth':np.array([7]),\n", + " 'min_child_weight':np.array([3]),\n", + " #'max_depth':np.arange( 5, 9, 1 ),\n", + " #'min_child_weight':np.arange( 1, 5, 1 ),\n", + " ##'gamma':np.arange( 0.0, 1.0, 0.1 ),\n", + " ##'colsample_bytree':np.arange( 0.4, 1.0, 0.1 ),\n", + " ##'subsample':np.arange( 0.4, 1.0, 0.1 ),\n", + " ##'scale_pos_weight':np.arange( 0.4, 1.6, 0.1 )\n", + " }\n", + "\n", + " # Perform timed grid scan with established n_estimator cutoff and early stopping\n", + " stime = time.time()\n", + " gs = GridSearchCV(estimator=alg,\n", + " param_grid=params,\n", + " scoring=metric,\n", + " iid=False,\n", + " cv=kf,\n", + " n_jobs=-1) \n", + " gs.fit(X_train, y_train, eval_metric=[\"logloss\",\"error\"],\n", + " eval_set=[(X_train, y_train), (X_test, y_test)],\n", + " verbose=False, early_stopping_rounds=early_stop)\n", + " print(\"XGBoost grid-scan --- %s seconds ---\" % (time.time() - stime))\n", + "\n", + " # Return suggested parameters, performance and best model\n", + " training_monitor(gs.best_estimator_)\n", + " print(\"Suggestion:\", gs.best_params_)\n", + " print(\"Accuracy:\" ,gs.best_score_)\n", + " return gs" + ], + "execution_count": null, + "outputs": [] + }, + { + "cell_type": "code", + "metadata": {}, + "source": [ + "# Running with estimators maximum for shortened training\n", + "bdt_st = XGBClassifier( learning_rate = LR, n_estimators=estimators,\n", + " seed=123, n_threads=-1)\n", + 
"\n", + "# Running timed hyperparameter gridscan\n", + "stime = time.time()\n", + "gs = hpgridscan(bdt_st, \"accuracy\", X1, y1, kf, bestfold)\n", + "bdt_gs = gs.best_estimator_\n", + "print(\"\\nhpgridscan(bdt_st) --- %s seconds ---\" % (time.time() - stime))\n", + "\n", + "# Get model predictions\n", + "for df in [mc_df, bkg_df, data_df, training_data]:\n", + " df['XGBgs'] = bdt_gs.predict_proba(df[training_columns])[:,1]" + ], + "execution_count": null, + "outputs": [] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Even this naive grid scan, using the same fold as before for fair comparison, can provide significant improvements as demonstrated above. These may be pushed further by including more hyperparameters for a trade off with processing time. However, even with parrallisation these tasks can take hours or longer and might only provide improvement of O(>1%)." + ] + }, + { + "cell_type": "code", + "metadata": {}, + "source": [ + "## We could define a model using optimal hyperparameters from our grid scan\n", + "#bdt_opt = XGBClassifier( learning_rate = LR, n_estimators=1000, \n", + "# max_depth=gs.best_params_['max_depth'],\n", + "# min_child_weight=gs.best_params_['min_child_weight'], \n", + "# seed=123, n_threads=-1 )\n", + "\n", + "## Run with CV early stopping\n", + "#stime = time.time()\n", + "#estimators = modelfit(bdt_opt, 'error', X1, y1, training_columns, kf, bestfold)\n", + "#print(\"\\nmodelfit(bdt_opt) --- %s seconds ---\" % (time.time() - stime))\n", + "\n", + "## Get model predictions\n", + "#for df in [mc_df, bkg_df, data_df, training_data]:\n", + "# df['XGBopt'] = bdt_opt.predict_proba(df[training_columns])[:,1]" + ], + "execution_count": null, + "outputs": [] + }, + { + "cell_type": "code", + "metadata": {}, + "source": [ + "# Comapring model response from the end of last session to the end of this one\n", + "plt.figure()\n", + "plot_comparision('XGB', mc_df, bkg_df)\n", + "plot_comparision('XGBgs', mc_df, bkg_df)\n", 
+ "\n", + "# Comparing the impact on projected performance at each stage of the tutorial\n", + "plt.figure()\n", + "plot_significance(bdt, training_data, training_columns)\n", + "plot_significance(bdt_cv, training_data, training_columns)\n", + "plot_significance(bdt_es, training_data, training_columns)\n", + "plot_significance(bdt_gs, training_data, training_columns)\n", + "#plot_significance(bdt_opt, training_data, training_columns)\n", + "\n", + "# Comparing model performance for each level of tuning\n", + "plt.figure()\n", + "plot_roc(bdt, training_data, training_columns)\n", + "plot_roc(bdt_cv, training_data, training_columns)\n", + "plot_roc(bdt_es, training_data, training_columns)\n", + "plot_roc(bdt_gs, training_data, training_columns)\n", + "#plot_roc(bdt_opt, training_data, training_columns)" + ], + "execution_count": null, + "outputs": [] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "You can also choose a higher learning rate to perform course scans of your space and decrease it again to retrain your final model. If you can afford to, it might be best to include learning rate itself as a parameter in your grid. With some libraries you can specify your choice of kernel. 
Both these choices will impact your optimal maximum number of iterations, so setting it sufficiently high and using early stopping might be a good strategy.\n", + "\n", + "For less exhaustive and non-discritised methods try smart combinations of the following to perform adaptive scans or build your own: \n", + "* sklearn.model_selection.RandomizedSearchCV\n", + "* sklearn.model_selection.GridSearchCV\n", + "\n", + "Moving to higher dimentional optimisation problems may require more sophisticated solutions:\n", + "* skopt.BayesSearchCV\n", + "* hyperopt.tpe" + ] + } + ] +} From 8e2b4ab7cdb4d31a847679f6c616a349d8fa0d10 Mon Sep 17 00:00:00 2001 From: jvmead Date: Fri, 28 Feb 2020 20:21:02 +0000 Subject: [PATCH 14/54] re-adding formatting and versioning --- advanced-python/4bModelTuning.ipynb | 21 +++++++++++++++++++++ 1 file changed, 21 insertions(+) diff --git a/advanced-python/4bModelTuning.ipynb b/advanced-python/4bModelTuning.ipynb index 74a7a780..69063425 100644 --- a/advanced-python/4bModelTuning.ipynb +++ b/advanced-python/4bModelTuning.ipynb @@ -1,4 +1,25 @@ { + "nbformat": 4, + "nbformat_minor": 0, + "metadata": { + "kernelspec": { + "name": "python3", + "display_name": "Python 3" + "language": "python", + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.7.3" + }, + }, "cells": [ { "cell_type": "markdown", From 1a449bbd4d193d7cb4e2ca41db70cf7289b195bb Mon Sep 17 00:00:00 2001 From: jvmead Date: Fri, 28 Feb 2020 20:34:48 +0000 Subject: [PATCH 15/54] Update 4bModelTuning.ipynb --- advanced-python/4bModelTuning.ipynb | 1191 +++++++++++++-------------- 1 file changed, 595 insertions(+), 596 deletions(-) diff --git a/advanced-python/4bModelTuning.ipynb b/advanced-python/4bModelTuning.ipynb index 69063425..00a728a6 100644 --- a/advanced-python/4bModelTuning.ipynb +++ 
b/advanced-python/4bModelTuning.ipynb @@ -1,598 +1,597 @@ { - "nbformat": 4, - "nbformat_minor": 0, - "metadata": { - "kernelspec": { - "name": "python3", - "display_name": "Python 3" - "language": "python", - }, - "language_info": { - "codemirror_mode": { - "name": "ipython", - "version": 3 - }, - "file_extension": ".py", - "mimetype": "text/x-python", - "name": "python", - "nbconvert_exporter": "python", - "pygments_lexer": "ipython3", - "version": "3.7.3" - }, - }, - "cells": [ - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "### Model tuning setup" - ] - }, - { - "cell_type": "code", - "metadata": {}, - "source": [ - "#%store -r bkg_df\n", - "#%store -r mc_df\n", - "#%store -r data_df" - ], - "execution_count": null, - "outputs": [] - }, - { - "cell_type": "code", - "metadata": {}, - "source": [ - "#@title\n", - "!pip install uproot\n", - "!pip install sklearn\n", - "\n", - "import time\n", - "from matplotlib import pyplot as plt\n", - "import uproot\n", - "\n", - "import numpy as np\n", - "import pandas as pd\n", - "import xgboost as xgb\n", - "\n", - "from xgboost.sklearn import XGBClassifier\n", - "from sklearn.ensemble import AdaBoostClassifier, GradientBoostingClassifier\n", - "\n", - "from sklearn import metrics\n", - "from sklearn.metrics import roc_curve, auc\n", - "\n", - "from sklearn.model_selection import KFold, train_test_split, cross_validate, cross_val_score, GridSearchCV" - ], - "execution_count": null, - "outputs": [] - }, - { - "cell_type": "code", - "metadata": {}, - "source": [ - "def plot_mass(df):\n", - " counts, bins, _ = plt.hist(df['Jpsi_M'], bins=100, range=[2.75, 3.5], histtype='step')\n", - " # You can also use LaTeX in the axis label\n", - " plt.xlabel('$J/\\\\psi$ mass [GeV]')\n", - " plt.xlim(bins[0], bins[-1])" - ], - "execution_count": null, - "outputs": [] - }, - { - "cell_type": "code", - "metadata": {}, - "source": [ - "def plot_comparision(var, mc_df, bkg_df):\n", - " _, bins, _ = plt.hist(mc_df[var], 
bins=100, histtype='step', label='MC', density=1)\n", - " _, bins, _ = plt.hist(bkg_df[var], bins=bins, histtype='step', label='Background', density=1)\n", - " plt.xlabel(var)\n", - " plt.xlim(bins[0], bins[-1])\n", - " plt.legend(loc='best')" - ], - "execution_count": null, - "outputs": [] - }, - { - "cell_type": "code", - "metadata": {}, - "source": [ - "def plot_roc(bdt, training_data, training_columns, label=None):\n", - " y_score = bdt.predict_proba(training_data[training_columns])[:,1]\n", - " fpr, tpr, thresholds = roc_curve(training_data['catagory'], y_score)\n", - " area = auc(fpr, tpr)\n", - "\n", - " plt.plot([0, 1], [0, 1], color='grey', linestyle='--')\n", - " if label:\n", - " plt.plot(fpr, tpr, label=f'{label} (area = {area:.2f})')\n", - " else:\n", - " plt.plot(fpr, tpr, label=f'ROC curve (area = {area:.2f})')\n", - " plt.xlim(0.0, 1.0)\n", - " plt.ylim(0.0, 1.0)\n", - " plt.xlabel('False Positive Rate')\n", - " plt.ylabel('True Positive Rate')\n", - " plt.legend(loc='lower right')\n", - " # We can make the plot look nicer by forcing the grid to be square\n", - " plt.gca().set_aspect('equal', adjustable='box')" - ], - "execution_count": null, - "outputs": [] - }, - { - "cell_type": "code", - "metadata": {}, - "source": [ - "def plot_significance(bdt, training_data, training_columns, label=None):\n", - " y_score = bdt.predict_proba(training_data[training_columns])[:,1]\n", - " fpr, tpr, thresholds = roc_curve(training_data['catagory'], y_score)\n", - "\n", - " n_sig = 1200\n", - " n_bkg = 23000\n", - " S = n_sig*tpr\n", - " B = n_bkg*fpr\n", - " metric = S/np.sqrt(S+B)\n", - "\n", - " plt.plot(thresholds, metric, label=label)\n", - " plt.xlabel('BDT cut value')\n", - " plt.ylabel('$\\\\frac{S}{\\\\sqrt{S+B}}$')\n", - " plt.xlim(0, 1.0)\n", - "\n", - " optimal_cut = thresholds[np.argmax(metric)]\n", - " plt.axvline(optimal_cut, color='black', linestyle='--')" - ], - "execution_count": null, - "outputs": [] - }, - { - "cell_type": "code", - "metadata": 
{}, - "source": [ - "#max_entries = 1000 # try running with low stats for bug fixing your changes quickly \n", - "data_df = uproot.open('https://cern.ch/starterkit/data/advanced-python-2018/real_data.root',\n", - " httpsource={'chunkbytes': 1024*1024, 'limitbytes': 33554432, 'parallel': 64}\n", - " )['DecayTree'].pandas.df()#(entrystop=max_entries)\n", - "mc_df = uproot.open('https://cern.ch/starterkit/data/advanced-python-2018/simulated_data.root',\n", - " httpsource={'chunkbytes': 1024*1024, 'limitbytes': 33554432, 'parallel': 64}\n", - " )['DecayTree'].pandas.df()#(entrystop=max_entries)\n", - "bkg_df = data_df.query('~(3.0 < Jpsi_M < 3.2)')\n", - "\n", - "for df in [mc_df, data_df, bkg_df]:\n", - " df.eval('Jpsi_eta = arctanh(Jpsi_PZ/Jpsi_P)', inplace=True)\n", - " df.eval('mup_P = sqrt(mum_PX**2 + mum_PY**2 + mum_PZ**2)', inplace=True)\n", - " df.eval('mum_P = sqrt(mum_PX**2 + mum_PY**2 + mum_PZ**2)', inplace=True)\n", - "\n", - "bkg_df['catagory'] = 0 # Use 0 for background\n", - "mc_df['catagory'] = 1 # Use 1 for signal\n", - "training_data = pd.concat([bkg_df, mc_df], copy=True, ignore_index=True)\n", - "for df in [mc_df, bkg_df, data_df, training_data]:\n", - " df['IPdiff'] = np.abs(df['mum_PT'] - df['mup_PT'])\n", - " \n", - "training_columns = [\n", - " 'Jpsi_PT',\n", - " 'mup_PT', 'mup_eta', 'mup_ProbNNmu', 'mup_IP',\n", - " 'mum_PT', 'mum_eta', 'mum_ProbNNmu', 'mum_IP',\n", - "]" - ], - "execution_count": null, - "outputs": [] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "Previously we trained an XGBClassifier with the default settings, with learning rate = 0.3 and maximum iterations = 100. This cut off to the training process may be limiting the performance of our model. We can monitor the performance of our model as a function of training iteration and stop the training when the gradient approximates zero. 
" - ] - }, - { - "cell_type": "code", - "metadata": {}, - "source": [ - "X1, y1 = training_data[training_columns], training_data['catagory']\n", - "X_train, X_test, y_train, y_test = train_test_split(X1, y1) \n", - "# default train_size = 0.25, this can be varied to suit your data\n", - "\n", - "LR = 0.3 # the coefficient of step size decay, eta, has alias 'learning_rate' with default 0.3\n", - "\n", - "stime = time.time()\n", - "bdt = XGBClassifier(learning_rate = LR, n_estimators=100, seed=123, n_threads=-1)\n", - "bdt.fit(training_data[training_columns], training_data['catagory'])\n", - "print(\"XGBoost --- %s seconds ---\" % (time.time() - stime))\n", - "\n", - "for df in [mc_df, bkg_df, data_df, training_data]:\n", - " df['XGB'] = bdt.predict_proba(df[training_columns])[:,1]" - ], - "execution_count": null, - "outputs": [] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "### Cross-validation\n", - "\n", - "Splitting the data into randomised subsets for training allows you to monitor your model's performance on the fly using the statistically independant remainder of your sample - this is called cross-validation (CV). We can see below that at the 100th iteration the metrics still show a trend of improvement." 
- ] - }, - { - "cell_type": "code", - "metadata": {}, - "source": [ - "def training_monitor(alg): \n", - "\n", - " # A model trained with eval_set and eval_metric will return evals_result\n", - " results = alg.evals_result()\n", - " epochs = len(results['validation_0']['logloss'])\n", - " x_axis = range(0, epochs)\n", - "\n", - " # Plotting logLoss as a function of training iteration\n", - " fig, ax = plt.subplots()\n", - " ax.plot(x_axis, results['validation_0']['logloss'], label='Train') # for each eval_set\n", - " if results['validation_1']: ax.plot(x_axis, results['validation_1']['logloss'], label='Test') \n", - " ax.legend()\n", - " plt.ylabel('LogLoss')\n", - " plt.title('LogLoss')\n", - " plt.show()\n", - " \n", - " # Plotting classification error as a function of training iteration\n", - " fig, ax = plt.subplots()\n", - " ax.plot(x_axis, results['validation_0']['error'], label='Train') # for each eval_set\n", - " if results['validation_1']: ax.plot(x_axis, results['validation_1']['error'], label='Test')\n", - " ax.legend()\n", - " plt.ylabel('Error')\n", - " plt.title('Error')\n", - " plt.show()" - ], - "execution_count": null, - "outputs": [] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "This involves training on less data but allows us to monitor progress to check if the model is becoming over-specific to our training sample. The minimisation of loss and classification error are common metrics for model assessment. As shown below, the cost to performance is negligible. If the test sample gradient were to invert this would be considered overtraining and is why monitoring performance without CV can be a time costly pitfall." 
- ] - }, - { - "cell_type": "code", - "metadata": {}, - "source": [ - "# Defining a model with multi-threading set to maximum\n", - "bdt_cv = XGBClassifier(learning_rate = LR, n_estimators=100, seed=123, n_threads=-1)\n", - "\n", - "# Model fitting with CV and printing out processing time\n", - "stime = time.time()\n", - "bdt_cv.fit(X_train, y_train, eval_metric=[\"logloss\",\"error\"],\n", - " eval_set=[(X_train, y_train), (X_test, y_test)], verbose=False)\n", - "print(\"\\nXGBoost cross-validation --- %s seconds ---\" % (time.time() - stime))\n", - "\n", - "# Writing model predictions out for data\n", - "training_monitor(bdt_cv)\n", - "for df in [mc_df, bkg_df, data_df, training_data]:\n", - " df['XGBcv'] = bdt_cv.predict_proba(df[training_columns])[:,1]\n", - "\n", - "# Drawing plot of model respone for signal and background classes\n", - "plt.figure()\n", - "plot_comparision('XGB', mc_df, bkg_df)\n", - "plot_comparision('XGBcv', mc_df, bkg_df)\n", - "\n", - "# Drawing signal significance comparison as a function of minimum cut on model response\n", - "plt.figure()\n", - "plot_significance(bdt, training_data, training_columns)\n", - "plot_significance(bdt_cv, training_data, training_columns)\n", - "\n", - "# Drawing the signal efficiency vs background rejection curve (ROC)\n", - "plt.figure()\n", - "plot_roc(bdt, training_data, training_columns)\n", - "plot_roc(bdt_cv, training_data, training_columns)" - ], - "execution_count": null, - "outputs": [] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "### $k$-folding & early stopping\n", - "\n", - "Performing CV on each of a number, k, of ways to split your data gives you k models to choose from. Some choose to average the performance across the models from each fold as any instability might imply the model will not be reliable. The results below seem stable; each fold provides a consistant performance across multiple metrics, so we'll just choose the best one." 
- ] - }, - { - "cell_type": "code", - "metadata": {}, - "source": [ - "# Defining the folds with a seed to test consistently \n", - "splits = 4 # to match 0.25 value of test_train_split default though this may not be optimal\n", - "kf = KFold(splits, True, random_state=123)\n", - "\n", - "# Printing processing time of the kfold cross-validation\n", - "stime = time.time()\n", - "for train, test in kf.split(X1):\n", - " X_train, X_test = X1.iloc[train], X1.iloc[test]\n", - " y_train, y_test = y1.iloc[train], y1.iloc[test]\n", - " bdt.fit(X_train,y_train)\n", - "print(\"\\nXGBoost k-folding --- %s seconds ---\" % (time.time() - stime))\n", - "\n", - "# Calculating scores of each fold using variety of CV-metrics\n", - "cv_acc = cross_val_score(bdt, X_test, y_test, cv=splits, scoring=\"accuracy\", n_jobs=-1)\n", - "cv_los = cross_val_score(bdt, X_test, y_test, cv=splits, scoring=\"neg_log_loss\", n_jobs=-1)\n", - "cv_auc = cross_val_score(bdt, X_test, y_test, cv=splits, scoring=\"roc_auc\", n_jobs=-1)\n", - "\n", - "# Printing results and indicating best fold\n", - "print(\"accuracy: \",cv_acc, \" -> best fold =\", np.argmax(cv_acc) )\n", - "print(\"-logloss: \",cv_los, \" -> best fold =\", np.argmax(cv_los) )\n", - "print(\"roc_auc: \",cv_auc, \" -> best fold =\", np.argmax(cv_auc) )\n", - "bestfold = np.argmax(cv_acc)" - ], - "execution_count": null, - "outputs": [] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "Early stopping defines a maximum number of rounds the cross-validation metric (we'll use 'error'=1-accuracy) is allowed to not improve before training is terminated. As is standard, we will be reverting back to a 'previous best' model based on test sample score, this helps avoid overtraining. Early stopping prevents us training too many of extra models thus saving time. Set the limit too small though and your training might be cut off prematurely." 
- ] - }, - { - "cell_type": "code", - "metadata": {}, - "source": [ - "def modelfit(alg, metric, params, label, predictors, kfold, fbest, early_stop=10):\n", - "\n", - " # Loading data split inputs providing best fold result\n", - " for k, (train, test) in enumerate(kf.split(params)):\n", - " if (k==fbest):\n", - " X_train, X_test = params.iloc[train], params.iloc[test]\n", - " y_train, y_test = label.iloc[train], label.iloc[test]\n", - "\n", - " # Defining data in terms of training variables and class label\n", - " xgb_param = alg.get_xgb_params()\n", - " data = xgb.DMatrix(params, label=label, feature_names=predictors, nthread=-1)\n", - " \n", - " # Runs timed CV on our model using early stopping based on our metric\n", - " stime = time.time()\n", - " cvresult = xgb.cv(xgb_param,\n", - " data,\n", - " num_boost_round=alg.get_params()['n_estimators'],\n", - " #nfold=cv_folds, # to use in build folding\n", - " folds=kfold, # use -> ignores nfold \n", - " metrics=metric,\n", - " early_stopping_rounds=early_stop)\n", - " alg.set_params(n_estimators=cvresult.shape[0])\n", - " print(\"\\nXGBoost early-stop folding --- %s seconds ---\" % (time.time() - stime))\n", - "\n", - " # Fitting the algorithm on the data with CV evaluation early stopping\n", - " stime = time.time()\n", - " alg.fit(X_train, y_train, eval_metric=[\"logloss\",\"error\"],\n", - " eval_set=[(X_train, y_train), (X_test, y_test)],\n", - " verbose=False, early_stopping_rounds=early_stop)\n", - " training_monitor(alg)\n", - " print(\"XGBoost early-stop limit --- %s seconds ---\" % (time.time() - stime))\n", - "\n", - " # Predicting training set:\n", - " train_predictions = alg.predict(X_train) \n", - " test_predictions = alg.predict(X_test)\n", - "\n", - " # Printing model report: \n", - " print(\"\\nModel Report : best iteration \"+str(cvresult.shape[0]))\n", - " print(\"Train Accuracy : \"+str(metrics.accuracy_score(y_train, train_predictions)))\n", - " print(\"Test Accuracy : 
\"+str(metrics.accuracy_score(y_test, test_predictions)))\n", - " return cvresult.shape[0]" - ], - "execution_count": null, - "outputs": [] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "This function incorporates the k-folding CV and early stopping, saving not only the optimal model but also the index of its training iteration. This means, in our subsequent steps, we can apply an upper limit on training for models based on the convergence of the default hyperparameters, saving us some time. " - ] - }, - { - "cell_type": "code", - "metadata": {}, - "source": [ - "# Defining model with high maximum estimators for use with early stopping\n", - "bdt_es = XGBClassifier(learning_rate = LR, n_estimators=1000,\n", - " # Default values of other hyperparamters\n", - " #max_depth=6, min_child_weight=1,\n", - " #gamma=0, subsample=0.8,\n", - " #colsample_bytree=0.8, scale_pos_weight=1,\n", - " #objective='binary:logistic', # default for binary classification\n", - " #objective='mutli:softprob', num_class=3, # for multiclassifiers\n", - " seed=123, n_threads=-1)\n", - "\n", - "# Timing the CV using early stopping\n", - "stime = time.time()\n", - "estimators = modelfit(bdt_es, \"error\", X1, y1, training_columns, kf, bestfold)\n", - "print(\"\\nmodelfit(bdt_es) --- %s seconds ---\" % (time.time() - stime))\n", - "\n", - "# Saving model predictions\n", - "for df in [mc_df, bkg_df, data_df, training_data]:\n", - " df['XGBes'] = bdt_es.predict_proba(df[training_columns])[:,1]" - ], - "execution_count": null, - "outputs": [] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "This provides us with an improved model as well as a benchmark to test against in both performance and training efficiency. 
When training using new combinations of hyperparameters, the maximum number of estimators from our model report will cut off any new models improving more slowly than our default, while, for more efficient models, the early stopping will kick in." - ] - }, - { - "cell_type": "code", - "metadata": {}, - "source": [ - "# Drawing plot to compare model response for signal and background classes\n", - "plt.figure()\n", - "plot_comparision('XGBcv', mc_df, bkg_df)\n", - "plot_comparision('XGBes', mc_df, bkg_df)\n", - "\n", - "# Drawing signal significance comparison as a function of minimum cut on model response\n", - "plt.figure()\n", - "plot_significance(bdt_cv, training_data, training_columns)\n", - "plot_significance(bdt_es, training_data, training_columns)\n", - "\n", - "# Drawing comaprison of the signal efficiency vs background rejection curve (ROC)\n", - "plt.figure()\n", - "plot_roc(bdt_cv, training_data, training_columns)\n", - "plot_roc(bdt_es, training_data, training_columns)" - ], - "execution_count": null, - "outputs": [] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "### Hyperameter optimisation\n", - "\n", - "Below we provide a \"grid\" of hyperparameters, defining the structure of the trees and constraints on the learning, but there are many more values to choose from and a larger parameter space to be explored. These optimsations are very problem specific and their impact will have to be weighed against the computing resources and timeframe you have at your disposal. For the sake of expedient demonstration we are comparing the default parameters to only one predetermined variation in 2 parameters. 
" - ] - }, - { - "cell_type": "code", - "metadata": {}, - "source": [ - "# Define a function that performs a gridscan of HPs\n", - "def hpgridscan(alg, metric, params, label, kfold, fbest, early_stop=10):\n", - "\n", - " # Load data fold with best performance\n", - " for k, (train, test) in enumerate(kf.split(params)):\n", - " if (k==fbest):\n", - " X_train, X_test = params.iloc[train], params.iloc[test]\n", - " y_train, y_test = label.iloc[train], label.iloc[test]\n", - "\n", - " # Define a dictionary of numpy arrays for our HPs\n", - " params = {\n", - " 'max_depth':np.array([7]),\n", - " 'min_child_weight':np.array([3]),\n", - " #'max_depth':np.arange( 5, 9, 1 ),\n", - " #'min_child_weight':np.arange( 1, 5, 1 ),\n", - " ##'gamma':np.arange( 0.0, 1.0, 0.1 ),\n", - " ##'colsample_bytree':np.arange( 0.4, 1.0, 0.1 ),\n", - " ##'subsample':np.arange( 0.4, 1.0, 0.1 ),\n", - " ##'scale_pos_weight':np.arange( 0.4, 1.6, 0.1 )\n", - " }\n", - "\n", - " # Perform timed grid scan with established n_estimator cutoff and early stopping\n", - " stime = time.time()\n", - " gs = GridSearchCV(estimator=alg,\n", - " param_grid=params,\n", - " scoring=metric,\n", - " iid=False,\n", - " cv=kf,\n", - " n_jobs=-1) \n", - " gs.fit(X_train, y_train, eval_metric=[\"logloss\",\"error\"],\n", - " eval_set=[(X_train, y_train), (X_test, y_test)],\n", - " verbose=False, early_stopping_rounds=early_stop)\n", - " print(\"XGBoost grid-scan --- %s seconds ---\" % (time.time() - stime))\n", - "\n", - " # Return suggested parameters, performance and best model\n", - " training_monitor(gs.best_estimator_)\n", - " print(\"Suggestion:\", gs.best_params_)\n", - " print(\"Accuracy:\" ,gs.best_score_)\n", - " return gs" - ], - "execution_count": null, - "outputs": [] - }, - { - "cell_type": "code", - "metadata": {}, - "source": [ - "# Running with estimators maximum for shortened training\n", - "bdt_st = XGBClassifier( learning_rate = LR, n_estimators=estimators,\n", - " seed=123, n_threads=-1)\n", - 
"\n", - "# Running timed hyperparameter gridscan\n", - "stime = time.time()\n", - "gs = hpgridscan(bdt_st, \"accuracy\", X1, y1, kf, bestfold)\n", - "bdt_gs = gs.best_estimator_\n", - "print(\"\\nhpgridscan(bdt_st) --- %s seconds ---\" % (time.time() - stime))\n", - "\n", - "# Get model predictions\n", - "for df in [mc_df, bkg_df, data_df, training_data]:\n", - " df['XGBgs'] = bdt_gs.predict_proba(df[training_columns])[:,1]" - ], - "execution_count": null, - "outputs": [] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "Even this naive grid scan, using the same fold as before for fair comparison, can provide significant improvements as demonstrated above. These may be pushed further by including more hyperparameters for a trade off with processing time. However, even with parrallisation these tasks can take hours or longer and might only provide improvement of O(>1%)." - ] - }, - { - "cell_type": "code", - "metadata": {}, - "source": [ - "## We could define a model using optimal hyperparameters from our grid scan\n", - "#bdt_opt = XGBClassifier( learning_rate = LR, n_estimators=1000, \n", - "# max_depth=gs.best_params_['max_depth'],\n", - "# min_child_weight=gs.best_params_['min_child_weight'], \n", - "# seed=123, n_threads=-1 )\n", - "\n", - "## Run with CV early stopping\n", - "#stime = time.time()\n", - "#estimators = modelfit(bdt_opt, 'error', X1, y1, training_columns, kf, bestfold)\n", - "#print(\"\\nmodelfit(bdt_opt) --- %s seconds ---\" % (time.time() - stime))\n", - "\n", - "## Get model predictions\n", - "#for df in [mc_df, bkg_df, data_df, training_data]:\n", - "# df['XGBopt'] = bdt_opt.predict_proba(df[training_columns])[:,1]" - ], - "execution_count": null, - "outputs": [] - }, - { - "cell_type": "code", - "metadata": {}, - "source": [ - "# Comapring model response from the end of last session to the end of this one\n", - "plt.figure()\n", - "plot_comparision('XGB', mc_df, bkg_df)\n", - "plot_comparision('XGBgs', mc_df, bkg_df)\n", 
- "\n", - "# Comparing the impact on projected performance at each stage of the tutorial\n", - "plt.figure()\n", - "plot_significance(bdt, training_data, training_columns)\n", - "plot_significance(bdt_cv, training_data, training_columns)\n", - "plot_significance(bdt_es, training_data, training_columns)\n", - "plot_significance(bdt_gs, training_data, training_columns)\n", - "#plot_significance(bdt_opt, training_data, training_columns)\n", - "\n", - "# Comparing model performance for each level of tuning\n", - "plt.figure()\n", - "plot_roc(bdt, training_data, training_columns)\n", - "plot_roc(bdt_cv, training_data, training_columns)\n", - "plot_roc(bdt_es, training_data, training_columns)\n", - "plot_roc(bdt_gs, training_data, training_columns)\n", - "#plot_roc(bdt_opt, training_data, training_columns)" - ], - "execution_count": null, - "outputs": [] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "You can also choose a higher learning rate to perform course scans of your space and decrease it again to retrain your final model. If you can afford to, it might be best to include learning rate itself as a parameter in your grid. With some libraries you can specify your choice of kernel. 
Both these choices will impact your optimal maximum number of iterations, so setting it sufficiently high and using early stopping might be a good strategy.\n", - "\n", - "For less exhaustive and non-discritised methods try smart combinations of the following to perform adaptive scans or build your own: \n", - "* sklearn.model_selection.RandomizedSearchCV\n", - "* sklearn.model_selection.GridSearchCV\n", - "\n", - "Moving to higher dimentional optimisation problems may require more sophisticated solutions:\n", - "* skopt.BayesSearchCV\n", - "* hyperopt.tpe" - ] - } - ] + "cells": [{ + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Model tuning setup" + ] + }, + { + "cell_type": "code", + "metadata": {}, + "source": [ + "#%store -r bkg_df\n", + "#%store -r mc_df\n", + "#%store -r data_df" + ], + "execution_count": null, + "outputs": [] + }, + { + "cell_type": "code", + "metadata": {}, + "source": [ + "#@title\n", + "!pip install uproot\n", + "!pip install sklearn\n", + "\n", + "import time\n", + "from matplotlib import pyplot as plt\n", + "import uproot\n", + "\n", + "import numpy as np\n", + "import pandas as pd\n", + "import xgboost as xgb\n", + "\n", + "from xgboost.sklearn import XGBClassifier\n", + "from sklearn.ensemble import AdaBoostClassifier, GradientBoostingClassifier\n", + "\n", + "from sklearn import metrics\n", + "from sklearn.metrics import roc_curve, auc\n", + "\n", + "from sklearn.model_selection import KFold, train_test_split, cross_validate, cross_val_score, GridSearchCV" + ], + "execution_count": null, + "outputs": [] + }, + { + "cell_type": "code", + "metadata": {}, + "source": [ + "def plot_mass(df):\n", + " counts, bins, _ = plt.hist(df['Jpsi_M'], bins=100, range=[2.75, 3.5], histtype='step')\n", + " # You can also use LaTeX in the axis label\n", + " plt.xlabel('$J/\\\\psi$ mass [GeV]')\n", + " plt.xlim(bins[0], bins[-1])" + ], + "execution_count": null, + "outputs": [] + }, + { + "cell_type": "code", + "metadata": {}, + 
"source": [ + "def plot_comparision(var, mc_df, bkg_df):\n", + " _, bins, _ = plt.hist(mc_df[var], bins=100, histtype='step', label='MC', density=1)\n", + " _, bins, _ = plt.hist(bkg_df[var], bins=bins, histtype='step', label='Background', density=1)\n", + " plt.xlabel(var)\n", + " plt.xlim(bins[0], bins[-1])\n", + " plt.legend(loc='best')" + ], + "execution_count": null, + "outputs": [] + }, + { + "cell_type": "code", + "metadata": {}, + "source": [ + "def plot_roc(bdt, training_data, training_columns, label=None):\n", + " y_score = bdt.predict_proba(training_data[training_columns])[:,1]\n", + " fpr, tpr, thresholds = roc_curve(training_data['catagory'], y_score)\n", + " area = auc(fpr, tpr)\n", + "\n", + " plt.plot([0, 1], [0, 1], color='grey', linestyle='--')\n", + " if label:\n", + " plt.plot(fpr, tpr, label=f'{label} (area = {area:.2f})')\n", + " else:\n", + " plt.plot(fpr, tpr, label=f'ROC curve (area = {area:.2f})')\n", + " plt.xlim(0.0, 1.0)\n", + " plt.ylim(0.0, 1.0)\n", + " plt.xlabel('False Positive Rate')\n", + " plt.ylabel('True Positive Rate')\n", + " plt.legend(loc='lower right')\n", + " # We can make the plot look nicer by forcing the grid to be square\n", + " plt.gca().set_aspect('equal', adjustable='box')" + ], + "execution_count": null, + "outputs": [] + }, + { + "cell_type": "code", + "metadata": {}, + "source": [ + "def plot_significance(bdt, training_data, training_columns, label=None):\n", + " y_score = bdt.predict_proba(training_data[training_columns])[:,1]\n", + " fpr, tpr, thresholds = roc_curve(training_data['catagory'], y_score)\n", + "\n", + " n_sig = 1200\n", + " n_bkg = 23000\n", + " S = n_sig*tpr\n", + " B = n_bkg*fpr\n", + " metric = S/np.sqrt(S+B)\n", + "\n", + " plt.plot(thresholds, metric, label=label)\n", + " plt.xlabel('BDT cut value')\n", + " plt.ylabel('$\\\\frac{S}{\\\\sqrt{S+B}}$')\n", + " plt.xlim(0, 1.0)\n", + "\n", + " optimal_cut = thresholds[np.argmax(metric)]\n", + " plt.axvline(optimal_cut, color='black', 
linestyle='--')" + ], + "execution_count": null, + "outputs": [] + }, + { + "cell_type": "code", + "metadata": {}, + "source": [ + "#max_entries = 1000 # try running with low stats for bug fixing your changes quickly \n", + "data_df = uproot.open('https://cern.ch/starterkit/data/advanced-python-2018/real_data.root',\n", + " httpsource={'chunkbytes': 1024*1024, 'limitbytes': 33554432, 'parallel': 64}\n", + " )['DecayTree'].pandas.df()#(entrystop=max_entries)\n", + "mc_df = uproot.open('https://cern.ch/starterkit/data/advanced-python-2018/simulated_data.root',\n", + " httpsource={'chunkbytes': 1024*1024, 'limitbytes': 33554432, 'parallel': 64}\n", + " )['DecayTree'].pandas.df()#(entrystop=max_entries)\n", + "bkg_df = data_df.query('~(3.0 < Jpsi_M < 3.2)')\n", + "\n", + "for df in [mc_df, data_df, bkg_df]:\n", + " df.eval('Jpsi_eta = arctanh(Jpsi_PZ/Jpsi_P)', inplace=True)\n", + " df.eval('mup_P = sqrt(mum_PX**2 + mum_PY**2 + mum_PZ**2)', inplace=True)\n", + " df.eval('mum_P = sqrt(mum_PX**2 + mum_PY**2 + mum_PZ**2)', inplace=True)\n", + "\n", + "bkg_df['catagory'] = 0 # Use 0 for background\n", + "mc_df['catagory'] = 1 # Use 1 for signal\n", + "training_data = pd.concat([bkg_df, mc_df], copy=True, ignore_index=True)\n", + "for df in [mc_df, bkg_df, data_df, training_data]:\n", + " df['IPdiff'] = np.abs(df['mum_PT'] - df['mup_PT'])\n", + " \n", + "training_columns = [\n", + " 'Jpsi_PT',\n", + " 'mup_PT', 'mup_eta', 'mup_ProbNNmu', 'mup_IP',\n", + " 'mum_PT', 'mum_eta', 'mum_ProbNNmu', 'mum_IP',\n", + "]" + ], + "execution_count": null, + "outputs": [] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Previously we trained an XGBClassifier with the default settings, with learning rate = 0.3 and maximum iterations = 100. This cut off to the training process may be limiting the performance of our model. We can monitor the performance of our model as a function of training iteration and stop the training when the gradient approximates zero. 
" + ] + }, + { + "cell_type": "code", + "metadata": {}, + "source": [ + "X1, y1 = training_data[training_columns], training_data['catagory']\n", + "X_train, X_test, y_train, y_test = train_test_split(X1, y1) \n", + "# default train_size = 0.25, this can be varied to suit your data\n", + "\n", + "LR = 0.3 # the coefficient of step size decay, eta, has alias 'learning_rate' with default 0.3\n", + "\n", + "stime = time.time()\n", + "bdt = XGBClassifier(learning_rate = LR, n_estimators=100, seed=123, n_threads=-1)\n", + "bdt.fit(training_data[training_columns], training_data['catagory'])\n", + "print(\"XGBoost --- %s seconds ---\" % (time.time() - stime))\n", + "\n", + "for df in [mc_df, bkg_df, data_df, training_data]:\n", + " df['XGB'] = bdt.predict_proba(df[training_columns])[:,1]" + ], + "execution_count": null, + "outputs": [] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Cross-validation\n", + "\n", + "Splitting the data into randomised subsets for training allows you to monitor your model's performance on the fly using the statistically independant remainder of your sample - this is called cross-validation (CV). We can see below that at the 100th iteration the metrics still show a trend of improvement." 
+ ] + }, + { + "cell_type": "code", + "metadata": {}, + "source": [ + "def training_monitor(alg): \n", + "\n", + " # A model trained with eval_set and eval_metric will return evals_result\n", + " results = alg.evals_result()\n", + " epochs = len(results['validation_0']['logloss'])\n", + " x_axis = range(0, epochs)\n", + "\n", + " # Plotting logLoss as a function of training iteration\n", + " fig, ax = plt.subplots()\n", + " ax.plot(x_axis, results['validation_0']['logloss'], label='Train') # for each eval_set\n", + " if results['validation_1']: ax.plot(x_axis, results['validation_1']['logloss'], label='Test') \n", + " ax.legend()\n", + " plt.ylabel('LogLoss')\n", + " plt.title('LogLoss')\n", + " plt.show()\n", + " \n", + " # Plotting classification error as a function of training iteration\n", + " fig, ax = plt.subplots()\n", + " ax.plot(x_axis, results['validation_0']['error'], label='Train') # for each eval_set\n", + " if results['validation_1']: ax.plot(x_axis, results['validation_1']['error'], label='Test')\n", + " ax.legend()\n", + " plt.ylabel('Error')\n", + " plt.title('Error')\n", + " plt.show()" + ], + "execution_count": null, + "outputs": [] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "This involves training on less data but allows us to monitor progress to check if the model is becoming over-specific to our training sample. The minimisation of loss and classification error are common metrics for model assessment. As shown below, the cost to performance is negligible. If the test sample gradient were to invert this would be considered overtraining and is why monitoring performance without CV can be a time costly pitfall." 
+ ] + }, + { + "cell_type": "code", + "metadata": {}, + "source": [ + "# Defining a model with multi-threading set to maximum\n", + "bdt_cv = XGBClassifier(learning_rate = LR, n_estimators=100, seed=123, n_threads=-1)\n", + "\n", + "# Model fitting with CV and printing out processing time\n", + "stime = time.time()\n", + "bdt_cv.fit(X_train, y_train, eval_metric=[\"logloss\",\"error\"],\n", + " eval_set=[(X_train, y_train), (X_test, y_test)], verbose=False)\n", + "print(\"\\nXGBoost cross-validation --- %s seconds ---\" % (time.time() - stime))\n", + "\n", + "# Writing model predictions out for data\n", + "training_monitor(bdt_cv)\n", + "for df in [mc_df, bkg_df, data_df, training_data]:\n", + " df['XGBcv'] = bdt_cv.predict_proba(df[training_columns])[:,1]\n", + "\n", + "# Drawing plot of model respone for signal and background classes\n", + "plt.figure()\n", + "plot_comparision('XGB', mc_df, bkg_df)\n", + "plot_comparision('XGBcv', mc_df, bkg_df)\n", + "\n", + "# Drawing signal significance comparison as a function of minimum cut on model response\n", + "plt.figure()\n", + "plot_significance(bdt, training_data, training_columns)\n", + "plot_significance(bdt_cv, training_data, training_columns)\n", + "\n", + "# Drawing the signal efficiency vs background rejection curve (ROC)\n", + "plt.figure()\n", + "plot_roc(bdt, training_data, training_columns)\n", + "plot_roc(bdt_cv, training_data, training_columns)" + ], + "execution_count": null, + "outputs": [] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### $k$-folding & early stopping\n", + "\n", + "Performing CV on each of a number, k, of ways to split your data gives you k models to choose from. Some choose to average the performance across the models from each fold as any instability might imply the model will not be reliable. The results below seem stable; each fold provides a consistant performance across multiple metrics, so we'll just choose the best one." 
+ ] + }, + { + "cell_type": "code", + "metadata": {}, + "source": [ + "# Defining the folds with a seed to test consistently \n", + "splits = 4 # to match 0.25 value of test_train_split default though this may not be optimal\n", + "kf = KFold(splits, True, random_state=123)\n", + "\n", + "# Printing processing time of the kfold cross-validation\n", + "stime = time.time()\n", + "for train, test in kf.split(X1):\n", + " X_train, X_test = X1.iloc[train], X1.iloc[test]\n", + " y_train, y_test = y1.iloc[train], y1.iloc[test]\n", + " bdt.fit(X_train,y_train)\n", + "print(\"\\nXGBoost k-folding --- %s seconds ---\" % (time.time() - stime))\n", + "\n", + "# Calculating scores of each fold using variety of CV-metrics\n", + "cv_acc = cross_val_score(bdt, X_test, y_test, cv=splits, scoring=\"accuracy\", n_jobs=-1)\n", + "cv_los = cross_val_score(bdt, X_test, y_test, cv=splits, scoring=\"neg_log_loss\", n_jobs=-1)\n", + "cv_auc = cross_val_score(bdt, X_test, y_test, cv=splits, scoring=\"roc_auc\", n_jobs=-1)\n", + "\n", + "# Printing results and indicating best fold\n", + "print(\"accuracy: \",cv_acc, \" -> best fold =\", np.argmax(cv_acc) )\n", + "print(\"-logloss: \",cv_los, \" -> best fold =\", np.argmax(cv_los) )\n", + "print(\"roc_auc: \",cv_auc, \" -> best fold =\", np.argmax(cv_auc) )\n", + "bestfold = np.argmax(cv_acc)" + ], + "execution_count": null, + "outputs": [] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Early stopping defines a maximum number of rounds the cross-validation metric (we'll use 'error'=1-accuracy) is allowed to not improve before training is terminated. As is standard, we will be reverting back to a 'previous best' model based on test sample score, this helps avoid overtraining. Early stopping prevents us training too many of extra models thus saving time. Set the limit too small though and your training might be cut off prematurely." 
+ ] + }, + { + "cell_type": "code", + "metadata": {}, + "source": [ + "def modelfit(alg, metric, params, label, predictors, kfold, fbest, early_stop=10):\n", + "\n", + " # Loading data split inputs providing best fold result\n", + " for k, (train, test) in enumerate(kf.split(params)):\n", + " if (k==fbest):\n", + " X_train, X_test = params.iloc[train], params.iloc[test]\n", + " y_train, y_test = label.iloc[train], label.iloc[test]\n", + "\n", + " # Defining data in terms of training variables and class label\n", + " xgb_param = alg.get_xgb_params()\n", + " data = xgb.DMatrix(params, label=label, feature_names=predictors, nthread=-1)\n", + " \n", + " # Runs timed CV on our model using early stopping based on our metric\n", + " stime = time.time()\n", + " cvresult = xgb.cv(xgb_param,\n", + " data,\n", + " num_boost_round=alg.get_params()['n_estimators'],\n", + " #nfold=cv_folds, # to use in build folding\n", + " folds=kfold, # use -> ignores nfold \n", + " metrics=metric,\n", + " early_stopping_rounds=early_stop)\n", + " alg.set_params(n_estimators=cvresult.shape[0])\n", + " print(\"\\nXGBoost early-stop folding --- %s seconds ---\" % (time.time() - stime))\n", + "\n", + " # Fitting the algorithm on the data with CV evaluation early stopping\n", + " stime = time.time()\n", + " alg.fit(X_train, y_train, eval_metric=[\"logloss\",\"error\"],\n", + " eval_set=[(X_train, y_train), (X_test, y_test)],\n", + " verbose=False, early_stopping_rounds=early_stop)\n", + " training_monitor(alg)\n", + " print(\"XGBoost early-stop limit --- %s seconds ---\" % (time.time() - stime))\n", + "\n", + " # Predicting training set:\n", + " train_predictions = alg.predict(X_train) \n", + " test_predictions = alg.predict(X_test)\n", + "\n", + " # Printing model report: \n", + " print(\"\\nModel Report : best iteration \"+str(cvresult.shape[0]))\n", + " print(\"Train Accuracy : \"+str(metrics.accuracy_score(y_train, train_predictions)))\n", + " print(\"Test Accuracy : 
\"+str(metrics.accuracy_score(y_test, test_predictions)))\n", + " return cvresult.shape[0]" + ], + "execution_count": null, + "outputs": [] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "This function incorporates the k-folding CV and early stopping, saving not only the optimal model but also the index of its training iteration. This means, in our subsequent steps, we can apply an upper limit on training for models based on the convergence of the default hyperparameters, saving us some time. " + ] + }, + { + "cell_type": "code", + "metadata": {}, + "source": [ + "# Defining model with high maximum estimators for use with early stopping\n", + "bdt_es = XGBClassifier(learning_rate = LR, n_estimators=1000,\n", + " # Default values of other hyperparamters\n", + " #max_depth=6, min_child_weight=1,\n", + " #gamma=0, subsample=0.8,\n", + " #colsample_bytree=0.8, scale_pos_weight=1,\n", + " #objective='binary:logistic', # default for binary classification\n", + " #objective='mutli:softprob', num_class=3, # for multiclassifiers\n", + " seed=123, n_threads=-1)\n", + "\n", + "# Timing the CV using early stopping\n", + "stime = time.time()\n", + "estimators = modelfit(bdt_es, \"error\", X1, y1, training_columns, kf, bestfold)\n", + "print(\"\\nmodelfit(bdt_es) --- %s seconds ---\" % (time.time() - stime))\n", + "\n", + "# Saving model predictions\n", + "for df in [mc_df, bkg_df, data_df, training_data]:\n", + " df['XGBes'] = bdt_es.predict_proba(df[training_columns])[:,1]" + ], + "execution_count": null, + "outputs": [] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "This provides us with an improved model as well as a benchmark to test against in both performance and training efficiency. 
When training using new combinations of hyperparameters, the maximum number of estimators from our model report will cut off any new models improving more slowly than our default, while, for more efficient models, the early stopping will kick in." + ] + }, + { + "cell_type": "code", + "metadata": {}, + "source": [ + "# Drawing plot to compare model response for signal and background classes\n", + "plt.figure()\n", + "plot_comparision('XGBcv', mc_df, bkg_df)\n", + "plot_comparision('XGBes', mc_df, bkg_df)\n", + "\n", + "# Drawing signal significance comparison as a function of minimum cut on model response\n", + "plt.figure()\n", + "plot_significance(bdt_cv, training_data, training_columns)\n", + "plot_significance(bdt_es, training_data, training_columns)\n", + "\n", + "# Drawing comaprison of the signal efficiency vs background rejection curve (ROC)\n", + "plt.figure()\n", + "plot_roc(bdt_cv, training_data, training_columns)\n", + "plot_roc(bdt_es, training_data, training_columns)" + ], + "execution_count": null, + "outputs": [] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Hyperameter optimisation\n", + "\n", + "Below we provide a \"grid\" of hyperparameters, defining the structure of the trees and constraints on the learning, but there are many more values to choose from and a larger parameter space to be explored. These optimsations are very problem specific and their impact will have to be weighed against the computing resources and timeframe you have at your disposal. For the sake of expedient demonstration we are comparing the default parameters to only one predetermined variation in 2 parameters. 
" + ] + }, + { + "cell_type": "code", + "metadata": {}, + "source": [ + "# Define a function that performs a gridscan of HPs\n", + "def hpgridscan(alg, metric, params, label, kfold, fbest, early_stop=10):\n", + "\n", + " # Load data fold with best performance\n", + " for k, (train, test) in enumerate(kf.split(params)):\n", + " if (k==fbest):\n", + " X_train, X_test = params.iloc[train], params.iloc[test]\n", + " y_train, y_test = label.iloc[train], label.iloc[test]\n", + "\n", + " # Define a dictionary of numpy arrays for our HPs\n", + " params = {\n", + " 'max_depth':np.array([7]),\n", + " 'min_child_weight':np.array([3]),\n", + " #'max_depth':np.arange( 5, 9, 1 ),\n", + " #'min_child_weight':np.arange( 1, 5, 1 ),\n", + " ##'gamma':np.arange( 0.0, 1.0, 0.1 ),\n", + " ##'colsample_bytree':np.arange( 0.4, 1.0, 0.1 ),\n", + " ##'subsample':np.arange( 0.4, 1.0, 0.1 ),\n", + " ##'scale_pos_weight':np.arange( 0.4, 1.6, 0.1 )\n", + " }\n", + "\n", + " # Perform timed grid scan with established n_estimator cutoff and early stopping\n", + " stime = time.time()\n", + " gs = GridSearchCV(estimator=alg,\n", + " param_grid=params,\n", + " scoring=metric,\n", + " iid=False,\n", + " cv=kf,\n", + " n_jobs=-1) \n", + " gs.fit(X_train, y_train, eval_metric=[\"logloss\",\"error\"],\n", + " eval_set=[(X_train, y_train), (X_test, y_test)],\n", + " verbose=False, early_stopping_rounds=early_stop)\n", + " print(\"XGBoost grid-scan --- %s seconds ---\" % (time.time() - stime))\n", + "\n", + " # Return suggested parameters, performance and best model\n", + " training_monitor(gs.best_estimator_)\n", + " print(\"Suggestion:\", gs.best_params_)\n", + " print(\"Accuracy:\" ,gs.best_score_)\n", + " return gs" + ], + "execution_count": null, + "outputs": [] + }, + { + "cell_type": "code", + "metadata": {}, + "source": [ + "# Running with estimators maximum for shortened training\n", + "bdt_st = XGBClassifier( learning_rate = LR, n_estimators=estimators,\n", + " seed=123, n_threads=-1)\n", + 
"\n", + "# Running timed hyperparameter gridscan\n", + "stime = time.time()\n", + "gs = hpgridscan(bdt_st, \"accuracy\", X1, y1, kf, bestfold)\n", + "bdt_gs = gs.best_estimator_\n", + "print(\"\\nhpgridscan(bdt_st) --- %s seconds ---\" % (time.time() - stime))\n", + "\n", + "# Get model predictions\n", + "for df in [mc_df, bkg_df, data_df, training_data]:\n", + " df['XGBgs'] = bdt_gs.predict_proba(df[training_columns])[:,1]" + ], + "execution_count": null, + "outputs": [] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Even this naive grid scan, using the same fold as before for fair comparison, can provide significant improvements as demonstrated above. These may be pushed further by including more hyperparameters for a trade off with processing time. However, even with parrallisation these tasks can take hours or longer and might only provide improvement of O(>1%)." + ] + }, + { + "cell_type": "code", + "metadata": {}, + "source": [ + "## We could define a model using optimal hyperparameters from our grid scan\n", + "#bdt_opt = XGBClassifier( learning_rate = LR, n_estimators=1000, \n", + "# max_depth=gs.best_params_['max_depth'],\n", + "# min_child_weight=gs.best_params_['min_child_weight'], \n", + "# seed=123, n_threads=-1 )\n", + "\n", + "## Run with CV early stopping\n", + "#stime = time.time()\n", + "#estimators = modelfit(bdt_opt, 'error', X1, y1, training_columns, kf, bestfold)\n", + "#print(\"\\nmodelfit(bdt_opt) --- %s seconds ---\" % (time.time() - stime))\n", + "\n", + "## Get model predictions\n", + "#for df in [mc_df, bkg_df, data_df, training_data]:\n", + "# df['XGBopt'] = bdt_opt.predict_proba(df[training_columns])[:,1]" + ], + "execution_count": null, + "outputs": [] + }, + { + "cell_type": "code", + "metadata": {}, + "source": [ + "# Comapring model response from the end of last session to the end of this one\n", + "plt.figure()\n", + "plot_comparision('XGB', mc_df, bkg_df)\n", + "plot_comparision('XGBgs', mc_df, bkg_df)\n", 
+ "\n", + "# Comparing the impact on projected performance at each stage of the tutorial\n", + "plt.figure()\n", + "plot_significance(bdt, training_data, training_columns)\n", + "plot_significance(bdt_cv, training_data, training_columns)\n", + "plot_significance(bdt_es, training_data, training_columns)\n", + "plot_significance(bdt_gs, training_data, training_columns)\n", + "#plot_significance(bdt_opt, training_data, training_columns)\n", + "\n", + "# Comparing model performance for each level of tuning\n", + "plt.figure()\n", + "plot_roc(bdt, training_data, training_columns)\n", + "plot_roc(bdt_cv, training_data, training_columns)\n", + "plot_roc(bdt_es, training_data, training_columns)\n", + "plot_roc(bdt_gs, training_data, training_columns)\n", + "#plot_roc(bdt_opt, training_data, training_columns)" + ], + "execution_count": null, + "outputs": [] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "You can also choose a higher learning rate to perform course scans of your space and decrease it again to retrain your final model. If you can afford to, it might be best to include learning rate itself as a parameter in your grid. With some libraries you can specify your choice of kernel. 
Both these choices will impact your optimal maximum number of iterations, so setting it sufficiently high and using early stopping might be a good strategy.\n", + "\n", + "For less exhaustive and non-discritised methods try smart combinations of the following to perform adaptive scans or build your own: \n", + "* sklearn.model_selection.RandomizedSearchCV\n", + "* sklearn.model_selection.GridSearchCV\n", + "\n", + "Moving to higher dimentional optimisation problems may require more sophisticated solutions:\n", + "* skopt.BayesSearchCV\n", + "* hyperopt.tpe" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.7.3" + } + }, + "nbformat": 4, + "nbformat_minor": 4 } From 0c31cd4ddef70192a929056b66b6d89bdcfefd8c Mon Sep 17 00:00:00 2001 From: jvmead Date: Fri, 28 Feb 2020 20:35:20 +0000 Subject: [PATCH 16/54] removing fresh installs --- advanced-python/4bModelTuning.ipynb | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/advanced-python/4bModelTuning.ipynb b/advanced-python/4bModelTuning.ipynb index 00a728a6..54be394f 100644 --- a/advanced-python/4bModelTuning.ipynb +++ b/advanced-python/4bModelTuning.ipynb @@ -22,8 +22,8 @@ "metadata": {}, "source": [ "#@title\n", - "!pip install uproot\n", - "!pip install sklearn\n", + "#!pip install uproot\n", + "#!pip install sklearn\n", "\n", "import time\n", "from matplotlib import pyplot as plt\n", From 1d6c14b85a9262c1626a3d22ace678df5e927aa8 Mon Sep 17 00:00:00 2001 From: Chris Burr Date: Sat, 29 Feb 2020 06:03:59 +0100 Subject: [PATCH 17/54] Avoid build timeouts for an hour --- .travis.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.travis.yml b/.travis.yml index 
c64d58dd..6788b5b9 100644 --- a/.travis.yml +++ b/.travis.yml @@ -17,7 +17,7 @@ install: - pip install starterkit-ci>=0.0.12 script: - - starterkit_ci build + - travis_wait 60 starterkit_ci build - starterkit_ci check # With instructions from http://www.steveklabnik.com/automatically_update_github_pages_with_travis_example/ From 302316f2045c2e6b6b42d57956da25f8c8785c2f Mon Sep 17 00:00:00 2001 From: jvmead Date: Sat, 29 Feb 2020 11:17:55 +0000 Subject: [PATCH 18/54] Update and rename .travis.yml to .github.yml --- .github.yml | 36 ++++++++++++++++++++++++++++++++++++ .travis.yml | 29 ----------------------------- 2 files changed, 36 insertions(+), 29 deletions(-) create mode 100644 .github.yml delete mode 100644 .travis.yml diff --git a/.github.yml b/.github.yml new file mode 100644 index 00000000..60dbf5f4 --- /dev/null +++ b/.github.yml @@ -0,0 +1,36 @@ +name: SK Build +on: [push] +env: + global: + secure: "S+3Gu4EZXl6pOkoy93wo5RB3qawI+TEThIOCutyQCk3gwNw3s2QBgVzqTol6plzdi2JfhNpELE9KJovEqBr9RWK7eD7k0TJcw+PJzFaVEi/iV0ZpjYyTg2ttmAGTTPz382LuXeO+4L0GB3GYIVXN5P6waHppV6D8vPbqUARl/INaj6o/ik5Gsf1FUWSimshwECOpjuWdPQq7Ju5zoTLmnBFfubJmuXf79T4trTs5/XFCbgIC4zVLvMF6tW9XSdVSeBGQqF1QWjvMNRPorpGHj8kkaON86oEFxAN3Wnw/nfN+PKplV90XqpM1Z7kUM+vBz1jL32kbxeCYltv61CdLBzlaNaPi6F/V/jvM6ABU5g7i1BXCoBOGAdg3tzihqV8VH4vLRomCRZrp2GFpAE80ljkXcIwtAv1uNynQA5KbAVL78ocxwxlj3K100X+ZqWUtuWHJ6ZC5v3RIdyZb8m4zrvx6GAhk+5nDZauLHbCopcIPvFT2mwWkXd1dbExWP4o2190pk1CyUV5udF9B5NB1ReitVoCCgn1MTT5nWueWfsU9asSUcUZR9BqMPLQW9zXKJw9SRDeuuK6gWqWP0nze+ExOtZIabHcoJr5d+pMsbi7p+cS/JUbbFzbt9CVFjr5tYEezdlMRn91sqBE01I7VEavOxnr8/iAr7iBtBZpg6Dk=" +jobs: + build: + name: Set up Python 3.7 + uses: actions/setup-python@v1 + with: + python-version: 3.7.3 + runs-on: ubuntu-latest + #strategy: + # fail-fast: true + # max-parallel: -1 + # matrix: + # go: ["1.12.x", "1.13.x"] + steps: + - name: Install dependencies + run: + wget -nv http://repo.continuum.io/miniconda/Miniconda3-latest-Linux-x86_64.sh -O miniconda.sh + bash miniconda.sh 
-b -p $PWD/miniconda + source $PWD/miniconda/etc/profile.d/conda.sh + conda config --add channels conda-forge + conda env create -f environment.yml -n my-analysis-env + conda activate my-analysis-env + conda install --yes jupyterlab + pip install git+https://github.com/chrisburr/recommonmark.git@patch-1 + pip install starterkit-ci>=0.0.12 + - name: Starterkit CI + run: | + starterkit_ci build + starterkit_ci check + #- name: Test + # run: | + # test $TRAVIS_PULL_REQUEST == "false" && test $TRAVIS_BRANCH == "master" && starterkit_ci deploy diff --git a/.travis.yml b/.travis.yml deleted file mode 100644 index 6788b5b9..00000000 --- a/.travis.yml +++ /dev/null @@ -1,29 +0,0 @@ -language: python -python: - - "3.7" -cache: false -dist: xenial - -install: - - wget -nv http://repo.continuum.io/miniconda/Miniconda3-latest-Linux-x86_64.sh -O miniconda.sh - - bash miniconda.sh -b -p $PWD/miniconda - - source $PWD/miniconda/etc/profile.d/conda.sh - - conda config --add channels conda-forge - - conda env create -f environment.yml -n my-analysis-env - - conda activate my-analysis-env - - conda install --yes jupyterlab - # FIXME: Inline math is broken in upstream recommonmark - - pip install git+https://github.com/chrisburr/recommonmark.git@patch-1 - - pip install starterkit-ci>=0.0.12 - -script: - - travis_wait 60 starterkit_ci build - - starterkit_ci check - -# With instructions from http://www.steveklabnik.com/automatically_update_github_pages_with_travis_example/ -after_success: - - test $TRAVIS_PULL_REQUEST == "false" && test $TRAVIS_BRANCH == "master" && starterkit_ci deploy - -env: - global: - secure: 
"S+3Gu4EZXl6pOkoy93wo5RB3qawI+TEThIOCutyQCk3gwNw3s2QBgVzqTol6plzdi2JfhNpELE9KJovEqBr9RWK7eD7k0TJcw+PJzFaVEi/iV0ZpjYyTg2ttmAGTTPz382LuXeO+4L0GB3GYIVXN5P6waHppV6D8vPbqUARl/INaj6o/ik5Gsf1FUWSimshwECOpjuWdPQq7Ju5zoTLmnBFfubJmuXf79T4trTs5/XFCbgIC4zVLvMF6tW9XSdVSeBGQqF1QWjvMNRPorpGHj8kkaON86oEFxAN3Wnw/nfN+PKplV90XqpM1Z7kUM+vBz1jL32kbxeCYltv61CdLBzlaNaPi6F/V/jvM6ABU5g7i1BXCoBOGAdg3tzihqV8VH4vLRomCRZrp2GFpAE80ljkXcIwtAv1uNynQA5KbAVL78ocxwxlj3K100X+ZqWUtuWHJ6ZC5v3RIdyZb8m4zrvx6GAhk+5nDZauLHbCopcIPvFT2mwWkXd1dbExWP4o2190pk1CyUV5udF9B5NB1ReitVoCCgn1MTT5nWueWfsU9asSUcUZR9BqMPLQW9zXKJw9SRDeuuK6gWqWP0nze+ExOtZIabHcoJr5d+pMsbi7p+cS/JUbbFzbt9CVFjr5tYEezdlMRn91sqBE01I7VEavOxnr8/iAr7iBtBZpg6Dk=" From 057006cd927a93b7ddc0cf081332e68f0e59e17d Mon Sep 17 00:00:00 2001 From: jvmead Date: Sat, 29 Feb 2020 11:20:57 +0000 Subject: [PATCH 19/54] Update .github.yml --- .github.yml | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/.github.yml b/.github.yml index 60dbf5f4..41ccad7c 100644 --- a/.github.yml +++ b/.github.yml @@ -31,6 +31,6 @@ jobs: run: | starterkit_ci build starterkit_ci check - #- name: Test - # run: | - # test $TRAVIS_PULL_REQUEST == "false" && test $TRAVIS_BRANCH == "master" && starterkit_ci deploy + - name: Test + run: | + test $TRAVIS_PULL_REQUEST == "false" && test $TRAVIS_BRANCH == "master" && starterkit_ci deploy From 5f677ac955a208d8782a311ae430a513bc14d3b8 Mon Sep 17 00:00:00 2001 From: jvmead Date: Sat, 29 Feb 2020 11:29:39 +0000 Subject: [PATCH 20/54] Rename .github.yml to .github/workflows/ci.yml --- .github.yml => .github/workflows/ci.yml | 0 1 file changed, 0 insertions(+), 0 deletions(-) rename .github.yml => .github/workflows/ci.yml (100%) diff --git a/.github.yml b/.github/workflows/ci.yml similarity index 100% rename from .github.yml rename to .github/workflows/ci.yml From b12b8fbc3127245b3fd2a2d47f5213d603c83a91 Mon Sep 17 00:00:00 2001 From: jvmead Date: Sat, 29 Feb 2020 11:30:48 +0000 Subject: [PATCH 21/54] Rename ci.yml to ci.yaml 
--- .github/workflows/{ci.yml => ci.yaml} | 0 1 file changed, 0 insertions(+), 0 deletions(-) rename .github/workflows/{ci.yml => ci.yaml} (100%) diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yaml similarity index 100% rename from .github/workflows/ci.yml rename to .github/workflows/ci.yaml From 55049d90ffb8118b02c40df0e30b42719f6fc8ab Mon Sep 17 00:00:00 2001 From: jvmead Date: Sat, 29 Feb 2020 11:53:03 +0000 Subject: [PATCH 22/54] run: | wget -> wget: with: --- .github/workflows/ci.yaml | 37 ++++++++++++++++++++++--------------- 1 file changed, 22 insertions(+), 15 deletions(-) diff --git a/.github/workflows/ci.yaml b/.github/workflows/ci.yaml index 41ccad7c..b7f7a85e 100644 --- a/.github/workflows/ci.yaml +++ b/.github/workflows/ci.yaml @@ -15,22 +15,29 @@ jobs: # max-parallel: -1 # matrix: # go: ["1.12.x", "1.13.x"] + wget: + runs-on: ubuntu-latest + steps: + - name: wget + uses: wei/wget@v1 + with: + args: -nv http://repo.continuum.io/miniconda/Miniconda3-latest-Linux-x86_64.sh -O miniconda.sh steps: - - name: Install dependencies - run: - wget -nv http://repo.continuum.io/miniconda/Miniconda3-latest-Linux-x86_64.sh -O miniconda.sh - bash miniconda.sh -b -p $PWD/miniconda - source $PWD/miniconda/etc/profile.d/conda.sh - conda config --add channels conda-forge - conda env create -f environment.yml -n my-analysis-env - conda activate my-analysis-env - conda install --yes jupyterlab - pip install git+https://github.com/chrisburr/recommonmark.git@patch-1 - pip install starterkit-ci>=0.0.12 - - name: Starterkit CI - run: | - starterkit_ci build - starterkit_ci check + - name: Install dependencies + run: | + #wget -nv http://repo.continuum.io/miniconda/Miniconda3-latest-Linux-x86_64.sh -O miniconda.sh + bash miniconda.sh -b -p $PWD/miniconda + source $PWD/miniconda/etc/profile.d/conda.sh + conda config --add channels conda-forge + conda env create -f environment.yml -n my-analysis-env + conda activate my-analysis-env + conda install --yes jupyterlab + 
pip install git+https://github.com/chrisburr/recommonmark.git@patch-1 + pip install starterkit-ci>=0.0.12 + - name: Starterkit CI + run: | + starterkit_ci build + starterkit_ci check - name: Test run: | test $TRAVIS_PULL_REQUEST == "false" && test $TRAVIS_BRANCH == "master" && starterkit_ci deploy From 485f442075f2d3616c3d4295808e7fd4f991fcc6 Mon Sep 17 00:00:00 2001 From: jvmead Date: Sat, 29 Feb 2020 11:57:13 +0000 Subject: [PATCH 23/54] source -> sh --- .github/workflows/ci.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/ci.yaml b/.github/workflows/ci.yaml index b7f7a85e..979a479e 100644 --- a/.github/workflows/ci.yaml +++ b/.github/workflows/ci.yaml @@ -27,7 +27,7 @@ jobs: run: | #wget -nv http://repo.continuum.io/miniconda/Miniconda3-latest-Linux-x86_64.sh -O miniconda.sh bash miniconda.sh -b -p $PWD/miniconda - source $PWD/miniconda/etc/profile.d/conda.sh + sh $PWD/miniconda/etc/profile.d/conda.sh conda config --add channels conda-forge conda env create -f environment.yml -n my-analysis-env conda activate my-analysis-env From 1a36a39c39ea45d8f1f172848e0564c442af5797 Mon Sep 17 00:00:00 2001 From: Chris Burr Date: Sat, 29 Feb 2020 13:01:39 +0100 Subject: [PATCH 24/54] Remove travis token --- .github/workflows/ci.yaml | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/.github/workflows/ci.yaml b/.github/workflows/ci.yaml index 979a479e..3c634ded 100644 --- a/.github/workflows/ci.yaml +++ b/.github/workflows/ci.yaml @@ -1,8 +1,6 @@ name: SK Build on: [push] -env: - global: - secure: 
"S+3Gu4EZXl6pOkoy93wo5RB3qawI+TEThIOCutyQCk3gwNw3s2QBgVzqTol6plzdi2JfhNpELE9KJovEqBr9RWK7eD7k0TJcw+PJzFaVEi/iV0ZpjYyTg2ttmAGTTPz382LuXeO+4L0GB3GYIVXN5P6waHppV6D8vPbqUARl/INaj6o/ik5Gsf1FUWSimshwECOpjuWdPQq7Ju5zoTLmnBFfubJmuXf79T4trTs5/XFCbgIC4zVLvMF6tW9XSdVSeBGQqF1QWjvMNRPorpGHj8kkaON86oEFxAN3Wnw/nfN+PKplV90XqpM1Z7kUM+vBz1jL32kbxeCYltv61CdLBzlaNaPi6F/V/jvM6ABU5g7i1BXCoBOGAdg3tzihqV8VH4vLRomCRZrp2GFpAE80ljkXcIwtAv1uNynQA5KbAVL78ocxwxlj3K100X+ZqWUtuWHJ6ZC5v3RIdyZb8m4zrvx6GAhk+5nDZauLHbCopcIPvFT2mwWkXd1dbExWP4o2190pk1CyUV5udF9B5NB1ReitVoCCgn1MTT5nWueWfsU9asSUcUZR9BqMPLQW9zXKJw9SRDeuuK6gWqWP0nze+ExOtZIabHcoJr5d+pMsbi7p+cS/JUbbFzbt9CVFjr5tYEezdlMRn91sqBE01I7VEavOxnr8/iAr7iBtBZpg6Dk=" + jobs: build: name: Set up Python 3.7 From 7401f272132900c3a8145cf5d6607e11da43993f Mon Sep 17 00:00:00 2001 From: Chris Burr Date: Sat, 29 Feb 2020 13:03:40 +0100 Subject: [PATCH 25/54] Fix indentation --- .github/workflows/ci.yaml | 18 +++++++++--------- 1 file changed, 9 insertions(+), 9 deletions(-) diff --git a/.github/workflows/ci.yaml b/.github/workflows/ci.yaml index 3c634ded..e00a2503 100644 --- a/.github/workflows/ci.yaml +++ b/.github/workflows/ci.yaml @@ -23,15 +23,15 @@ jobs: steps: - name: Install dependencies run: | - #wget -nv http://repo.continuum.io/miniconda/Miniconda3-latest-Linux-x86_64.sh -O miniconda.sh - bash miniconda.sh -b -p $PWD/miniconda - sh $PWD/miniconda/etc/profile.d/conda.sh - conda config --add channels conda-forge - conda env create -f environment.yml -n my-analysis-env - conda activate my-analysis-env - conda install --yes jupyterlab - pip install git+https://github.com/chrisburr/recommonmark.git@patch-1 - pip install starterkit-ci>=0.0.12 + wget -nv http://repo.continuum.io/miniconda/Miniconda3-latest-Linux-x86_64.sh -O miniconda.sh + bash miniconda.sh -b -p $PWD/miniconda + sh $PWD/miniconda/etc/profile.d/conda.sh + conda config --add channels conda-forge + conda env create -f environment.yml -n my-analysis-env + conda activate my-analysis-env + conda 
install --yes jupyterlab + pip install git+https://github.com/chrisburr/recommonmark.git@patch-1 + pip install starterkit-ci>=0.0.12 - name: Starterkit CI run: | starterkit_ci build From 91a9f5ca282a504b4d5cfed6757b55b28ed82712 Mon Sep 17 00:00:00 2001 From: jvmead Date: Sat, 29 Feb 2020 12:05:27 +0000 Subject: [PATCH 26/54] Update ci.yaml --- .github/workflows/ci.yaml | 7 ------- 1 file changed, 7 deletions(-) diff --git a/.github/workflows/ci.yaml b/.github/workflows/ci.yaml index e00a2503..94e051cd 100644 --- a/.github/workflows/ci.yaml +++ b/.github/workflows/ci.yaml @@ -13,13 +13,6 @@ jobs: # max-parallel: -1 # matrix: # go: ["1.12.x", "1.13.x"] - wget: - runs-on: ubuntu-latest - steps: - - name: wget - uses: wei/wget@v1 - with: - args: -nv http://repo.continuum.io/miniconda/Miniconda3-latest-Linux-x86_64.sh -O miniconda.sh steps: - name: Install dependencies run: | From dc198b485dc431ea39272fc53b53442cec37d0f7 Mon Sep 17 00:00:00 2001 From: Chris Burr Date: Sat, 29 Feb 2020 13:07:58 +0100 Subject: [PATCH 27/54] Fix more linter errors --- .github/workflows/ci.yaml | 40 +++++++++++++++++++-------------------- 1 file changed, 19 insertions(+), 21 deletions(-) diff --git a/.github/workflows/ci.yaml b/.github/workflows/ci.yaml index 94e051cd..a7ce9c76 100644 --- a/.github/workflows/ci.yaml +++ b/.github/workflows/ci.yaml @@ -4,31 +4,29 @@ on: [push] jobs: build: name: Set up Python 3.7 - uses: actions/setup-python@v1 - with: - python-version: 3.7.3 runs-on: ubuntu-latest #strategy: # fail-fast: true # max-parallel: -1 # matrix: # go: ["1.12.x", "1.13.x"] - steps: - - name: Install dependencies - run: | - wget -nv http://repo.continuum.io/miniconda/Miniconda3-latest-Linux-x86_64.sh -O miniconda.sh - bash miniconda.sh -b -p $PWD/miniconda - sh $PWD/miniconda/etc/profile.d/conda.sh - conda config --add channels conda-forge - conda env create -f environment.yml -n my-analysis-env - conda activate my-analysis-env - conda install --yes jupyterlab - pip install 
git+https://github.com/chrisburr/recommonmark.git@patch-1 - pip install starterkit-ci>=0.0.12 - - name: Starterkit CI - run: | - starterkit_ci build - starterkit_ci check - - name: Test - run: | + steps: + - uses: actions/checkout@v1 + - name: Install dependencies + run: | + wget -nv http://repo.continuum.io/miniconda/Miniconda3-latest-Linux-x86_64.sh -O miniconda.sh + bash miniconda.sh -b -p $PWD/miniconda + sh $PWD/miniconda/etc/profile.d/conda.sh + conda config --add channels conda-forge + conda env create -f environment.yml -n my-analysis-env + conda activate my-analysis-env + conda install --yes jupyterlab + pip install git+https://github.com/chrisburr/recommonmark.git@patch-1 + pip install starterkit-ci>=0.0.12 + - name: Starterkit CI + run: | + starterkit_ci build + starterkit_ci check + - name: Test + run: | test $TRAVIS_PULL_REQUEST == "false" && test $TRAVIS_BRANCH == "master" && starterkit_ci deploy From c529a89c9fff31935ec68eb6585aedc9e0ecacc4 Mon Sep 17 00:00:00 2001 From: jvmead Date: Sat, 29 Feb 2020 12:14:21 +0000 Subject: [PATCH 28/54] timing info --- advanced-python/4bModelTuning.ipynb | 25 ++++++++++++++++++++++++- 1 file changed, 24 insertions(+), 1 deletion(-) diff --git a/advanced-python/4bModelTuning.ipynb b/advanced-python/4bModelTuning.ipynb index 54be394f..4fe1a6c2 100644 --- a/advanced-python/4bModelTuning.ipynb +++ b/advanced-python/4bModelTuning.ipynb @@ -44,6 +44,17 @@ "execution_count": null, "outputs": [] }, + { + "cell_type": "code", + "metadata": {}, + "source": [ + "# Time and processing check for the lesson\n", + "stt = time.time()\n", + "stc = time.clock()" + ], + "execution_count": null, + "outputs": [] + }, { "cell_type": "code", "metadata": {}, @@ -571,7 +582,18 @@ "* skopt.BayesSearchCV\n", "* hyperopt.tpe" ] - } + }, + { + "cell_type": "code", + "metadata": {}, + "source": [ + "# Final lesson time and processing time check\n", + "print(\"Notebook real time --- %s seconds ---\" % (time.time() - stt))\n", + "print(\"Notebook 
CPU time --- %s seconds ---\" % (time.clock() - stc))" + ], + "execution_count": null, + "outputs": [] + } ], "metadata": { "kernelspec": { @@ -592,6 +614,7 @@ "version": "3.7.3" } }, + "nbformat": 4, "nbformat_minor": 4 } From b7fd112c1ca6badabeb48c8acec508e194d32204 Mon Sep 17 00:00:00 2001 From: jvmead Date: Sat, 29 Feb 2020 12:28:03 +0000 Subject: [PATCH 29/54] initialise with conda init bash --- .github/workflows/ci.yaml | 1 + 1 file changed, 1 insertion(+) diff --git a/.github/workflows/ci.yaml b/.github/workflows/ci.yaml index a7ce9c76..8a95daa6 100644 --- a/.github/workflows/ci.yaml +++ b/.github/workflows/ci.yaml @@ -17,6 +17,7 @@ jobs: wget -nv http://repo.continuum.io/miniconda/Miniconda3-latest-Linux-x86_64.sh -O miniconda.sh bash miniconda.sh -b -p $PWD/miniconda sh $PWD/miniconda/etc/profile.d/conda.sh + conda init bash conda config --add channels conda-forge conda env create -f environment.yml -n my-analysis-env conda activate my-analysis-env From e578fa29102615b58475d69f7c54402a8cbcd6df Mon Sep 17 00:00:00 2001 From: Chris Burr Date: Sat, 29 Feb 2020 13:56:55 +0100 Subject: [PATCH 30/54] Use conda from GitHub --- .github/workflows/ci.yaml | 11 ++++------- 1 file changed, 4 insertions(+), 7 deletions(-) diff --git a/.github/workflows/ci.yaml b/.github/workflows/ci.yaml index 8a95daa6..fcd49582 100644 --- a/.github/workflows/ci.yaml +++ b/.github/workflows/ci.yaml @@ -14,10 +14,7 @@ jobs: - uses: actions/checkout@v1 - name: Install dependencies run: | - wget -nv http://repo.continuum.io/miniconda/Miniconda3-latest-Linux-x86_64.sh -O miniconda.sh - bash miniconda.sh -b -p $PWD/miniconda - sh $PWD/miniconda/etc/profile.d/conda.sh - conda init bash + source ${CONDA}/etc/profile.d/conda.sh conda config --add channels conda-forge conda env create -f environment.yml -n my-analysis-env conda activate my-analysis-env @@ -28,6 +25,6 @@ jobs: run: | starterkit_ci build starterkit_ci check - - name: Test - run: | - test $TRAVIS_PULL_REQUEST == "false" && test 
$TRAVIS_BRANCH == "master" && starterkit_ci deploy + # - name: Test + # run: | + # test $TRAVIS_PULL_REQUEST == "false" && test $TRAVIS_BRANCH == "master" && starterkit_ci deploy From 54d244612e91de7d89a7092c460790f1e318c789 Mon Sep 17 00:00:00 2001 From: Chris Burr Date: Sat, 29 Feb 2020 13:57:34 +0100 Subject: [PATCH 31/54] Run on pull requests as well as push --- .github/workflows/ci.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/ci.yaml b/.github/workflows/ci.yaml index fcd49582..60301d58 100644 --- a/.github/workflows/ci.yaml +++ b/.github/workflows/ci.yaml @@ -1,5 +1,5 @@ name: SK Build -on: [push] +on: [push, pull_request] jobs: build: From 29bcb9aba32179e153667e7de04b4b87ca17f546 Mon Sep 17 00:00:00 2001 From: Chris Burr Date: Sat, 29 Feb 2020 14:04:07 +0100 Subject: [PATCH 32/54] Activate my-analysis-env when building webpage --- .github/workflows/ci.yaml | 1 + 1 file changed, 1 insertion(+) diff --git a/.github/workflows/ci.yaml b/.github/workflows/ci.yaml index 60301d58..9c70ce08 100644 --- a/.github/workflows/ci.yaml +++ b/.github/workflows/ci.yaml @@ -23,6 +23,7 @@ jobs: pip install starterkit-ci>=0.0.12 - name: Starterkit CI run: | + source ${CONDA}/bin/activate my-analysis-env starterkit_ci build starterkit_ci check # - name: Test From a360e427d70ea637e482b290b923a5411cf195c4 Mon Sep 17 00:00:00 2001 From: jvmead Date: Sat, 29 Feb 2020 16:04:59 +0000 Subject: [PATCH 33/54] optimal significance cut fix, add mass comparison --- advanced-python/4bModelTuning.ipynb | 1264 ++++++++++++++------------- 1 file changed, 646 insertions(+), 618 deletions(-) diff --git a/advanced-python/4bModelTuning.ipynb b/advanced-python/4bModelTuning.ipynb index 4fe1a6c2..7f8a4fa3 100644 --- a/advanced-python/4bModelTuning.ipynb +++ b/advanced-python/4bModelTuning.ipynb @@ -1,620 +1,648 @@ { - "cells": [{ - "cell_type": "markdown", - "metadata": {}, - "source": [ - "### Model tuning setup" - ] - }, - { - "cell_type": "code", - 
"metadata": {}, - "source": [ - "#%store -r bkg_df\n", - "#%store -r mc_df\n", - "#%store -r data_df" - ], - "execution_count": null, - "outputs": [] - }, - { - "cell_type": "code", - "metadata": {}, - "source": [ - "#@title\n", - "#!pip install uproot\n", - "#!pip install sklearn\n", - "\n", - "import time\n", - "from matplotlib import pyplot as plt\n", - "import uproot\n", - "\n", - "import numpy as np\n", - "import pandas as pd\n", - "import xgboost as xgb\n", - "\n", - "from xgboost.sklearn import XGBClassifier\n", - "from sklearn.ensemble import AdaBoostClassifier, GradientBoostingClassifier\n", - "\n", - "from sklearn import metrics\n", - "from sklearn.metrics import roc_curve, auc\n", - "\n", - "from sklearn.model_selection import KFold, train_test_split, cross_validate, cross_val_score, GridSearchCV" - ], - "execution_count": null, - "outputs": [] - }, - { - "cell_type": "code", - "metadata": {}, - "source": [ - "# Time and processing check for the lesson\n", - "stt = time.time()\n", - "stc = time.clock()" - ], - "execution_count": null, - "outputs": [] - }, - { - "cell_type": "code", - "metadata": {}, - "source": [ - "def plot_mass(df):\n", - " counts, bins, _ = plt.hist(df['Jpsi_M'], bins=100, range=[2.75, 3.5], histtype='step')\n", - " # You can also use LaTeX in the axis label\n", - " plt.xlabel('$J/\\\\psi$ mass [GeV]')\n", - " plt.xlim(bins[0], bins[-1])" - ], - "execution_count": null, - "outputs": [] - }, - { - "cell_type": "code", - "metadata": {}, - "source": [ - "def plot_comparision(var, mc_df, bkg_df):\n", - " _, bins, _ = plt.hist(mc_df[var], bins=100, histtype='step', label='MC', density=1)\n", - " _, bins, _ = plt.hist(bkg_df[var], bins=bins, histtype='step', label='Background', density=1)\n", - " plt.xlabel(var)\n", - " plt.xlim(bins[0], bins[-1])\n", - " plt.legend(loc='best')" - ], - "execution_count": null, - "outputs": [] - }, - { - "cell_type": "code", - "metadata": {}, - "source": [ - "def plot_roc(bdt, training_data, 
training_columns, label=None):\n", - " y_score = bdt.predict_proba(training_data[training_columns])[:,1]\n", - " fpr, tpr, thresholds = roc_curve(training_data['catagory'], y_score)\n", - " area = auc(fpr, tpr)\n", - "\n", - " plt.plot([0, 1], [0, 1], color='grey', linestyle='--')\n", - " if label:\n", - " plt.plot(fpr, tpr, label=f'{label} (area = {area:.2f})')\n", - " else:\n", - " plt.plot(fpr, tpr, label=f'ROC curve (area = {area:.2f})')\n", - " plt.xlim(0.0, 1.0)\n", - " plt.ylim(0.0, 1.0)\n", - " plt.xlabel('False Positive Rate')\n", - " plt.ylabel('True Positive Rate')\n", - " plt.legend(loc='lower right')\n", - " # We can make the plot look nicer by forcing the grid to be square\n", - " plt.gca().set_aspect('equal', adjustable='box')" - ], - "execution_count": null, - "outputs": [] - }, - { - "cell_type": "code", - "metadata": {}, - "source": [ - "def plot_significance(bdt, training_data, training_columns, label=None):\n", - " y_score = bdt.predict_proba(training_data[training_columns])[:,1]\n", - " fpr, tpr, thresholds = roc_curve(training_data['catagory'], y_score)\n", - "\n", - " n_sig = 1200\n", - " n_bkg = 23000\n", - " S = n_sig*tpr\n", - " B = n_bkg*fpr\n", - " metric = S/np.sqrt(S+B)\n", - "\n", - " plt.plot(thresholds, metric, label=label)\n", - " plt.xlabel('BDT cut value')\n", - " plt.ylabel('$\\\\frac{S}{\\\\sqrt{S+B}}$')\n", - " plt.xlim(0, 1.0)\n", - "\n", - " optimal_cut = thresholds[np.argmax(metric)]\n", - " plt.axvline(optimal_cut, color='black', linestyle='--')" - ], - "execution_count": null, - "outputs": [] - }, - { - "cell_type": "code", - "metadata": {}, - "source": [ - "#max_entries = 1000 # try running with low stats for bug fixing your changes quickly \n", - "data_df = uproot.open('https://cern.ch/starterkit/data/advanced-python-2018/real_data.root',\n", - " httpsource={'chunkbytes': 1024*1024, 'limitbytes': 33554432, 'parallel': 64}\n", - " )['DecayTree'].pandas.df()#(entrystop=max_entries)\n", - "mc_df = 
uproot.open('https://cern.ch/starterkit/data/advanced-python-2018/simulated_data.root',\n", - " httpsource={'chunkbytes': 1024*1024, 'limitbytes': 33554432, 'parallel': 64}\n", - " )['DecayTree'].pandas.df()#(entrystop=max_entries)\n", - "bkg_df = data_df.query('~(3.0 < Jpsi_M < 3.2)')\n", - "\n", - "for df in [mc_df, data_df, bkg_df]:\n", - " df.eval('Jpsi_eta = arctanh(Jpsi_PZ/Jpsi_P)', inplace=True)\n", - " df.eval('mup_P = sqrt(mum_PX**2 + mum_PY**2 + mum_PZ**2)', inplace=True)\n", - " df.eval('mum_P = sqrt(mum_PX**2 + mum_PY**2 + mum_PZ**2)', inplace=True)\n", - "\n", - "bkg_df['catagory'] = 0 # Use 0 for background\n", - "mc_df['catagory'] = 1 # Use 1 for signal\n", - "training_data = pd.concat([bkg_df, mc_df], copy=True, ignore_index=True)\n", - "for df in [mc_df, bkg_df, data_df, training_data]:\n", - " df['IPdiff'] = np.abs(df['mum_PT'] - df['mup_PT'])\n", - " \n", - "training_columns = [\n", - " 'Jpsi_PT',\n", - " 'mup_PT', 'mup_eta', 'mup_ProbNNmu', 'mup_IP',\n", - " 'mum_PT', 'mum_eta', 'mum_ProbNNmu', 'mum_IP',\n", - "]" - ], - "execution_count": null, - "outputs": [] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "Previously we trained an XGBClassifier with the default settings, with learning rate = 0.3 and maximum iterations = 100. This cut off to the training process may be limiting the performance of our model. We can monitor the performance of our model as a function of training iteration and stop the training when the gradient approximates zero. 
" - ] - }, - { - "cell_type": "code", - "metadata": {}, - "source": [ - "X1, y1 = training_data[training_columns], training_data['catagory']\n", - "X_train, X_test, y_train, y_test = train_test_split(X1, y1) \n", - "# default train_size = 0.25, this can be varied to suit your data\n", - "\n", - "LR = 0.3 # the coefficient of step size decay, eta, has alias 'learning_rate' with default 0.3\n", - "\n", - "stime = time.time()\n", - "bdt = XGBClassifier(learning_rate = LR, n_estimators=100, seed=123, n_threads=-1)\n", - "bdt.fit(training_data[training_columns], training_data['catagory'])\n", - "print(\"XGBoost --- %s seconds ---\" % (time.time() - stime))\n", - "\n", - "for df in [mc_df, bkg_df, data_df, training_data]:\n", - " df['XGB'] = bdt.predict_proba(df[training_columns])[:,1]" - ], - "execution_count": null, - "outputs": [] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "### Cross-validation\n", - "\n", - "Splitting the data into randomised subsets for training allows you to monitor your model's performance on the fly using the statistically independant remainder of your sample - this is called cross-validation (CV). We can see below that at the 100th iteration the metrics still show a trend of improvement." 
- ] - }, - { - "cell_type": "code", - "metadata": {}, - "source": [ - "def training_monitor(alg): \n", - "\n", - " # A model trained with eval_set and eval_metric will return evals_result\n", - " results = alg.evals_result()\n", - " epochs = len(results['validation_0']['logloss'])\n", - " x_axis = range(0, epochs)\n", - "\n", - " # Plotting logLoss as a function of training iteration\n", - " fig, ax = plt.subplots()\n", - " ax.plot(x_axis, results['validation_0']['logloss'], label='Train') # for each eval_set\n", - " if results['validation_1']: ax.plot(x_axis, results['validation_1']['logloss'], label='Test') \n", - " ax.legend()\n", - " plt.ylabel('LogLoss')\n", - " plt.title('LogLoss')\n", - " plt.show()\n", - " \n", - " # Plotting classification error as a function of training iteration\n", - " fig, ax = plt.subplots()\n", - " ax.plot(x_axis, results['validation_0']['error'], label='Train') # for each eval_set\n", - " if results['validation_1']: ax.plot(x_axis, results['validation_1']['error'], label='Test')\n", - " ax.legend()\n", - " plt.ylabel('Error')\n", - " plt.title('Error')\n", - " plt.show()" - ], - "execution_count": null, - "outputs": [] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "This involves training on less data but allows us to monitor progress to check if the model is becoming over-specific to our training sample. The minimisation of loss and classification error are common metrics for model assessment. As shown below, the cost to performance is negligible. If the test sample gradient were to invert this would be considered overtraining and is why monitoring performance without CV can be a time costly pitfall." 
- ] - }, - { - "cell_type": "code", - "metadata": {}, - "source": [ - "# Defining a model with multi-threading set to maximum\n", - "bdt_cv = XGBClassifier(learning_rate = LR, n_estimators=100, seed=123, n_threads=-1)\n", - "\n", - "# Model fitting with CV and printing out processing time\n", - "stime = time.time()\n", - "bdt_cv.fit(X_train, y_train, eval_metric=[\"logloss\",\"error\"],\n", - " eval_set=[(X_train, y_train), (X_test, y_test)], verbose=False)\n", - "print(\"\\nXGBoost cross-validation --- %s seconds ---\" % (time.time() - stime))\n", - "\n", - "# Writing model predictions out for data\n", - "training_monitor(bdt_cv)\n", - "for df in [mc_df, bkg_df, data_df, training_data]:\n", - " df['XGBcv'] = bdt_cv.predict_proba(df[training_columns])[:,1]\n", - "\n", - "# Drawing plot of model respone for signal and background classes\n", - "plt.figure()\n", - "plot_comparision('XGB', mc_df, bkg_df)\n", - "plot_comparision('XGBcv', mc_df, bkg_df)\n", - "\n", - "# Drawing signal significance comparison as a function of minimum cut on model response\n", - "plt.figure()\n", - "plot_significance(bdt, training_data, training_columns)\n", - "plot_significance(bdt_cv, training_data, training_columns)\n", - "\n", - "# Drawing the signal efficiency vs background rejection curve (ROC)\n", - "plt.figure()\n", - "plot_roc(bdt, training_data, training_columns)\n", - "plot_roc(bdt_cv, training_data, training_columns)" - ], - "execution_count": null, - "outputs": [] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "### $k$-folding & early stopping\n", - "\n", - "Performing CV on each of a number, k, of ways to split your data gives you k models to choose from. Some choose to average the performance across the models from each fold as any instability might imply the model will not be reliable. The results below seem stable; each fold provides a consistant performance across multiple metrics, so we'll just choose the best one." 
- ] - }, - { - "cell_type": "code", - "metadata": {}, - "source": [ - "# Defining the folds with a seed to test consistently \n", - "splits = 4 # to match 0.25 value of test_train_split default though this may not be optimal\n", - "kf = KFold(splits, True, random_state=123)\n", - "\n", - "# Printing processing time of the kfold cross-validation\n", - "stime = time.time()\n", - "for train, test in kf.split(X1):\n", - " X_train, X_test = X1.iloc[train], X1.iloc[test]\n", - " y_train, y_test = y1.iloc[train], y1.iloc[test]\n", - " bdt.fit(X_train,y_train)\n", - "print(\"\\nXGBoost k-folding --- %s seconds ---\" % (time.time() - stime))\n", - "\n", - "# Calculating scores of each fold using variety of CV-metrics\n", - "cv_acc = cross_val_score(bdt, X_test, y_test, cv=splits, scoring=\"accuracy\", n_jobs=-1)\n", - "cv_los = cross_val_score(bdt, X_test, y_test, cv=splits, scoring=\"neg_log_loss\", n_jobs=-1)\n", - "cv_auc = cross_val_score(bdt, X_test, y_test, cv=splits, scoring=\"roc_auc\", n_jobs=-1)\n", - "\n", - "# Printing results and indicating best fold\n", - "print(\"accuracy: \",cv_acc, \" -> best fold =\", np.argmax(cv_acc) )\n", - "print(\"-logloss: \",cv_los, \" -> best fold =\", np.argmax(cv_los) )\n", - "print(\"roc_auc: \",cv_auc, \" -> best fold =\", np.argmax(cv_auc) )\n", - "bestfold = np.argmax(cv_acc)" - ], - "execution_count": null, - "outputs": [] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "Early stopping defines a maximum number of rounds the cross-validation metric (we'll use 'error'=1-accuracy) is allowed to not improve before training is terminated. As is standard, we will be reverting back to a 'previous best' model based on test sample score, this helps avoid overtraining. Early stopping prevents us training too many of extra models thus saving time. Set the limit too small though and your training might be cut off prematurely." 
- ] - }, - { - "cell_type": "code", - "metadata": {}, - "source": [ - "def modelfit(alg, metric, params, label, predictors, kfold, fbest, early_stop=10):\n", - "\n", - " # Loading data split inputs providing best fold result\n", - " for k, (train, test) in enumerate(kf.split(params)):\n", - " if (k==fbest):\n", - " X_train, X_test = params.iloc[train], params.iloc[test]\n", - " y_train, y_test = label.iloc[train], label.iloc[test]\n", - "\n", - " # Defining data in terms of training variables and class label\n", - " xgb_param = alg.get_xgb_params()\n", - " data = xgb.DMatrix(params, label=label, feature_names=predictors, nthread=-1)\n", - " \n", - " # Runs timed CV on our model using early stopping based on our metric\n", - " stime = time.time()\n", - " cvresult = xgb.cv(xgb_param,\n", - " data,\n", - " num_boost_round=alg.get_params()['n_estimators'],\n", - " #nfold=cv_folds, # to use in build folding\n", - " folds=kfold, # use -> ignores nfold \n", - " metrics=metric,\n", - " early_stopping_rounds=early_stop)\n", - " alg.set_params(n_estimators=cvresult.shape[0])\n", - " print(\"\\nXGBoost early-stop folding --- %s seconds ---\" % (time.time() - stime))\n", - "\n", - " # Fitting the algorithm on the data with CV evaluation early stopping\n", - " stime = time.time()\n", - " alg.fit(X_train, y_train, eval_metric=[\"logloss\",\"error\"],\n", - " eval_set=[(X_train, y_train), (X_test, y_test)],\n", - " verbose=False, early_stopping_rounds=early_stop)\n", - " training_monitor(alg)\n", - " print(\"XGBoost early-stop limit --- %s seconds ---\" % (time.time() - stime))\n", - "\n", - " # Predicting training set:\n", - " train_predictions = alg.predict(X_train) \n", - " test_predictions = alg.predict(X_test)\n", - "\n", - " # Printing model report: \n", - " print(\"\\nModel Report : best iteration \"+str(cvresult.shape[0]))\n", - " print(\"Train Accuracy : \"+str(metrics.accuracy_score(y_train, train_predictions)))\n", - " print(\"Test Accuracy : 
\"+str(metrics.accuracy_score(y_test, test_predictions)))\n", - " return cvresult.shape[0]" - ], - "execution_count": null, - "outputs": [] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "This function incorporates the k-folding CV and early stopping, saving not only the optimal model but also the index of its training iteration. This means, in our subsequent steps, we can apply an upper limit on training for models based on the convergence of the default hyperparameters, saving us some time. " - ] - }, - { - "cell_type": "code", - "metadata": {}, - "source": [ - "# Defining model with high maximum estimators for use with early stopping\n", - "bdt_es = XGBClassifier(learning_rate = LR, n_estimators=1000,\n", - " # Default values of other hyperparamters\n", - " #max_depth=6, min_child_weight=1,\n", - " #gamma=0, subsample=0.8,\n", - " #colsample_bytree=0.8, scale_pos_weight=1,\n", - " #objective='binary:logistic', # default for binary classification\n", - " #objective='mutli:softprob', num_class=3, # for multiclassifiers\n", - " seed=123, n_threads=-1)\n", - "\n", - "# Timing the CV using early stopping\n", - "stime = time.time()\n", - "estimators = modelfit(bdt_es, \"error\", X1, y1, training_columns, kf, bestfold)\n", - "print(\"\\nmodelfit(bdt_es) --- %s seconds ---\" % (time.time() - stime))\n", - "\n", - "# Saving model predictions\n", - "for df in [mc_df, bkg_df, data_df, training_data]:\n", - " df['XGBes'] = bdt_es.predict_proba(df[training_columns])[:,1]" - ], - "execution_count": null, - "outputs": [] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "This provides us with an improved model as well as a benchmark to test against in both performance and training efficiency. 
When training using new combinations of hyperparameters, the maximum number of estimators from our model report will cut off any new models improving more slowly than our default, while, for more efficient models, the early stopping will kick in." - ] - }, - { - "cell_type": "code", - "metadata": {}, - "source": [ - "# Drawing plot to compare model response for signal and background classes\n", - "plt.figure()\n", - "plot_comparision('XGBcv', mc_df, bkg_df)\n", - "plot_comparision('XGBes', mc_df, bkg_df)\n", - "\n", - "# Drawing signal significance comparison as a function of minimum cut on model response\n", - "plt.figure()\n", - "plot_significance(bdt_cv, training_data, training_columns)\n", - "plot_significance(bdt_es, training_data, training_columns)\n", - "\n", - "# Drawing comaprison of the signal efficiency vs background rejection curve (ROC)\n", - "plt.figure()\n", - "plot_roc(bdt_cv, training_data, training_columns)\n", - "plot_roc(bdt_es, training_data, training_columns)" - ], - "execution_count": null, - "outputs": [] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "### Hyperameter optimisation\n", - "\n", - "Below we provide a \"grid\" of hyperparameters, defining the structure of the trees and constraints on the learning, but there are many more values to choose from and a larger parameter space to be explored. These optimsations are very problem specific and their impact will have to be weighed against the computing resources and timeframe you have at your disposal. For the sake of expedient demonstration we are comparing the default parameters to only one predetermined variation in 2 parameters. 
" - ] - }, - { - "cell_type": "code", - "metadata": {}, - "source": [ - "# Define a function that performs a gridscan of HPs\n", - "def hpgridscan(alg, metric, params, label, kfold, fbest, early_stop=10):\n", - "\n", - " # Load data fold with best performance\n", - " for k, (train, test) in enumerate(kf.split(params)):\n", - " if (k==fbest):\n", - " X_train, X_test = params.iloc[train], params.iloc[test]\n", - " y_train, y_test = label.iloc[train], label.iloc[test]\n", - "\n", - " # Define a dictionary of numpy arrays for our HPs\n", - " params = {\n", - " 'max_depth':np.array([7]),\n", - " 'min_child_weight':np.array([3]),\n", - " #'max_depth':np.arange( 5, 9, 1 ),\n", - " #'min_child_weight':np.arange( 1, 5, 1 ),\n", - " ##'gamma':np.arange( 0.0, 1.0, 0.1 ),\n", - " ##'colsample_bytree':np.arange( 0.4, 1.0, 0.1 ),\n", - " ##'subsample':np.arange( 0.4, 1.0, 0.1 ),\n", - " ##'scale_pos_weight':np.arange( 0.4, 1.6, 0.1 )\n", - " }\n", - "\n", - " # Perform timed grid scan with established n_estimator cutoff and early stopping\n", - " stime = time.time()\n", - " gs = GridSearchCV(estimator=alg,\n", - " param_grid=params,\n", - " scoring=metric,\n", - " iid=False,\n", - " cv=kf,\n", - " n_jobs=-1) \n", - " gs.fit(X_train, y_train, eval_metric=[\"logloss\",\"error\"],\n", - " eval_set=[(X_train, y_train), (X_test, y_test)],\n", - " verbose=False, early_stopping_rounds=early_stop)\n", - " print(\"XGBoost grid-scan --- %s seconds ---\" % (time.time() - stime))\n", - "\n", - " # Return suggested parameters, performance and best model\n", - " training_monitor(gs.best_estimator_)\n", - " print(\"Suggestion:\", gs.best_params_)\n", - " print(\"Accuracy:\" ,gs.best_score_)\n", - " return gs" - ], - "execution_count": null, - "outputs": [] - }, - { - "cell_type": "code", - "metadata": {}, - "source": [ - "# Running with estimators maximum for shortened training\n", - "bdt_st = XGBClassifier( learning_rate = LR, n_estimators=estimators,\n", - " seed=123, n_threads=-1)\n", - 
"\n", - "# Running timed hyperparameter gridscan\n", - "stime = time.time()\n", - "gs = hpgridscan(bdt_st, \"accuracy\", X1, y1, kf, bestfold)\n", - "bdt_gs = gs.best_estimator_\n", - "print(\"\\nhpgridscan(bdt_st) --- %s seconds ---\" % (time.time() - stime))\n", - "\n", - "# Get model predictions\n", - "for df in [mc_df, bkg_df, data_df, training_data]:\n", - " df['XGBgs'] = bdt_gs.predict_proba(df[training_columns])[:,1]" - ], - "execution_count": null, - "outputs": [] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "Even this naive grid scan, using the same fold as before for fair comparison, can provide significant improvements as demonstrated above. These may be pushed further by including more hyperparameters for a trade off with processing time. However, even with parrallisation these tasks can take hours or longer and might only provide improvement of O(>1%)." - ] - }, - { - "cell_type": "code", - "metadata": {}, - "source": [ - "## We could define a model using optimal hyperparameters from our grid scan\n", - "#bdt_opt = XGBClassifier( learning_rate = LR, n_estimators=1000, \n", - "# max_depth=gs.best_params_['max_depth'],\n", - "# min_child_weight=gs.best_params_['min_child_weight'], \n", - "# seed=123, n_threads=-1 )\n", - "\n", - "## Run with CV early stopping\n", - "#stime = time.time()\n", - "#estimators = modelfit(bdt_opt, 'error', X1, y1, training_columns, kf, bestfold)\n", - "#print(\"\\nmodelfit(bdt_opt) --- %s seconds ---\" % (time.time() - stime))\n", - "\n", - "## Get model predictions\n", - "#for df in [mc_df, bkg_df, data_df, training_data]:\n", - "# df['XGBopt'] = bdt_opt.predict_proba(df[training_columns])[:,1]" - ], - "execution_count": null, - "outputs": [] - }, - { - "cell_type": "code", - "metadata": {}, - "source": [ - "# Comapring model response from the end of last session to the end of this one\n", - "plt.figure()\n", - "plot_comparision('XGB', mc_df, bkg_df)\n", - "plot_comparision('XGBgs', mc_df, bkg_df)\n", 
- "\n", - "# Comparing the impact on projected performance at each stage of the tutorial\n", - "plt.figure()\n", - "plot_significance(bdt, training_data, training_columns)\n", - "plot_significance(bdt_cv, training_data, training_columns)\n", - "plot_significance(bdt_es, training_data, training_columns)\n", - "plot_significance(bdt_gs, training_data, training_columns)\n", - "#plot_significance(bdt_opt, training_data, training_columns)\n", - "\n", - "# Comparing model performance for each level of tuning\n", - "plt.figure()\n", - "plot_roc(bdt, training_data, training_columns)\n", - "plot_roc(bdt_cv, training_data, training_columns)\n", - "plot_roc(bdt_es, training_data, training_columns)\n", - "plot_roc(bdt_gs, training_data, training_columns)\n", - "#plot_roc(bdt_opt, training_data, training_columns)" - ], - "execution_count": null, - "outputs": [] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "You can also choose a higher learning rate to perform course scans of your space and decrease it again to retrain your final model. If you can afford to, it might be best to include learning rate itself as a parameter in your grid. With some libraries you can specify your choice of kernel. 
Both these choices will impact your optimal maximum number of iterations, so setting it sufficiently high and using early stopping might be a good strategy.\n", - "\n", - "For less exhaustive and non-discritised methods try smart combinations of the following to perform adaptive scans or build your own: \n", - "* sklearn.model_selection.RandomizedSearchCV\n", - "* sklearn.model_selection.GridSearchCV\n", - "\n", - "Moving to higher dimentional optimisation problems may require more sophisticated solutions:\n", - "* skopt.BayesSearchCV\n", - "* hyperopt.tpe" - ] - }, - { - "cell_type": "code", - "metadata": {}, - "source": [ - "# Final lesson time and processing time check\n", - "print(\"Notebook real time --- %s seconds ---\" % (time.time() - stt))\n", - "print(\"Notebook CPU time --- %s seconds ---\" % (time.clock() - stc))" - ], - "execution_count": null, - "outputs": [] - } - ], - "metadata": { - "kernelspec": { - "display_name": "Python 3", - "language": "python", - "name": "python3" - }, - "language_info": { - "codemirror_mode": { - "name": "ipython", - "version": 3 - }, - "file_extension": ".py", - "mimetype": "text/x-python", - "name": "python", - "nbconvert_exporter": "python", - "pygments_lexer": "ipython3", - "version": "3.7.3" - } - }, - - "nbformat": 4, - "nbformat_minor": 4 + "cells": [ + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Model tuning setup" + ] + }, + { + "cell_type": "code", + "metadata": {}, + "source": [ + "#%store -r training_data\n", + "#%store -r training_columns\n", + "#%store -r bkg_df\n", + "#%store -r mc_df\n", + "#%store -r data_df" + ], + "execution_count": null, + "outputs": [] + }, + { + "cell_type": "code", + "metadata": {}, + "source": [ + "#@title\n", + "!pip install uproot\n", + "!pip install sklearn\n", + "\n", + "import time\n", + "from matplotlib import pyplot as plt\n", + "import uproot\n", + "\n", + "import numpy as np\n", + "import pandas as pd\n", + "import xgboost as xgb\n", + "\n", + "from 
xgboost.sklearn import XGBClassifier\n", + "from sklearn.ensemble import AdaBoostClassifier, GradientBoostingClassifier\n", + "\n", + "from sklearn import metrics\n", + "from sklearn.metrics import roc_curve, auc\n", + "\n", + "from sklearn.model_selection import KFold, train_test_split, cross_validate, cross_val_score, GridSearchCV" + ], + "execution_count": null, + "outputs": [] + }, + { + "cell_type": "code", + "metadata": {}, + "source": [ + "# Time and processing check for the lesson\n", + "stt = time.time()\n", + "stc = time.clock()" + ], + "execution_count": null, + "outputs": [] + }, + { + "cell_type": "code", + "metadata": {}, + "source": [ + "def plot_mass(df, label=\"\", norm=True):\n", + " counts, bins, _ = plt.hist(df['Jpsi_M'], label=label, bins=100, range=[2.75, 3.5], histtype='step', density=norm)\n", + " # You can also use LaTeX in the axis label\n", + " plt.xlabel('$J/\\\\psi$ mass [GeV]')\n", + " plt.xlim(bins[0], bins[-1])" + ], + "execution_count": null, + "outputs": [] + }, + { + "cell_type": "code", + "metadata": {}, + "source": [ + "def plot_comparision(var, mc_df, bkg_df):\n", + " _, bins, _ = plt.hist(mc_df[var], bins=100, histtype='step', label='MC', density=1)\n", + " _, bins, _ = plt.hist(bkg_df[var], bins=bins, histtype='step', label='Background', density=1)\n", + " plt.xlabel(var)\n", + " plt.xlim(bins[0], bins[-1])\n", + " plt.legend(loc='best')" + ], + "execution_count": null, + "outputs": [] + }, + { + "cell_type": "code", + "metadata": {}, + "source": [ + "def plot_roc(bdt, training_data, training_columns, label=None):\n", + " y_score = bdt.predict_proba(training_data[training_columns])[:,1]\n", + " fpr, tpr, thresholds = roc_curve(training_data['catagory'], y_score)\n", + " area = auc(fpr, tpr)\n", + "\n", + " plt.plot([0, 1], [0, 1], color='grey', linestyle='--')\n", + " if label:\n", + " plt.plot(fpr, tpr, label=f'{label} (area = {area:.2f})')\n", + " else:\n", + " plt.plot(fpr, tpr, label=f'ROC curve (area = {area:.2f})')\n", 
+ " plt.xlim(0.0, 1.0)\n", + " plt.ylim(0.0, 1.0)\n", + " plt.xlabel('False Positive Rate')\n", + " plt.ylabel('True Positive Rate')\n", + " plt.legend(loc='lower right')\n", + " # We can make the plot look nicer by forcing the grid to be square\n", + " plt.gca().set_aspect('equal', adjustable='box')" + ], + "execution_count": null, + "outputs": [] + }, + { + "cell_type": "code", + "metadata": {}, + "source": [ + "def plot_significance(bdt, training_data, training_columns, label):\n", + " y_score = bdt.predict_proba(training_data[training_columns])[:,1]\n", + " fpr, tpr, thresholds = roc_curve(training_data['catagory'], y_score)\n", + " \n", + " n_sig = 1200\n", + " n_bkg = 23000\n", + " S = n_sig*tpr + (n_sig*tpr==0)*1\n", + " B = n_bkg*fpr + (n_bkg*tpr==0)*1\n", + " metric = S/np.sqrt(S+B)\n", + "\n", + " plt.plot(thresholds, metric, label=label)\n", + " plt.xlabel('BDT cut value')\n", + " plt.ylabel('$\\\\frac{S}{\\\\sqrt{S+B}}$')\n", + " plt.xlim(0, 1.0)\n", + "\n", + " optimum = np.max(metric)\n", + " optimal_cut = thresholds[np.argmax(metric)]\n", + " print(label, \": S/sqrt(S+B) =\", optimum, \" at x =\", optimal_cut)\n", + " plt.axvline(x=optimal_cut, color='black', linewidth=1.0, linestyle='--')\n", + "\n", + " return optimal_cut" + ], + "execution_count": null, + "outputs": [] + }, + { + "cell_type": "code", + "metadata": {}, + "source": [ + "#max_entries = 1000 # try running with low stats for bug fixing your changes quickly \n", + "data_df = uproot.open('https://cern.ch/starterkit/data/advanced-python-2018/real_data.root',\n", + " httpsource={'chunkbytes': 1024*1024, 'limitbytes': 33554432, 'parallel': 64}\n", + " )['DecayTree'].pandas.df()#(entrystop=max_entries)\n", + "mc_df = uproot.open('https://cern.ch/starterkit/data/advanced-python-2018/simulated_data.root',\n", + " httpsource={'chunkbytes': 1024*1024, 'limitbytes': 33554432, 'parallel': 64}\n", + " )['DecayTree'].pandas.df()#(entrystop=max_entries)\n", + "bkg_df = data_df.query('~(3.0 < Jpsi_M < 
3.2)')\n", + "\n", + "for df in [mc_df, data_df, bkg_df]:\n", + " df.eval('Jpsi_eta = arctanh(Jpsi_PZ/Jpsi_P)', inplace=True)\n", + " df.eval('mup_P = sqrt(mum_PX**2 + mum_PY**2 + mum_PZ**2)', inplace=True)\n", + " df.eval('mum_P = sqrt(mum_PX**2 + mum_PY**2 + mum_PZ**2)', inplace=True)\n", + "\n", + "bkg_df['catagory'] = 0 # Use 0 for background\n", + "mc_df['catagory'] = 1 # Use 1 for signal\n", + "training_data = pd.concat([bkg_df, mc_df], copy=True, ignore_index=True)\n", + "for df in [mc_df, bkg_df, data_df, training_data]:\n", + " df['IPdiff'] = np.abs(df['mum_PT'] - df['mup_PT'])\n", + " \n", + "training_columns = [\n", + " 'Jpsi_PT',\n", + " 'mup_PT', 'mup_eta', 'mup_ProbNNmu', 'mup_IP',\n", + " 'mum_PT', 'mum_eta', 'mum_ProbNNmu', 'mum_IP',\n", + "]" + ], + "execution_count": null, + "outputs": [] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Previously we trained an XGBClassifier with the default settings, with learning rate = 0.3 and maximum iterations = 100. This cut off to the training process may be limiting the performance of our model. We can monitor the performance of our model as a function of training iteration and stop the training when the gradient approximates zero. 
" + ] + }, + { + "cell_type": "code", + "metadata": {}, + "source": [ + "X1, y1 = training_data[training_columns], training_data['catagory']\n", + "X_train, X_test, y_train, y_test = train_test_split(X1, y1) \n", + "# default train_size = 0.25, this can be varied to suit your data\n", + "\n", + "LR = 0.3 # the coefficient of step size decay, eta, has alias 'learning_rate' with default 0.3\n", + "\n", + "stime = time.time()\n", + "bdt = XGBClassifier(learning_rate = LR, n_estimators=100, seed=123, n_threads=-1)\n", + "bdt.fit(training_data[training_columns], training_data['catagory'])\n", + "print(\"XGBoost --- %s seconds ---\" % (time.time() - stime))\n", + "\n", + "for df in [mc_df, bkg_df, data_df, training_data]:\n", + " df['XGB'] = bdt.predict_proba(df[training_columns])[:,1]" + ], + "execution_count": null, + "outputs": [] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Cross-validation\n", + "\n", + "Splitting the data into randomised subsets for training allows you to monitor your model's performance on the fly using the statistically independant remainder of your sample - this is called cross-validation (CV). We can see below that at the 100th iteration the metrics still show a trend of improvement." 
+ ] + }, + { + "cell_type": "code", + "metadata": {}, + "source": [ + "def training_monitor(alg): \n", + "\n", + " # A model trained with eval_set and eval_metric will return evals_result\n", + " results = alg.evals_result()\n", + " epochs = len(results['validation_0']['logloss'])\n", + " x_axis = range(0, epochs)\n", + "\n", + " # Plotting logLoss as a function of training iteration\n", + " fig, ax = plt.subplots()\n", + " ax.plot(x_axis, results['validation_0']['logloss'], label='Train') # for each eval_set\n", + " if results['validation_1']: ax.plot(x_axis, results['validation_1']['logloss'], label='Test') \n", + " ax.legend()\n", + " plt.ylabel('LogLoss')\n", + " plt.title('LogLoss')\n", + " plt.show()\n", + " \n", + " # Plotting classification error as a function of training iteration\n", + " fig, ax = plt.subplots()\n", + " ax.plot(x_axis, results['validation_0']['error'], label='Train') # for each eval_set\n", + " if results['validation_1']: ax.plot(x_axis, results['validation_1']['error'], label='Test')\n", + " ax.legend()\n", + " plt.ylabel('Error')\n", + " plt.title('Error')\n", + " plt.show()" + ], + "execution_count": null, + "outputs": [] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "This involves training on less data but allows us to monitor progress to check if the model is becoming over-specific to our training sample. The minimisation of loss and classification error are common metrics for model assessment. As shown below, the cost to performance is negligible. If the test sample gradient were to invert this would be considered overtraining and is why monitoring performance without CV can be a time costly pitfall." 
+ ] + }, + { + "cell_type": "code", + "metadata": {}, + "source": [ + "# Defining a model with multi-threading set to maximum\n", + "bdt_cv = XGBClassifier(learning_rate = LR, n_estimators=100, seed=123, n_threads=-1)\n", + "\n", + "# Model fitting with CV and printing out processing time\n", + "stime = time.time()\n", + "bdt_cv.fit(X_train, y_train, eval_metric=[\"logloss\",\"error\"],\n", + " eval_set=[(X_train, y_train), (X_test, y_test)], verbose=False)\n", + "print(\"\\nXGBoost cross-validation --- %s seconds ---\" % (time.time() - stime))\n", + "\n", + "# Writing model predictions out for data\n", + "training_monitor(bdt_cv)\n", + "for df in [mc_df, bkg_df, data_df, training_data]:\n", + " df['XGBcv'] = bdt_cv.predict_proba(df[training_columns])[:,1]" + ], + "execution_count": null, + "outputs": [] + }, + { + "cell_type": "code", + "metadata": {}, + "source": [ + "# Drawing plot of model respone for signal and background classes\n", + "plt.figure()\n", + "plot_comparision('XGB', mc_df, bkg_df)\n", + "plot_comparision('XGBcv', mc_df, bkg_df)\n", + "\n", + "# Drawing the signal efficiency vs background rejection curve (ROC)\n", + "plt.figure()\n", + "plot_roc(bdt, training_data, training_columns)\n", + "plot_roc(bdt_cv, training_data, training_columns)\n", + "\n", + "# Drawing signal significance comparison as a function of minimum cut on model response\n", + "plt.figure()\n", + "bdt_cut = plot_significance(bdt, training_data, training_columns, \"bdt\")\n", + "bdt_cv_cut = plot_significance(bdt_cv, training_data, training_columns, \"bdt_cv\")" + ], + "execution_count": null, + "outputs": [] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### $k$-folding & early stopping\n", + "\n", + "Performing CV on each of a number, k, of ways to split your data gives you k models to choose from. Some choose to average the performance across the models from each fold as any instability might imply the model will not be reliable. 
The results below seem stable; each fold provides a consistant performance across multiple metrics, so we'll just choose the best one." + ] + }, + { + "cell_type": "code", + "metadata": {}, + "source": [ + "# Defining the folds with a seed to test consistently \n", + "splits = 4 # to match 0.25 value of test_train_split default though this may not be optimal\n", + "kf = KFold(splits, True, random_state=123)\n", + "\n", + "# Printing processing time of the kfold cross-validation\n", + "stime = time.time()\n", + "for train, test in kf.split(X1):\n", + " X_train, X_test = X1.iloc[train], X1.iloc[test]\n", + " y_train, y_test = y1.iloc[train], y1.iloc[test]\n", + " bdt.fit(X_train,y_train)\n", + "print(\"\\nXGBoost k-folding --- %s seconds ---\" % (time.time() - stime))\n", + "\n", + "# Calculating scores of each fold using variety of CV-metrics\n", + "cv_acc = cross_val_score(bdt, X_test, y_test, cv=splits, scoring=\"accuracy\", n_jobs=-1)\n", + "cv_los = cross_val_score(bdt, X_test, y_test, cv=splits, scoring=\"neg_log_loss\", n_jobs=-1)\n", + "cv_auc = cross_val_score(bdt, X_test, y_test, cv=splits, scoring=\"roc_auc\", n_jobs=-1)\n", + "\n", + "# Printing results and indicating best fold\n", + "print(\"accuracy: \",cv_acc, \" -> best fold =\", np.argmax(cv_acc) )\n", + "print(\"-logloss: \",cv_los, \" -> best fold =\", np.argmax(cv_los) )\n", + "print(\"roc_auc: \",cv_auc, \" -> best fold =\", np.argmax(cv_auc) )\n", + "bestfold = np.argmax(cv_acc)" + ], + "execution_count": null, + "outputs": [] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Early stopping defines a maximum number of rounds the cross-validation metric (we'll use 'error'=1-accuracy) is allowed to not improve before training is terminated. As is standard, we will be reverting back to a 'previous best' model based on test sample score, this helps avoid overtraining. Early stopping prevents us training too many of extra models thus saving time. 
Set the limit too small though and your training might be cut off prematurely." + ] + }, + { + "cell_type": "code", + "metadata": {}, + "source": [ + "def modelfit(alg, metric, params, label, predictors, kfold, fbest, early_stop=10):\n", + "\n", + " # Loading data split inputs providing best fold result\n", + " for k, (train, test) in enumerate(kf.split(params)):\n", + " if (k==fbest):\n", + " X_train, X_test = params.iloc[train], params.iloc[test]\n", + " y_train, y_test = label.iloc[train], label.iloc[test]\n", + "\n", + " # Defining data in terms of training variables and class label\n", + " xgb_param = alg.get_xgb_params()\n", + " data = xgb.DMatrix(params, label=label, feature_names=predictors, nthread=-1)\n", + " \n", + " # Runs timed CV on our model using early stopping based on our metric\n", + " stime = time.time()\n", + " cvresult = xgb.cv(xgb_param,\n", + " data,\n", + " num_boost_round=alg.get_params()['n_estimators'],\n", + " #nfold=cv_folds, # to use in build folding\n", + " folds=kfold, # use -> ignores nfold \n", + " metrics=metric,\n", + " early_stopping_rounds=early_stop)\n", + " alg.set_params(n_estimators=cvresult.shape[0])\n", + " print(\"\\nXGBoost early-stop folding --- %s seconds ---\" % (time.time() - stime))\n", + "\n", + " # Fitting the algorithm on the data with CV evaluation early stopping\n", + " stime = time.time()\n", + " alg.fit(X_train, y_train, eval_metric=[\"logloss\",\"error\"],\n", + " eval_set=[(X_train, y_train), (X_test, y_test)],\n", + " verbose=False, early_stopping_rounds=early_stop)\n", + " training_monitor(alg)\n", + " print(\"XGBoost early-stop limit --- %s seconds ---\" % (time.time() - stime))\n", + "\n", + " # Predicting training set:\n", + " train_predictions = alg.predict(X_train) \n", + " test_predictions = alg.predict(X_test)\n", + "\n", + " # Printing model report: \n", + " print(\"\\nModel Report : best iteration \"+str(cvresult.shape[0]))\n", + " print(\"Train Accuracy : \"+str(metrics.accuracy_score(y_train, 
train_predictions)))\n", + " print(\"Test Accuracy : \"+str(metrics.accuracy_score(y_test, test_predictions)))\n", + " return cvresult.shape[0]" + ], + "execution_count": null, + "outputs": [] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "This function incorporates the k-folding CV and early stopping, saving not only the optimal model but also the index of its training iteration. This means, in our subsequent steps, we can apply an upper limit on training for models based on the convergence of the default hyperparameters, saving us some time. " + ] + }, + { + "cell_type": "code", + "metadata": {}, + "source": [ + "# Defining model with high maximum estimators for use with early stopping\n", + "bdt_es = XGBClassifier(learning_rate = LR, n_estimators=1000,\n", + " # Default values of other hyperparamters\n", + " #max_depth=6, min_child_weight=1,\n", + " #gamma=0, subsample=0.8,\n", + " #colsample_bytree=0.8, scale_pos_weight=1,\n", + " #objective='binary:logistic', # default for binary classification\n", + " #objective='mutli:softprob', num_class=3, # for multiclassifiers\n", + " seed=123, n_threads=-1)\n", + "\n", + "# Timing the CV using early stopping\n", + "stime = time.time()\n", + "estimators = modelfit(bdt_es, \"error\", X1, y1, training_columns, kf, bestfold)\n", + "print(\"\\nmodelfit(bdt_es) --- %s seconds ---\" % (time.time() - stime))\n", + "\n", + "# Saving model predictions\n", + "for df in [mc_df, bkg_df, data_df, training_data]:\n", + " df['XGBes'] = bdt_es.predict_proba(df[training_columns])[:,1]" + ], + "execution_count": null, + "outputs": [] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "This provides us with an improved model as well as a benchmark to test against in both performance and training efficiency. 
When training using new combinations of hyperparameters, the maximum number of estimators from our model report will cut off any new models improving more slowly than our default, while, for more efficient models, the early stopping will kick in." + ] + }, + { + "cell_type": "code", + "metadata": {}, + "source": [ + "# Drawing plot to compare model response for signal and background classes\n", + "plt.figure()\n", + "plot_comparision('XGBcv', mc_df, bkg_df)\n", + "plot_comparision('XGBes', mc_df, bkg_df)\n", + "\n", + "# Drawing comaprison of the signal efficiency vs background rejection curve (ROC)\n", + "plt.figure()\n", + "plot_roc(bdt_cv, training_data, training_columns)\n", + "plot_roc(bdt_es, training_data, training_columns)\n", + "\n", + "# Drawing signal significance comparison as a function of minimum cut on model response\n", + "plt.figure()\n", + "bdt_cut_cv = plot_significance(bdt_cv, training_data, training_columns, \"bdt_cv\")\n", + "bdt_cut_es = plot_significance(bdt_es, training_data, training_columns, \"bdt_es\")" + ], + "execution_count": null, + "outputs": [] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Hyperameter optimisation\n", + "\n", + "Below we provide a \"grid\" of hyperparameters, defining the structure of the trees and constraints on the learning, but there are many more values to choose from and a larger parameter space to be explored. These optimsations are very problem specific and their impact will have to be weighed against the computing resources and timeframe you have at your disposal. For the sake of expedient demonstration we are comparing the default parameters to only one predetermined variation in 2 parameters. 
" + ] + }, + { + "cell_type": "code", + "metadata": {}, + "source": [ + "# Define a function that performs a gridscan of HPs\n", + "def hpgridscan(alg, metric, params, label, kfold, fbest, early_stop=10):\n", + "\n", + " # Load data fold with best performance\n", + " for k, (train, test) in enumerate(kf.split(params)):\n", + " if (k==fbest):\n", + " X_train, X_test = params.iloc[train], params.iloc[test]\n", + " y_train, y_test = label.iloc[train], label.iloc[test]\n", + "\n", + " # Define a dictionary of numpy arrays for our HPs\n", + " params = {\n", + " 'max_depth':np.array([7]),\n", + " 'min_child_weight':np.array([3]),\n", + " #'max_depth':np.arange( 5, 9, 1 ),\n", + " #'min_child_weight':np.arange( 1, 5, 1 ),\n", + " ##'gamma':np.arange( 0.0, 1.0, 0.1 ),\n", + " ##'colsample_bytree':np.arange( 0.4, 1.0, 0.1 ),\n", + " ##'subsample':np.arange( 0.4, 1.0, 0.1 ),\n", + " ##'scale_pos_weight':np.arange( 0.4, 1.6, 0.1 )\n", + " }\n", + "\n", + " # Perform timed grid scan with established n_estimator cutoff and early stopping\n", + " stime = time.time()\n", + " gs = GridSearchCV(estimator=alg,\n", + " param_grid=params,\n", + " scoring=metric,\n", + " iid=False,\n", + " cv=kf,\n", + " n_jobs=-1) \n", + " gs.fit(X_train, y_train, eval_metric=[\"logloss\",\"error\"],\n", + " eval_set=[(X_train, y_train), (X_test, y_test)],\n", + " verbose=False, early_stopping_rounds=early_stop)\n", + " print(\"XGBoost grid-scan --- %s seconds ---\" % (time.time() - stime))\n", + "\n", + " # Return suggested parameters, performance and best model\n", + " training_monitor(gs.best_estimator_)\n", + " print(\"Suggestion:\", gs.best_params_)\n", + " print(\"Accuracy:\" ,gs.best_score_)\n", + " return gs" + ], + "execution_count": null, + "outputs": [] + }, + { + "cell_type": "code", + "metadata": {}, + "source": [ + "# Running with estimators maximum for shortened training\n", + "bdt_st = XGBClassifier( learning_rate = LR, n_estimators=estimators,\n", + " seed=123, n_threads=-1)\n", + 
"\n", + "# Running timed hyperparameter gridscan\n", + "stime = time.time()\n", + "gs = hpgridscan(bdt_st, \"accuracy\", X1, y1, kf, bestfold)\n", + "bdt_gs = gs.best_estimator_\n", + "print(\"\\nhpgridscan(bdt_st) --- %s seconds ---\" % (time.time() - stime))\n", + "\n", + "# Get model predictions\n", + "for df in [mc_df, bkg_df, data_df, training_data]:\n", + " df['XGBgs'] = bdt_gs.predict_proba(df[training_columns])[:,1]" + ], + "execution_count": null, + "outputs": [] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Even this naive grid scan, using the same fold as before for fair comparison, can provide significant improvements as demonstrated above. These may be pushed further by including more hyperparameters for a trade off with processing time. However, even with parrallisation these tasks can take hours or longer and might only provide improvement of O(>1%)." + ] + }, + { + "cell_type": "code", + "metadata": {}, + "source": [ + "## We could define a model using optimal hyperparameters from our grid scan\n", + "#bdt_opt = XGBClassifier( learning_rate = LR, n_estimators=1000, \n", + "# max_depth=gs.best_params_['max_depth'],\n", + "# min_child_weight=gs.best_params_['min_child_weight'], \n", + "# seed=123, n_threads=-1 )\n", + "\n", + "## Run with CV early stopping\n", + "#stime = time.time()\n", + "#estimators = modelfit(bdt_opt, 'error', X1, y1, training_columns, kf, bestfold)\n", + "#print(\"\\nmodelfit(bdt_opt) --- %s seconds ---\" % (time.time() - stime))\n", + "\n", + "## Get model predictions\n", + "#for df in [mc_df, bkg_df, data_df, training_data]:\n", + "# df['XGBopt'] = bdt_opt.predict_proba(df[training_columns])[:,1]" + ], + "execution_count": null, + "outputs": [] + }, + { + "cell_type": "code", + "metadata": {}, + "source": [ + "# Comapring model response from the end of last session to the end of this one\n", + "plt.figure()\n", + "plot_comparision('XGB', mc_df, bkg_df)\n", + "plot_comparision('XGBgs', mc_df, bkg_df)\n", 
+ "\n", + "# Comparing model performance for each level of tuning\n", + "plt.figure()\n", + "plot_roc(bdt, training_data, training_columns)\n", + "plot_roc(bdt_cv, training_data, training_columns)\n", + "plot_roc(bdt_es, training_data, training_columns)\n", + "plot_roc(bdt_gs, training_data, training_columns)\n", + "#plot_roc(bdt_opt, training_data, training_columns)" + ], + "execution_count": null, + "outputs": [] + }, + { + "cell_type": "code", + "metadata": {}, + "source": [ + "# Comparing the impact on projected performance at each stage of the tutorial\n", + "plt.figure()\n", + "bdt_cut = plot_significance(bdt, training_data, training_columns, \"bdt\")\n", + "bdt_cv_cut = plot_significance(bdt_cv, training_data, training_columns, \"bdt_cv\")\n", + "bdt_es_cut = plot_significance(bdt_es, training_data, training_columns, \"bdt_es\")\n", + "bdt_gs_cut = plot_significance(bdt_gs, training_data, training_columns, \"bdt_gs\")\n", + "#bdt_opt_cut = plot_significance(bdt_opt, training_data, training_columns, \"bdt_opt\")\n", + "\n", + "# Comparing best cuts impact on mass for original and tuned model\n", + "plt.figure()\n", + "data_bdt_cut = data_df.query('XGB > %f' %bdt_cut )\n", + "plot_mass(data_bdt_cut, label='XGB default', norm=True)\n", + "data_gs_cut = data_df.query('XGBgs > %f' %bdt_gs_cut )\n", + "plot_mass(data_gs_cut, label='XGB tuned', norm=True)\n", + "plt.legend(loc='best')" + ], + "execution_count": null, + "outputs": [] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "You can also choose a higher learning rate to perform course scans of your space and decrease it again to retrain your final model. If you can afford to, it might be best to include learning rate itself as a parameter in your grid. With some libraries you can specify your choice of kernel. 
Both these choices will impact your optimal maximum number of iterations, so setting it sufficiently high and using early stopping might be a good strategy.\n", + "\n", + "For less exhaustive and non-discritised methods try smart combinations of the following to perform adaptive scans or build your own: \n", + "* sklearn.model_selection.RandomizedSearchCV\n", + "* sklearn.model_selection.GridSearchCV\n", + "\n", + "Moving to higher dimentional optimisation problems may require more sophisticated solutions:\n", + "* skopt.BayesSearchCV\n", + "* hyperopt.tpe" + ] + }, + { + "cell_type": "code", + "metadata": {}, + "source": [ + "# Final lesson time and processing time check\n", + "print(\"Notebook real time --- %s seconds ---\" % (time.time() - stt))\n", + "print(\"Notebook CPU time --- %s seconds ---\" % (time.clock() - stc))" + ], + "execution_count": null, + "outputs": [] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.7.3" + } + }, + "nbformat": 4, + "nbformat_minor": 4 } From 58d95fb8db08459e09e67c83f1b33080fdaffd29 Mon Sep 17 00:00:00 2001 From: jvmead Date: Sat, 29 Feb 2020 16:09:06 +0000 Subject: [PATCH 34/54] remove fresh installs --- advanced-python/4bModelTuning.ipynb | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/advanced-python/4bModelTuning.ipynb b/advanced-python/4bModelTuning.ipynb index 7f8a4fa3..ecb91b99 100644 --- a/advanced-python/4bModelTuning.ipynb +++ b/advanced-python/4bModelTuning.ipynb @@ -25,8 +25,8 @@ "metadata": {}, "source": [ "#@title\n", - "!pip install uproot\n", - "!pip install sklearn\n", + "#!pip install uproot\n", + "#!pip install sklearn\n", "\n", "import time\n", "from matplotlib import pyplot 
as plt\n", From 4c256cd26c1d798c981de019b56443592a343cb1 Mon Sep 17 00:00:00 2001 From: jvmead Date: Mon, 2 Mar 2020 17:47:07 +0000 Subject: [PATCH 35/54] added to toctree --- advanced-python/README.md | 1 + 1 file changed, 1 insertion(+) diff --git a/advanced-python/README.md b/advanced-python/README.md index f7a98ab6..a33aad03 100644 --- a/advanced-python/README.md +++ b/advanced-python/README.md @@ -12,6 +12,7 @@ TODO... 2DataAndPlotting.ipynb 3Classification.ipynb 4Extension.ipynb + 4bModelTuning.ipynb 5BoostingToUniformity.ipynb 6DemoNeuralNetworks.ipynb 7DemoReweighting.ipynb From 069795365d7aec0543f0020567b06fa34eb17e06 Mon Sep 17 00:00:00 2001 From: jvmead Date: Tue, 24 Nov 2020 20:32:56 +0100 Subject: [PATCH 36/54] checking expired logs --- advanced-python/README.md | 1 - 1 file changed, 1 deletion(-) diff --git a/advanced-python/README.md b/advanced-python/README.md index a33aad03..6b10bd21 100644 --- a/advanced-python/README.md +++ b/advanced-python/README.md @@ -1,5 +1,4 @@ # Advanced Python Tutorial - TODO... 
```eval_rst From 657c72345d483e1e01352d36fe6e70d0b3246cce Mon Sep 17 00:00:00 2001 From: jvmead Date: Sun, 5 Dec 2021 14:06:54 +0100 Subject: [PATCH 37/54] Update 4bModelTuning.ipynb --- advanced-python/4bModelTuning.ipynb | 11 +++++++---- 1 file changed, 7 insertions(+), 4 deletions(-) diff --git a/advanced-python/4bModelTuning.ipynb b/advanced-python/4bModelTuning.ipynb index ecb91b99..c74bc3af 100644 --- a/advanced-python/4bModelTuning.ipynb +++ b/advanced-python/4bModelTuning.ipynb @@ -143,13 +143,13 @@ "cell_type": "code", "metadata": {}, "source": [ - "#max_entries = 1000 # try running with low stats for bug fixing your changes quickly \n", + "max_entries = 1000 # try running with low stats for bug fixing your changes quickly \n", "data_df = uproot.open('https://cern.ch/starterkit/data/advanced-python-2018/real_data.root',\n", " httpsource={'chunkbytes': 1024*1024, 'limitbytes': 33554432, 'parallel': 64}\n", - " )['DecayTree'].pandas.df()#(entrystop=max_entries)\n", + " )['DecayTree'].pandas.df(entrystop=max_entries)\n", "mc_df = uproot.open('https://cern.ch/starterkit/data/advanced-python-2018/simulated_data.root',\n", " httpsource={'chunkbytes': 1024*1024, 'limitbytes': 33554432, 'parallel': 64}\n", - " )['DecayTree'].pandas.df()#(entrystop=max_entries)\n", + " )['DecayTree'].pandas.df(entrystop=max_entries)\n", "bkg_df = data_df.query('~(3.0 < Jpsi_M < 3.2)')\n", "\n", "for df in [mc_df, data_df, bkg_df]:\n", @@ -609,7 +609,10 @@ "\n", "Moving to higher dimentional optimisation problems may require more sophisticated solutions:\n", "* skopt.BayesSearchCV\n", - "* hyperopt.tpe" + "* hyperopt.tpe", + "\n", + "Full stats plots saved here: https://colab.research.google.com/drive/13dwXKKHxqQfk8Zo2Gzh46_pCc9xBgi1H?usp=sharing \n", + "Run with full stats by removing entrystop at max_events in cell 8." 
] }, { From 5a7ee43ae29ea70074c804b6e8cb49683d9c78f7 Mon Sep 17 00:00:00 2001 From: jvmead Date: Mon, 31 Jan 2022 15:07:17 +0100 Subject: [PATCH 38/54] Update 4bModelTuning.ipynb link to full stats plots saved in google collab notebook --- advanced-python/4bModelTuning.ipynb | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/advanced-python/4bModelTuning.ipynb b/advanced-python/4bModelTuning.ipynb index c74bc3af..a165c501 100644 --- a/advanced-python/4bModelTuning.ipynb +++ b/advanced-python/4bModelTuning.ipynb @@ -609,8 +609,7 @@ "\n", "Moving to higher dimentional optimisation problems may require more sophisticated solutions:\n", "* skopt.BayesSearchCV\n", - "* hyperopt.tpe", - "\n", + "* hyperopt.tpe\n\n", "Full stats plots saved here: https://colab.research.google.com/drive/13dwXKKHxqQfk8Zo2Gzh46_pCc9xBgi1H?usp=sharing \n", "Run with full stats by removing entrystop at max_events in cell 8." ] From 8479288cfb27eb0f009f43459f8a0c3518354934 Mon Sep 17 00:00:00 2001 From: jvmead Date: Thu, 3 Feb 2022 11:27:10 +0100 Subject: [PATCH 39/54] Update 4bModelTuning.ipynb time.clock() deprecated -> change to time.process_time() --- advanced-python/4bModelTuning.ipynb | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/advanced-python/4bModelTuning.ipynb b/advanced-python/4bModelTuning.ipynb index a165c501..6b5be4c0 100644 --- a/advanced-python/4bModelTuning.ipynb +++ b/advanced-python/4bModelTuning.ipynb @@ -53,7 +53,7 @@ "source": [ "# Time and processing check for the lesson\n", "stt = time.time()\n", - "stc = time.clock()" + "stc = time.process_time()" ], "execution_count": null, "outputs": [] @@ -620,7 +620,7 @@ "source": [ "# Final lesson time and processing time check\n", "print(\"Notebook real time --- %s seconds ---\" % (time.time() - stt))\n", - "print(\"Notebook CPU time --- %s seconds ---\" % (time.clock() - stc))" + "print(\"Notebook CPU time --- %s seconds ---\" % (time.process_time() - stc))" ], "execution_count": null, 
"outputs": [] From 587043e222f80698fb391c730f566d7642412c78 Mon Sep 17 00:00:00 2001 From: jvmead Date: Thu, 3 Feb 2022 12:37:35 +0100 Subject: [PATCH 40/54] Update 4bModelTuning.ipynb uproot 4 changes to df interpretation using arrays(library=) --- advanced-python/4bModelTuning.ipynb | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/advanced-python/4bModelTuning.ipynb b/advanced-python/4bModelTuning.ipynb index 6b5be4c0..fdfa2e8a 100644 --- a/advanced-python/4bModelTuning.ipynb +++ b/advanced-python/4bModelTuning.ipynb @@ -146,10 +146,10 @@ "max_entries = 1000 # try running with low stats for bug fixing your changes quickly \n", "data_df = uproot.open('https://cern.ch/starterkit/data/advanced-python-2018/real_data.root',\n", " httpsource={'chunkbytes': 1024*1024, 'limitbytes': 33554432, 'parallel': 64}\n", - " )['DecayTree'].pandas.df(entrystop=max_entries)\n", + " )['DecayTree'].arrays(library='pd').pandas.df(entrystop=max_entries)\n", "mc_df = uproot.open('https://cern.ch/starterkit/data/advanced-python-2018/simulated_data.root',\n", " httpsource={'chunkbytes': 1024*1024, 'limitbytes': 33554432, 'parallel': 64}\n", - " )['DecayTree'].pandas.df(entrystop=max_entries)\n", + " )['DecayTree'].arrays(library='pd').pandas.df(entrystop=max_entries)\n", "bkg_df = data_df.query('~(3.0 < Jpsi_M < 3.2)')\n", "\n", "for df in [mc_df, data_df, bkg_df]:\n", From e99761c3deaa30929f468a2e48399e6c1f4bd1d3 Mon Sep 17 00:00:00 2001 From: jvmead Date: Thu, 3 Feb 2022 13:33:31 +0100 Subject: [PATCH 41/54] Update 4bModelTuning.ipynb array(library=pd).pandas.df -> array(library=pd).df --- advanced-python/4bModelTuning.ipynb | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/advanced-python/4bModelTuning.ipynb b/advanced-python/4bModelTuning.ipynb index fdfa2e8a..9986efb7 100644 --- a/advanced-python/4bModelTuning.ipynb +++ b/advanced-python/4bModelTuning.ipynb @@ -146,10 +146,10 @@ "max_entries = 1000 # try running with low stats for bug fixing 
your changes quickly \n", "data_df = uproot.open('https://cern.ch/starterkit/data/advanced-python-2018/real_data.root',\n", " httpsource={'chunkbytes': 1024*1024, 'limitbytes': 33554432, 'parallel': 64}\n", - " )['DecayTree'].arrays(library='pd').pandas.df(entrystop=max_entries)\n", + " )['DecayTree'].arrays(library='pd').df(entrystop=max_entries)\n", "mc_df = uproot.open('https://cern.ch/starterkit/data/advanced-python-2018/simulated_data.root',\n", " httpsource={'chunkbytes': 1024*1024, 'limitbytes': 33554432, 'parallel': 64}\n", - " )['DecayTree'].arrays(library='pd').pandas.df(entrystop=max_entries)\n", + " )['DecayTree'].arrays(library='pd').df(entrystop=max_entries)\n", "bkg_df = data_df.query('~(3.0 < Jpsi_M < 3.2)')\n", "\n", "for df in [mc_df, data_df, bkg_df]:\n", From 352af71d341234e67082b3bacd23a4d26d9dd39a Mon Sep 17 00:00:00 2001 From: jvmead Date: Thu, 3 Feb 2022 14:28:03 +0100 Subject: [PATCH 42/54] Update 4bModelTuning.ipynb .df(entrystop) -> .arrays(entry_stop) --- advanced-python/4bModelTuning.ipynb | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/advanced-python/4bModelTuning.ipynb b/advanced-python/4bModelTuning.ipynb index 9986efb7..8d507984 100644 --- a/advanced-python/4bModelTuning.ipynb +++ b/advanced-python/4bModelTuning.ipynb @@ -146,10 +146,10 @@ "max_entries = 1000 # try running with low stats for bug fixing your changes quickly \n", "data_df = uproot.open('https://cern.ch/starterkit/data/advanced-python-2018/real_data.root',\n", " httpsource={'chunkbytes': 1024*1024, 'limitbytes': 33554432, 'parallel': 64}\n", - " )['DecayTree'].arrays(library='pd').df(entrystop=max_entries)\n", + " )['DecayTree'].arrays(library='pd',entry_stop=max_entries)\n", "mc_df = uproot.open('https://cern.ch/starterkit/data/advanced-python-2018/simulated_data.root',\n", " httpsource={'chunkbytes': 1024*1024, 'limitbytes': 33554432, 'parallel': 64}\n", - " )['DecayTree'].arrays(library='pd').df(entrystop=max_entries)\n", + " 
)['DecayTree'].arrays(library='pd',entry_stop=max_entries)\n", "bkg_df = data_df.query('~(3.0 < Jpsi_M < 3.2)')\n", "\n", "for df in [mc_df, data_df, bkg_df]:\n", From c3c19bb2fa607a8f181d649bb35882870e14c505 Mon Sep 17 00:00:00 2001 From: jvmead Date: Thu, 17 Mar 2022 11:25:17 +0100 Subject: [PATCH 43/54] Update 4bModelTuning.ipynb crashing with too few events->removed max events and specified kfold keyword args instead of positional --- advanced-python/4bModelTuning.ipynb | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/advanced-python/4bModelTuning.ipynb b/advanced-python/4bModelTuning.ipynb index 8d507984..4fcd96b6 100644 --- a/advanced-python/4bModelTuning.ipynb +++ b/advanced-python/4bModelTuning.ipynb @@ -143,13 +143,13 @@ "cell_type": "code", "metadata": {}, "source": [ - "max_entries = 1000 # try running with low stats for bug fixing your changes quickly \n", + "#max_entries = 1000 # try running with low stats for bug fixing your changes quickly \n", "data_df = uproot.open('https://cern.ch/starterkit/data/advanced-python-2018/real_data.root',\n", " httpsource={'chunkbytes': 1024*1024, 'limitbytes': 33554432, 'parallel': 64}\n", - " )['DecayTree'].arrays(library='pd',entry_stop=max_entries)\n", + " )['DecayTree'].arrays(library='pd')#,entry_stop=max_entries)\n", "mc_df = uproot.open('https://cern.ch/starterkit/data/advanced-python-2018/simulated_data.root',\n", " httpsource={'chunkbytes': 1024*1024, 'limitbytes': 33554432, 'parallel': 64}\n", - " )['DecayTree'].arrays(library='pd',entry_stop=max_entries)\n", + " )['DecayTree'].arrays(library='pd')#,entry_stop=max_entries)\n", "bkg_df = data_df.query('~(3.0 < Jpsi_M < 3.2)')\n", "\n", "for df in [mc_df, data_df, bkg_df]:\n", @@ -306,7 +306,7 @@ "source": [ "# Defining the folds with a seed to test consistently \n", "splits = 4 # to match 0.25 value of test_train_split default though this may not be optimal\n", - "kf = KFold(splits, True, random_state=123)\n", + "kf = 
KFold(n_splits=splits, shuffle=True, random_state=123)\n", "\n", "# Printing processing time of the kfold cross-validation\n", "stime = time.time()\n", From 38a09a80afb945574291c68860c99cfe949ac5f1 Mon Sep 17 00:00:00 2001 From: jvmead Date: Thu, 17 Mar 2022 16:30:29 +0100 Subject: [PATCH 44/54] Update 4bModelTuning.ipynb removing 'iid' argument in GridSearchCV --- advanced-python/4bModelTuning.ipynb | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/advanced-python/4bModelTuning.ipynb b/advanced-python/4bModelTuning.ipynb index 4fcd96b6..90a7b86a 100644 --- a/advanced-python/4bModelTuning.ipynb +++ b/advanced-python/4bModelTuning.ipynb @@ -487,7 +487,7 @@ " gs = GridSearchCV(estimator=alg,\n", " param_grid=params,\n", " scoring=metric,\n", - " iid=False,\n", + " #iid=False,\n", " cv=kf,\n", " n_jobs=-1) \n", " gs.fit(X_train, y_train, eval_metric=[\"logloss\",\"error\"],\n", From 86e42c3842cdeb14f1b8b3065e296d2078656d9a Mon Sep 17 00:00:00 2001 From: jvmead Date: Sat, 7 May 2022 14:01:56 +0200 Subject: [PATCH 45/54] Update 4bModelTuning.ipynb --- advanced-python/4bModelTuning.ipynb | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/advanced-python/4bModelTuning.ipynb b/advanced-python/4bModelTuning.ipynb index 90a7b86a..605828f7 100644 --- a/advanced-python/4bModelTuning.ipynb +++ b/advanced-python/4bModelTuning.ipynb @@ -610,7 +610,7 @@ "Moving to higher dimentional optimisation problems may require more sophisticated solutions:\n", "* skopt.BayesSearchCV\n", "* hyperopt.tpe\n\n", - "Full stats plots saved here: https://colab.research.google.com/drive/13dwXKKHxqQfk8Zo2Gzh46_pCc9xBgi1H?usp=sharing \n", + "Full stats plots saved here: bit.ly/LHCb_XGB_Tuning \n", "Run with full stats by removing entrystop at max_events in cell 8." 
] }, From b20afb974673802221352234d3f3f8ae13543e50 Mon Sep 17 00:00:00 2001 From: jvmead Date: Mon, 9 May 2022 15:21:38 +0200 Subject: [PATCH 46/54] adding 4b to toctree --- advanced-python/README.md | 1 + 1 file changed, 1 insertion(+) diff --git a/advanced-python/README.md b/advanced-python/README.md index 986a9bdd..3e324eaa 100644 --- a/advanced-python/README.md +++ b/advanced-python/README.md @@ -15,6 +15,7 @@ a knowledge base that one can always come back to lock up things. 12AdvancedClasses.ipynb 20DataAndPlotting.ipynb 30Classification.ipynb + 4bModelTuning.ipynb 31ClassificationExtension.ipynb 32BoostingToUniformity.ipynb 40Histograms.ipynb From 1782f034a328eb1839ef20a3cd7e0e83c756fc4a Mon Sep 17 00:00:00 2001 From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com> Date: Thu, 9 Nov 2023 20:17:42 +0000 Subject: [PATCH 47/54] [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci --- advanced-python/10Basics.ipynb | 2 +- advanced-python/11AdvancedPython.ipynb | 4 + advanced-python/4bModelTuning.ipynb | 1296 ++++++++++++------------ 3 files changed, 653 insertions(+), 649 deletions(-) diff --git a/advanced-python/10Basics.ipynb b/advanced-python/10Basics.ipynb index 87d8d778..751e29b1 100644 --- a/advanced-python/10Basics.ipynb +++ b/advanced-python/10Basics.ipynb @@ -481,7 +481,7 @@ }, "outputs": [], "source": [ - "{'a': 'b'}.get?" + "get?" 
] }, { diff --git a/advanced-python/11AdvancedPython.ipynb b/advanced-python/11AdvancedPython.ipynb index ea1344cb..af848aa1 100644 --- a/advanced-python/11AdvancedPython.ipynb +++ b/advanced-python/11AdvancedPython.ipynb @@ -380,6 +380,8 @@ "outputs": [], "source": [ "# SOLUTION\n", + "\n", + "\n", "@contextlib.contextmanager\n", "def func(x):\n", " yield x\n", @@ -619,6 +621,8 @@ "outputs": [], "source": [ "# SOLUTION\n", + "\n", + "\n", "def timed_func(func):\n", " def wrapped_func(*args, **kwargs):\n", " print(args)\n", diff --git a/advanced-python/4bModelTuning.ipynb b/advanced-python/4bModelTuning.ipynb index 605828f7..f24a99ec 100644 --- a/advanced-python/4bModelTuning.ipynb +++ b/advanced-python/4bModelTuning.ipynb @@ -1,650 +1,650 @@ { - "cells": [ - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "### Model tuning setup" - ] - }, - { - "cell_type": "code", - "metadata": {}, - "source": [ - "#%store -r training_data\n", - "#%store -r training_columns\n", - "#%store -r bkg_df\n", - "#%store -r mc_df\n", - "#%store -r data_df" - ], - "execution_count": null, - "outputs": [] - }, - { - "cell_type": "code", - "metadata": {}, - "source": [ - "#@title\n", - "#!pip install uproot\n", - "#!pip install sklearn\n", - "\n", - "import time\n", - "from matplotlib import pyplot as plt\n", - "import uproot\n", - "\n", - "import numpy as np\n", - "import pandas as pd\n", - "import xgboost as xgb\n", - "\n", - "from xgboost.sklearn import XGBClassifier\n", - "from sklearn.ensemble import AdaBoostClassifier, GradientBoostingClassifier\n", - "\n", - "from sklearn import metrics\n", - "from sklearn.metrics import roc_curve, auc\n", - "\n", - "from sklearn.model_selection import KFold, train_test_split, cross_validate, cross_val_score, GridSearchCV" - ], - "execution_count": null, - "outputs": [] - }, - { - "cell_type": "code", - "metadata": {}, - "source": [ - "# Time and processing check for the lesson\n", - "stt = time.time()\n", - "stc = time.process_time()" 
- ], - "execution_count": null, - "outputs": [] - }, - { - "cell_type": "code", - "metadata": {}, - "source": [ - "def plot_mass(df, label=\"\", norm=True):\n", - " counts, bins, _ = plt.hist(df['Jpsi_M'], label=label, bins=100, range=[2.75, 3.5], histtype='step', density=norm)\n", - " # You can also use LaTeX in the axis label\n", - " plt.xlabel('$J/\\\\psi$ mass [GeV]')\n", - " plt.xlim(bins[0], bins[-1])" - ], - "execution_count": null, - "outputs": [] - }, - { - "cell_type": "code", - "metadata": {}, - "source": [ - "def plot_comparision(var, mc_df, bkg_df):\n", - " _, bins, _ = plt.hist(mc_df[var], bins=100, histtype='step', label='MC', density=1)\n", - " _, bins, _ = plt.hist(bkg_df[var], bins=bins, histtype='step', label='Background', density=1)\n", - " plt.xlabel(var)\n", - " plt.xlim(bins[0], bins[-1])\n", - " plt.legend(loc='best')" - ], - "execution_count": null, - "outputs": [] - }, - { - "cell_type": "code", - "metadata": {}, - "source": [ - "def plot_roc(bdt, training_data, training_columns, label=None):\n", - " y_score = bdt.predict_proba(training_data[training_columns])[:,1]\n", - " fpr, tpr, thresholds = roc_curve(training_data['catagory'], y_score)\n", - " area = auc(fpr, tpr)\n", - "\n", - " plt.plot([0, 1], [0, 1], color='grey', linestyle='--')\n", - " if label:\n", - " plt.plot(fpr, tpr, label=f'{label} (area = {area:.2f})')\n", - " else:\n", - " plt.plot(fpr, tpr, label=f'ROC curve (area = {area:.2f})')\n", - " plt.xlim(0.0, 1.0)\n", - " plt.ylim(0.0, 1.0)\n", - " plt.xlabel('False Positive Rate')\n", - " plt.ylabel('True Positive Rate')\n", - " plt.legend(loc='lower right')\n", - " # We can make the plot look nicer by forcing the grid to be square\n", - " plt.gca().set_aspect('equal', adjustable='box')" - ], - "execution_count": null, - "outputs": [] - }, - { - "cell_type": "code", - "metadata": {}, - "source": [ - "def plot_significance(bdt, training_data, training_columns, label):\n", - " y_score = 
bdt.predict_proba(training_data[training_columns])[:,1]\n", - " fpr, tpr, thresholds = roc_curve(training_data['catagory'], y_score)\n", - " \n", - " n_sig = 1200\n", - " n_bkg = 23000\n", - " S = n_sig*tpr + (n_sig*tpr==0)*1\n", - " B = n_bkg*fpr + (n_bkg*tpr==0)*1\n", - " metric = S/np.sqrt(S+B)\n", - "\n", - " plt.plot(thresholds, metric, label=label)\n", - " plt.xlabel('BDT cut value')\n", - " plt.ylabel('$\\\\frac{S}{\\\\sqrt{S+B}}$')\n", - " plt.xlim(0, 1.0)\n", - "\n", - " optimum = np.max(metric)\n", - " optimal_cut = thresholds[np.argmax(metric)]\n", - " print(label, \": S/sqrt(S+B) =\", optimum, \" at x =\", optimal_cut)\n", - " plt.axvline(x=optimal_cut, color='black', linewidth=1.0, linestyle='--')\n", - "\n", - " return optimal_cut" - ], - "execution_count": null, - "outputs": [] - }, - { - "cell_type": "code", - "metadata": {}, - "source": [ - "#max_entries = 1000 # try running with low stats for bug fixing your changes quickly \n", - "data_df = uproot.open('https://cern.ch/starterkit/data/advanced-python-2018/real_data.root',\n", - " httpsource={'chunkbytes': 1024*1024, 'limitbytes': 33554432, 'parallel': 64}\n", - " )['DecayTree'].arrays(library='pd')#,entry_stop=max_entries)\n", - "mc_df = uproot.open('https://cern.ch/starterkit/data/advanced-python-2018/simulated_data.root',\n", - " httpsource={'chunkbytes': 1024*1024, 'limitbytes': 33554432, 'parallel': 64}\n", - " )['DecayTree'].arrays(library='pd')#,entry_stop=max_entries)\n", - "bkg_df = data_df.query('~(3.0 < Jpsi_M < 3.2)')\n", - "\n", - "for df in [mc_df, data_df, bkg_df]:\n", - " df.eval('Jpsi_eta = arctanh(Jpsi_PZ/Jpsi_P)', inplace=True)\n", - " df.eval('mup_P = sqrt(mum_PX**2 + mum_PY**2 + mum_PZ**2)', inplace=True)\n", - " df.eval('mum_P = sqrt(mum_PX**2 + mum_PY**2 + mum_PZ**2)', inplace=True)\n", - "\n", - "bkg_df['catagory'] = 0 # Use 0 for background\n", - "mc_df['catagory'] = 1 # Use 1 for signal\n", - "training_data = pd.concat([bkg_df, mc_df], copy=True, ignore_index=True)\n", - 
"for df in [mc_df, bkg_df, data_df, training_data]:\n", - " df['IPdiff'] = np.abs(df['mum_PT'] - df['mup_PT'])\n", - " \n", - "training_columns = [\n", - " 'Jpsi_PT',\n", - " 'mup_PT', 'mup_eta', 'mup_ProbNNmu', 'mup_IP',\n", - " 'mum_PT', 'mum_eta', 'mum_ProbNNmu', 'mum_IP',\n", - "]" - ], - "execution_count": null, - "outputs": [] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "Previously we trained an XGBClassifier with the default settings, with learning rate = 0.3 and maximum iterations = 100. This cut off to the training process may be limiting the performance of our model. We can monitor the performance of our model as a function of training iteration and stop the training when the gradient approximates zero. " - ] - }, - { - "cell_type": "code", - "metadata": {}, - "source": [ - "X1, y1 = training_data[training_columns], training_data['catagory']\n", - "X_train, X_test, y_train, y_test = train_test_split(X1, y1) \n", - "# default train_size = 0.25, this can be varied to suit your data\n", - "\n", - "LR = 0.3 # the coefficient of step size decay, eta, has alias 'learning_rate' with default 0.3\n", - "\n", - "stime = time.time()\n", - "bdt = XGBClassifier(learning_rate = LR, n_estimators=100, seed=123, n_threads=-1)\n", - "bdt.fit(training_data[training_columns], training_data['catagory'])\n", - "print(\"XGBoost --- %s seconds ---\" % (time.time() - stime))\n", - "\n", - "for df in [mc_df, bkg_df, data_df, training_data]:\n", - " df['XGB'] = bdt.predict_proba(df[training_columns])[:,1]" - ], - "execution_count": null, - "outputs": [] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "### Cross-validation\n", - "\n", - "Splitting the data into randomised subsets for training allows you to monitor your model's performance on the fly using the statistically independant remainder of your sample - this is called cross-validation (CV). 
We can see below that at the 100th iteration the metrics still show a trend of improvement." - ] - }, - { - "cell_type": "code", - "metadata": {}, - "source": [ - "def training_monitor(alg): \n", - "\n", - " # A model trained with eval_set and eval_metric will return evals_result\n", - " results = alg.evals_result()\n", - " epochs = len(results['validation_0']['logloss'])\n", - " x_axis = range(0, epochs)\n", - "\n", - " # Plotting logLoss as a function of training iteration\n", - " fig, ax = plt.subplots()\n", - " ax.plot(x_axis, results['validation_0']['logloss'], label='Train') # for each eval_set\n", - " if results['validation_1']: ax.plot(x_axis, results['validation_1']['logloss'], label='Test') \n", - " ax.legend()\n", - " plt.ylabel('LogLoss')\n", - " plt.title('LogLoss')\n", - " plt.show()\n", - " \n", - " # Plotting classification error as a function of training iteration\n", - " fig, ax = plt.subplots()\n", - " ax.plot(x_axis, results['validation_0']['error'], label='Train') # for each eval_set\n", - " if results['validation_1']: ax.plot(x_axis, results['validation_1']['error'], label='Test')\n", - " ax.legend()\n", - " plt.ylabel('Error')\n", - " plt.title('Error')\n", - " plt.show()" - ], - "execution_count": null, - "outputs": [] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "This involves training on less data but allows us to monitor progress to check if the model is becoming over-specific to our training sample. The minimisation of loss and classification error are common metrics for model assessment. As shown below, the cost to performance is negligible. If the test sample gradient were to invert this would be considered overtraining and is why monitoring performance without CV can be a time costly pitfall." 
- ] - }, - { - "cell_type": "code", - "metadata": {}, - "source": [ - "# Defining a model with multi-threading set to maximum\n", - "bdt_cv = XGBClassifier(learning_rate = LR, n_estimators=100, seed=123, n_threads=-1)\n", - "\n", - "# Model fitting with CV and printing out processing time\n", - "stime = time.time()\n", - "bdt_cv.fit(X_train, y_train, eval_metric=[\"logloss\",\"error\"],\n", - " eval_set=[(X_train, y_train), (X_test, y_test)], verbose=False)\n", - "print(\"\\nXGBoost cross-validation --- %s seconds ---\" % (time.time() - stime))\n", - "\n", - "# Writing model predictions out for data\n", - "training_monitor(bdt_cv)\n", - "for df in [mc_df, bkg_df, data_df, training_data]:\n", - " df['XGBcv'] = bdt_cv.predict_proba(df[training_columns])[:,1]" - ], - "execution_count": null, - "outputs": [] - }, - { - "cell_type": "code", - "metadata": {}, - "source": [ - "# Drawing plot of model respone for signal and background classes\n", - "plt.figure()\n", - "plot_comparision('XGB', mc_df, bkg_df)\n", - "plot_comparision('XGBcv', mc_df, bkg_df)\n", - "\n", - "# Drawing the signal efficiency vs background rejection curve (ROC)\n", - "plt.figure()\n", - "plot_roc(bdt, training_data, training_columns)\n", - "plot_roc(bdt_cv, training_data, training_columns)\n", - "\n", - "# Drawing signal significance comparison as a function of minimum cut on model response\n", - "plt.figure()\n", - "bdt_cut = plot_significance(bdt, training_data, training_columns, \"bdt\")\n", - "bdt_cv_cut = plot_significance(bdt_cv, training_data, training_columns, \"bdt_cv\")" - ], - "execution_count": null, - "outputs": [] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "### $k$-folding & early stopping\n", - "\n", - "Performing CV on each of a number, k, of ways to split your data gives you k models to choose from. Some choose to average the performance across the models from each fold as any instability might imply the model will not be reliable. 
The results below seem stable; each fold provides a consistant performance across multiple metrics, so we'll just choose the best one." - ] - }, - { - "cell_type": "code", - "metadata": {}, - "source": [ - "# Defining the folds with a seed to test consistently \n", - "splits = 4 # to match 0.25 value of test_train_split default though this may not be optimal\n", - "kf = KFold(n_splits=splits, shuffle=True, random_state=123)\n", - "\n", - "# Printing processing time of the kfold cross-validation\n", - "stime = time.time()\n", - "for train, test in kf.split(X1):\n", - " X_train, X_test = X1.iloc[train], X1.iloc[test]\n", - " y_train, y_test = y1.iloc[train], y1.iloc[test]\n", - " bdt.fit(X_train,y_train)\n", - "print(\"\\nXGBoost k-folding --- %s seconds ---\" % (time.time() - stime))\n", - "\n", - "# Calculating scores of each fold using variety of CV-metrics\n", - "cv_acc = cross_val_score(bdt, X_test, y_test, cv=splits, scoring=\"accuracy\", n_jobs=-1)\n", - "cv_los = cross_val_score(bdt, X_test, y_test, cv=splits, scoring=\"neg_log_loss\", n_jobs=-1)\n", - "cv_auc = cross_val_score(bdt, X_test, y_test, cv=splits, scoring=\"roc_auc\", n_jobs=-1)\n", - "\n", - "# Printing results and indicating best fold\n", - "print(\"accuracy: \",cv_acc, \" -> best fold =\", np.argmax(cv_acc) )\n", - "print(\"-logloss: \",cv_los, \" -> best fold =\", np.argmax(cv_los) )\n", - "print(\"roc_auc: \",cv_auc, \" -> best fold =\", np.argmax(cv_auc) )\n", - "bestfold = np.argmax(cv_acc)" - ], - "execution_count": null, - "outputs": [] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "Early stopping defines a maximum number of rounds the cross-validation metric (we'll use 'error'=1-accuracy) is allowed to not improve before training is terminated. As is standard, we will be reverting back to a 'previous best' model based on test sample score, this helps avoid overtraining. Early stopping prevents us training too many of extra models thus saving time. 
Set the limit too small though and your training might be cut off prematurely." - ] - }, - { - "cell_type": "code", - "metadata": {}, - "source": [ - "def modelfit(alg, metric, params, label, predictors, kfold, fbest, early_stop=10):\n", - "\n", - " # Loading data split inputs providing best fold result\n", - " for k, (train, test) in enumerate(kf.split(params)):\n", - " if (k==fbest):\n", - " X_train, X_test = params.iloc[train], params.iloc[test]\n", - " y_train, y_test = label.iloc[train], label.iloc[test]\n", - "\n", - " # Defining data in terms of training variables and class label\n", - " xgb_param = alg.get_xgb_params()\n", - " data = xgb.DMatrix(params, label=label, feature_names=predictors, nthread=-1)\n", - " \n", - " # Runs timed CV on our model using early stopping based on our metric\n", - " stime = time.time()\n", - " cvresult = xgb.cv(xgb_param,\n", - " data,\n", - " num_boost_round=alg.get_params()['n_estimators'],\n", - " #nfold=cv_folds, # to use in build folding\n", - " folds=kfold, # use -> ignores nfold \n", - " metrics=metric,\n", - " early_stopping_rounds=early_stop)\n", - " alg.set_params(n_estimators=cvresult.shape[0])\n", - " print(\"\\nXGBoost early-stop folding --- %s seconds ---\" % (time.time() - stime))\n", - "\n", - " # Fitting the algorithm on the data with CV evaluation early stopping\n", - " stime = time.time()\n", - " alg.fit(X_train, y_train, eval_metric=[\"logloss\",\"error\"],\n", - " eval_set=[(X_train, y_train), (X_test, y_test)],\n", - " verbose=False, early_stopping_rounds=early_stop)\n", - " training_monitor(alg)\n", - " print(\"XGBoost early-stop limit --- %s seconds ---\" % (time.time() - stime))\n", - "\n", - " # Predicting training set:\n", - " train_predictions = alg.predict(X_train) \n", - " test_predictions = alg.predict(X_test)\n", - "\n", - " # Printing model report: \n", - " print(\"\\nModel Report : best iteration \"+str(cvresult.shape[0]))\n", - " print(\"Train Accuracy : \"+str(metrics.accuracy_score(y_train, 
train_predictions)))\n", - " print(\"Test Accuracy : \"+str(metrics.accuracy_score(y_test, test_predictions)))\n", - " return cvresult.shape[0]" - ], - "execution_count": null, - "outputs": [] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "This function incorporates the k-folding CV and early stopping, saving not only the optimal model but also the index of its training iteration. This means, in our subsequent steps, we can apply an upper limit on training for models based on the convergence of the default hyperparameters, saving us some time. " - ] - }, - { - "cell_type": "code", - "metadata": {}, - "source": [ - "# Defining model with high maximum estimators for use with early stopping\n", - "bdt_es = XGBClassifier(learning_rate = LR, n_estimators=1000,\n", - " # Default values of other hyperparamters\n", - " #max_depth=6, min_child_weight=1,\n", - " #gamma=0, subsample=0.8,\n", - " #colsample_bytree=0.8, scale_pos_weight=1,\n", - " #objective='binary:logistic', # default for binary classification\n", - " #objective='mutli:softprob', num_class=3, # for multiclassifiers\n", - " seed=123, n_threads=-1)\n", - "\n", - "# Timing the CV using early stopping\n", - "stime = time.time()\n", - "estimators = modelfit(bdt_es, \"error\", X1, y1, training_columns, kf, bestfold)\n", - "print(\"\\nmodelfit(bdt_es) --- %s seconds ---\" % (time.time() - stime))\n", - "\n", - "# Saving model predictions\n", - "for df in [mc_df, bkg_df, data_df, training_data]:\n", - " df['XGBes'] = bdt_es.predict_proba(df[training_columns])[:,1]" - ], - "execution_count": null, - "outputs": [] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "This provides us with an improved model as well as a benchmark to test against in both performance and training efficiency. 
When training using new combinations of hyperparameters, the maximum number of estimators from our model report will cut off any new models improving more slowly than our default, while, for more efficient models, the early stopping will kick in." - ] - }, - { - "cell_type": "code", - "metadata": {}, - "source": [ - "# Drawing plot to compare model response for signal and background classes\n", - "plt.figure()\n", - "plot_comparision('XGBcv', mc_df, bkg_df)\n", - "plot_comparision('XGBes', mc_df, bkg_df)\n", - "\n", - "# Drawing comaprison of the signal efficiency vs background rejection curve (ROC)\n", - "plt.figure()\n", - "plot_roc(bdt_cv, training_data, training_columns)\n", - "plot_roc(bdt_es, training_data, training_columns)\n", - "\n", - "# Drawing signal significance comparison as a function of minimum cut on model response\n", - "plt.figure()\n", - "bdt_cut_cv = plot_significance(bdt_cv, training_data, training_columns, \"bdt_cv\")\n", - "bdt_cut_es = plot_significance(bdt_es, training_data, training_columns, \"bdt_es\")" - ], - "execution_count": null, - "outputs": [] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "### Hyperameter optimisation\n", - "\n", - "Below we provide a \"grid\" of hyperparameters, defining the structure of the trees and constraints on the learning, but there are many more values to choose from and a larger parameter space to be explored. These optimsations are very problem specific and their impact will have to be weighed against the computing resources and timeframe you have at your disposal. For the sake of expedient demonstration we are comparing the default parameters to only one predetermined variation in 2 parameters. 
" - ] - }, - { - "cell_type": "code", - "metadata": {}, - "source": [ - "# Define a function that performs a gridscan of HPs\n", - "def hpgridscan(alg, metric, params, label, kfold, fbest, early_stop=10):\n", - "\n", - " # Load data fold with best performance\n", - " for k, (train, test) in enumerate(kf.split(params)):\n", - " if (k==fbest):\n", - " X_train, X_test = params.iloc[train], params.iloc[test]\n", - " y_train, y_test = label.iloc[train], label.iloc[test]\n", - "\n", - " # Define a dictionary of numpy arrays for our HPs\n", - " params = {\n", - " 'max_depth':np.array([7]),\n", - " 'min_child_weight':np.array([3]),\n", - " #'max_depth':np.arange( 5, 9, 1 ),\n", - " #'min_child_weight':np.arange( 1, 5, 1 ),\n", - " ##'gamma':np.arange( 0.0, 1.0, 0.1 ),\n", - " ##'colsample_bytree':np.arange( 0.4, 1.0, 0.1 ),\n", - " ##'subsample':np.arange( 0.4, 1.0, 0.1 ),\n", - " ##'scale_pos_weight':np.arange( 0.4, 1.6, 0.1 )\n", - " }\n", - "\n", - " # Perform timed grid scan with established n_estimator cutoff and early stopping\n", - " stime = time.time()\n", - " gs = GridSearchCV(estimator=alg,\n", - " param_grid=params,\n", - " scoring=metric,\n", - " #iid=False,\n", - " cv=kf,\n", - " n_jobs=-1) \n", - " gs.fit(X_train, y_train, eval_metric=[\"logloss\",\"error\"],\n", - " eval_set=[(X_train, y_train), (X_test, y_test)],\n", - " verbose=False, early_stopping_rounds=early_stop)\n", - " print(\"XGBoost grid-scan --- %s seconds ---\" % (time.time() - stime))\n", - "\n", - " # Return suggested parameters, performance and best model\n", - " training_monitor(gs.best_estimator_)\n", - " print(\"Suggestion:\", gs.best_params_)\n", - " print(\"Accuracy:\" ,gs.best_score_)\n", - " return gs" - ], - "execution_count": null, - "outputs": [] - }, - { - "cell_type": "code", - "metadata": {}, - "source": [ - "# Running with estimators maximum for shortened training\n", - "bdt_st = XGBClassifier( learning_rate = LR, n_estimators=estimators,\n", - " seed=123, n_threads=-1)\n", - 
"\n", - "# Running timed hyperparameter gridscan\n", - "stime = time.time()\n", - "gs = hpgridscan(bdt_st, \"accuracy\", X1, y1, kf, bestfold)\n", - "bdt_gs = gs.best_estimator_\n", - "print(\"\\nhpgridscan(bdt_st) --- %s seconds ---\" % (time.time() - stime))\n", - "\n", - "# Get model predictions\n", - "for df in [mc_df, bkg_df, data_df, training_data]:\n", - " df['XGBgs'] = bdt_gs.predict_proba(df[training_columns])[:,1]" - ], - "execution_count": null, - "outputs": [] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "Even this naive grid scan, using the same fold as before for fair comparison, can provide significant improvements as demonstrated above. These may be pushed further by including more hyperparameters for a trade off with processing time. However, even with parrallisation these tasks can take hours or longer and might only provide improvement of O(>1%)." - ] - }, - { - "cell_type": "code", - "metadata": {}, - "source": [ - "## We could define a model using optimal hyperparameters from our grid scan\n", - "#bdt_opt = XGBClassifier( learning_rate = LR, n_estimators=1000, \n", - "# max_depth=gs.best_params_['max_depth'],\n", - "# min_child_weight=gs.best_params_['min_child_weight'], \n", - "# seed=123, n_threads=-1 )\n", - "\n", - "## Run with CV early stopping\n", - "#stime = time.time()\n", - "#estimators = modelfit(bdt_opt, 'error', X1, y1, training_columns, kf, bestfold)\n", - "#print(\"\\nmodelfit(bdt_opt) --- %s seconds ---\" % (time.time() - stime))\n", - "\n", - "## Get model predictions\n", - "#for df in [mc_df, bkg_df, data_df, training_data]:\n", - "# df['XGBopt'] = bdt_opt.predict_proba(df[training_columns])[:,1]" - ], - "execution_count": null, - "outputs": [] - }, - { - "cell_type": "code", - "metadata": {}, - "source": [ - "# Comapring model response from the end of last session to the end of this one\n", - "plt.figure()\n", - "plot_comparision('XGB', mc_df, bkg_df)\n", - "plot_comparision('XGBgs', mc_df, bkg_df)\n", 
- "\n", - "# Comparing model performance for each level of tuning\n", - "plt.figure()\n", - "plot_roc(bdt, training_data, training_columns)\n", - "plot_roc(bdt_cv, training_data, training_columns)\n", - "plot_roc(bdt_es, training_data, training_columns)\n", - "plot_roc(bdt_gs, training_data, training_columns)\n", - "#plot_roc(bdt_opt, training_data, training_columns)" - ], - "execution_count": null, - "outputs": [] - }, - { - "cell_type": "code", - "metadata": {}, - "source": [ - "# Comparing the impact on projected performance at each stage of the tutorial\n", - "plt.figure()\n", - "bdt_cut = plot_significance(bdt, training_data, training_columns, \"bdt\")\n", - "bdt_cv_cut = plot_significance(bdt_cv, training_data, training_columns, \"bdt_cv\")\n", - "bdt_es_cut = plot_significance(bdt_es, training_data, training_columns, \"bdt_es\")\n", - "bdt_gs_cut = plot_significance(bdt_gs, training_data, training_columns, \"bdt_gs\")\n", - "#bdt_opt_cut = plot_significance(bdt_opt, training_data, training_columns, \"bdt_opt\")\n", - "\n", - "# Comparing best cuts impact on mass for original and tuned model\n", - "plt.figure()\n", - "data_bdt_cut = data_df.query('XGB > %f' %bdt_cut )\n", - "plot_mass(data_bdt_cut, label='XGB default', norm=True)\n", - "data_gs_cut = data_df.query('XGBgs > %f' %bdt_gs_cut )\n", - "plot_mass(data_gs_cut, label='XGB tuned', norm=True)\n", - "plt.legend(loc='best')" - ], - "execution_count": null, - "outputs": [] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "You can also choose a higher learning rate to perform course scans of your space and decrease it again to retrain your final model. If you can afford to, it might be best to include learning rate itself as a parameter in your grid. With some libraries you can specify your choice of kernel. 
Both these choices will impact your optimal maximum number of iterations, so setting it sufficiently high and using early stopping might be a good strategy.\n", - "\n", - "For less exhaustive and non-discritised methods try smart combinations of the following to perform adaptive scans or build your own: \n", - "* sklearn.model_selection.RandomizedSearchCV\n", - "* sklearn.model_selection.GridSearchCV\n", - "\n", - "Moving to higher dimentional optimisation problems may require more sophisticated solutions:\n", - "* skopt.BayesSearchCV\n", - "* hyperopt.tpe\n\n", - "Full stats plots saved here: bit.ly/LHCb_XGB_Tuning \n", - "Run with full stats by removing entrystop at max_events in cell 8." - ] - }, - { - "cell_type": "code", - "metadata": {}, - "source": [ - "# Final lesson time and processing time check\n", - "print(\"Notebook real time --- %s seconds ---\" % (time.time() - stt))\n", - "print(\"Notebook CPU time --- %s seconds ---\" % (time.process_time() - stc))" - ], - "execution_count": null, - "outputs": [] - } - ], - "metadata": { - "kernelspec": { - "display_name": "Python 3", - "language": "python", - "name": "python3" - }, - "language_info": { - "codemirror_mode": { - "name": "ipython", - "version": 3 - }, - "file_extension": ".py", - "mimetype": "text/x-python", - "name": "python", - "nbconvert_exporter": "python", - "pygments_lexer": "ipython3", - "version": "3.7.3" - } - }, - "nbformat": 4, - "nbformat_minor": 4 + "cells": [ + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Model tuning setup" + ] + }, + { + "cell_type": "code", + "metadata": {}, + "source": [ + "#%store -r training_data\n", + "#%store -r training_columns\n", + "#%store -r bkg_df\n", + "#%store -r mc_df\n", + "#%store -r data_df" + ], + "execution_count": null, + "outputs": [] + }, + { + "cell_type": "code", + "metadata": {}, + "source": [ + "#@title\n", + "#!pip install uproot\n", + "#!pip install sklearn\n", + "\n", + "import time\n", + "\n", + "import numpy as 
np\n", + "import pandas as pd\n", + "import uproot\n", + "import xgboost as xgb\n", + "from matplotlib import pyplot as plt\n", + "from sklearn import metrics\n", + "from sklearn.ensemble import AdaBoostClassifier, GradientBoostingClassifier\n", + "from sklearn.metrics import auc, roc_curve\n", + "from sklearn.model_selection import (GridSearchCV, KFold, cross_val_score,\n", + " cross_validate, train_test_split)\n", + "from xgboost.sklearn import XGBClassifier" + ], + "execution_count": null, + "outputs": [] + }, + { + "cell_type": "code", + "metadata": {}, + "source": [ + "# Time and processing check for the lesson\n", + "stt = time.time()\n", + "stc = time.process_time()" + ], + "execution_count": null, + "outputs": [] + }, + { + "cell_type": "code", + "metadata": {}, + "source": [ + "def plot_mass(df, label=\"\", norm=True):\n", + " counts, bins, _ = plt.hist(df['Jpsi_M'], label=label, bins=100, range=[2.75, 3.5], histtype='step', density=norm)\n", + " # You can also use LaTeX in the axis label\n", + " plt.xlabel('$J/\\\\psi$ mass [GeV]')\n", + " plt.xlim(bins[0], bins[-1])" + ], + "execution_count": null, + "outputs": [] + }, + { + "cell_type": "code", + "metadata": {}, + "source": [ + "def plot_comparision(var, mc_df, bkg_df):\n", + " _, bins, _ = plt.hist(mc_df[var], bins=100, histtype='step', label='MC', density=1)\n", + " _, bins, _ = plt.hist(bkg_df[var], bins=bins, histtype='step', label='Background', density=1)\n", + " plt.xlabel(var)\n", + " plt.xlim(bins[0], bins[-1])\n", + " plt.legend(loc='best')" + ], + "execution_count": null, + "outputs": [] + }, + { + "cell_type": "code", + "metadata": {}, + "source": [ + "def plot_roc(bdt, training_data, training_columns, label=None):\n", + " y_score = bdt.predict_proba(training_data[training_columns])[:,1]\n", + " fpr, tpr, thresholds = roc_curve(training_data['catagory'], y_score)\n", + " area = auc(fpr, tpr)\n", + "\n", + " plt.plot([0, 1], [0, 1], color='grey', linestyle='--')\n", + " if label:\n", + " 
plt.plot(fpr, tpr, label=f'{label} (area = {area:.2f})')\n", + " else:\n", + " plt.plot(fpr, tpr, label=f'ROC curve (area = {area:.2f})')\n", + " plt.xlim(0.0, 1.0)\n", + " plt.ylim(0.0, 1.0)\n", + " plt.xlabel('False Positive Rate')\n", + " plt.ylabel('True Positive Rate')\n", + " plt.legend(loc='lower right')\n", + " # We can make the plot look nicer by forcing the grid to be square\n", + " plt.gca().set_aspect('equal', adjustable='box')" + ], + "execution_count": null, + "outputs": [] + }, + { + "cell_type": "code", + "metadata": {}, + "source": [ + "def plot_significance(bdt, training_data, training_columns, label):\n", + " y_score = bdt.predict_proba(training_data[training_columns])[:,1]\n", + " fpr, tpr, thresholds = roc_curve(training_data['catagory'], y_score)\n", + " \n", + " n_sig = 1200\n", + " n_bkg = 23000\n", + " S = n_sig*tpr + (n_sig*tpr==0)*1\n", + " B = n_bkg*fpr + (n_bkg*tpr==0)*1\n", + " metric = S/np.sqrt(S+B)\n", + "\n", + " plt.plot(thresholds, metric, label=label)\n", + " plt.xlabel('BDT cut value')\n", + " plt.ylabel('$\\\\frac{S}{\\\\sqrt{S+B}}$')\n", + " plt.xlim(0, 1.0)\n", + "\n", + " optimum = np.max(metric)\n", + " optimal_cut = thresholds[np.argmax(metric)]\n", + " print(label, \": S/sqrt(S+B) =\", optimum, \" at x =\", optimal_cut)\n", + " plt.axvline(x=optimal_cut, color='black', linewidth=1.0, linestyle='--')\n", + "\n", + " return optimal_cut" + ], + "execution_count": null, + "outputs": [] + }, + { + "cell_type": "code", + "metadata": {}, + "source": [ + "#max_entries = 1000 # try running with low stats for bug fixing your changes quickly \n", + "data_df = uproot.open('https://cern.ch/starterkit/data/advanced-python-2018/real_data.root',\n", + " httpsource={'chunkbytes': 1024*1024, 'limitbytes': 33554432, 'parallel': 64}\n", + " )['DecayTree'].arrays(library='pd')#,entry_stop=max_entries)\n", + "mc_df = uproot.open('https://cern.ch/starterkit/data/advanced-python-2018/simulated_data.root',\n", + " httpsource={'chunkbytes': 
1024*1024, 'limitbytes': 33554432, 'parallel': 64}\n", + " )['DecayTree'].arrays(library='pd')#,entry_stop=max_entries)\n", + "bkg_df = data_df.query('~(3.0 < Jpsi_M < 3.2)')\n", + "\n", + "for df in [mc_df, data_df, bkg_df]:\n", + " df.eval('Jpsi_eta = arctanh(Jpsi_PZ/Jpsi_P)', inplace=True)\n", + " df.eval('mup_P = sqrt(mum_PX**2 + mum_PY**2 + mum_PZ**2)', inplace=True)\n", + " df.eval('mum_P = sqrt(mum_PX**2 + mum_PY**2 + mum_PZ**2)', inplace=True)\n", + "\n", + "bkg_df['catagory'] = 0 # Use 0 for background\n", + "mc_df['catagory'] = 1 # Use 1 for signal\n", + "training_data = pd.concat([bkg_df, mc_df], copy=True, ignore_index=True)\n", + "for df in [mc_df, bkg_df, data_df, training_data]:\n", + " df['IPdiff'] = np.abs(df['mum_PT'] - df['mup_PT'])\n", + " \n", + "training_columns = [\n", + " 'Jpsi_PT',\n", + " 'mup_PT', 'mup_eta', 'mup_ProbNNmu', 'mup_IP',\n", + " 'mum_PT', 'mum_eta', 'mum_ProbNNmu', 'mum_IP',\n", + "]" + ], + "execution_count": null, + "outputs": [] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Previously we trained an XGBClassifier with the default settings, with learning rate = 0.3 and maximum iterations = 100. This cut off to the training process may be limiting the performance of our model. We can monitor the performance of our model as a function of training iteration and stop the training when the gradient approximates zero. 
" + ] + }, + { + "cell_type": "code", + "metadata": {}, + "source": [ + "X1, y1 = training_data[training_columns], training_data['catagory']\n", + "X_train, X_test, y_train, y_test = train_test_split(X1, y1) \n", + "# default test_size = 0.25, this can be varied to suit your data\n", + "\n", + "LR = 0.3 # the coefficient of step size decay, eta, has alias 'learning_rate' with default 0.3\n", + "\n", + "stime = time.time()\n", + "bdt = XGBClassifier(learning_rate = LR, n_estimators=100, seed=123, n_threads=-1)\n", + "bdt.fit(training_data[training_columns], training_data['catagory'])\n", + "print(\"XGBoost --- %s seconds ---\" % (time.time() - stime))\n", + "\n", + "for df in [mc_df, bkg_df, data_df, training_data]:\n", + " df['XGB'] = bdt.predict_proba(df[training_columns])[:,1]" + ], + "execution_count": null, + "outputs": [] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Cross-validation\n", + "\n", + "Splitting the data into randomised subsets for training allows you to monitor your model's performance on the fly using the statistically independent remainder of your sample - this is called cross-validation (CV). We can see below that at the 100th iteration the metrics still show a trend of improvement." 
+ ] + }, + { + "cell_type": "code", + "metadata": {}, + "source": [ + "def training_monitor(alg): \n", + "\n", + " # A model trained with eval_set and eval_metric will return evals_result\n", + " results = alg.evals_result()\n", + " epochs = len(results['validation_0']['logloss'])\n", + " x_axis = range(0, epochs)\n", + "\n", + " # Plotting logLoss as a function of training iteration\n", + " fig, ax = plt.subplots()\n", + " ax.plot(x_axis, results['validation_0']['logloss'], label='Train') # for each eval_set\n", + " if results['validation_1']: ax.plot(x_axis, results['validation_1']['logloss'], label='Test') \n", + " ax.legend()\n", + " plt.ylabel('LogLoss')\n", + " plt.title('LogLoss')\n", + " plt.show()\n", + " \n", + " # Plotting classification error as a function of training iteration\n", + " fig, ax = plt.subplots()\n", + " ax.plot(x_axis, results['validation_0']['error'], label='Train') # for each eval_set\n", + " if results['validation_1']: ax.plot(x_axis, results['validation_1']['error'], label='Test')\n", + " ax.legend()\n", + " plt.ylabel('Error')\n", + " plt.title('Error')\n", + " plt.show()" + ], + "execution_count": null, + "outputs": [] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "This involves training on less data but allows us to monitor progress to check if the model is becoming over-specific to our training sample. The minimisation of loss and classification error are common metrics for model assessment. As shown below, the cost to performance is negligible. If the test sample gradient were to invert this would be considered overtraining and is why monitoring performance without CV can be a time costly pitfall." 
+ ] + }, + { + "cell_type": "code", + "metadata": {}, + "source": [ + "# Defining a model with multi-threading set to maximum\n", + "bdt_cv = XGBClassifier(learning_rate = LR, n_estimators=100, seed=123, n_threads=-1)\n", + "\n", + "# Model fitting with CV and printing out processing time\n", + "stime = time.time()\n", + "bdt_cv.fit(X_train, y_train, eval_metric=[\"logloss\",\"error\"],\n", + " eval_set=[(X_train, y_train), (X_test, y_test)], verbose=False)\n", + "print(\"\\nXGBoost cross-validation --- %s seconds ---\" % (time.time() - stime))\n", + "\n", + "# Writing model predictions out for data\n", + "training_monitor(bdt_cv)\n", + "for df in [mc_df, bkg_df, data_df, training_data]:\n", + " df['XGBcv'] = bdt_cv.predict_proba(df[training_columns])[:,1]" + ], + "execution_count": null, + "outputs": [] + }, + { + "cell_type": "code", + "metadata": {}, + "source": [ + "# Drawing plot of model response for signal and background classes\n", + "plt.figure()\n", + "plot_comparision('XGB', mc_df, bkg_df)\n", + "plot_comparision('XGBcv', mc_df, bkg_df)\n", + "\n", + "# Drawing the signal efficiency vs background rejection curve (ROC)\n", + "plt.figure()\n", + "plot_roc(bdt, training_data, training_columns)\n", + "plot_roc(bdt_cv, training_data, training_columns)\n", + "\n", + "# Drawing signal significance comparison as a function of minimum cut on model response\n", + "plt.figure()\n", + "bdt_cut = plot_significance(bdt, training_data, training_columns, \"bdt\")\n", + "bdt_cv_cut = plot_significance(bdt_cv, training_data, training_columns, \"bdt_cv\")" + ], + "execution_count": null, + "outputs": [] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### $k$-folding & early stopping\n", + "\n", + "Performing CV on each of a number, k, of ways to split your data gives you k models to choose from. Some choose to average the performance across the models from each fold as any instability might imply the model will not be reliable. 
The results below seem stable; each fold provides a consistent performance across multiple metrics, so we'll just choose the best one." + ] + }, + { + "cell_type": "code", + "metadata": {}, + "source": [ + "# Defining the folds with a seed to test consistently \n", + "splits = 4 # to match 0.25 value of train_test_split default though this may not be optimal\n", + "kf = KFold(n_splits=splits, shuffle=True, random_state=123)\n", + "\n", + "# Printing processing time of the kfold cross-validation\n", + "stime = time.time()\n", + "for train, test in kf.split(X1):\n", + " X_train, X_test = X1.iloc[train], X1.iloc[test]\n", + " y_train, y_test = y1.iloc[train], y1.iloc[test]\n", + " bdt.fit(X_train,y_train)\n", + "print(\"\\nXGBoost k-folding --- %s seconds ---\" % (time.time() - stime))\n", + "\n", + "# Calculating scores of each fold using a variety of CV-metrics\n", + "cv_acc = cross_val_score(bdt, X_test, y_test, cv=splits, scoring=\"accuracy\", n_jobs=-1)\n", + "cv_los = cross_val_score(bdt, X_test, y_test, cv=splits, scoring=\"neg_log_loss\", n_jobs=-1)\n", + "cv_auc = cross_val_score(bdt, X_test, y_test, cv=splits, scoring=\"roc_auc\", n_jobs=-1)\n", + "\n", + "# Printing results and indicating best fold\n", + "print(\"accuracy: \",cv_acc, \" -> best fold =\", np.argmax(cv_acc) )\n", + "print(\"-logloss: \",cv_los, \" -> best fold =\", np.argmax(cv_los) )\n", + "print(\"roc_auc: \",cv_auc, \" -> best fold =\", np.argmax(cv_auc) )\n", + "bestfold = np.argmax(cv_acc)" + ], + "execution_count": null, + "outputs": [] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Early stopping defines a maximum number of rounds the cross-validation metric (we'll use 'error'=1-accuracy) is allowed to not improve before training is terminated. As is standard, we will be reverting back to a 'previous best' model based on test sample score, this helps avoid overtraining. Early stopping prevents us training too many extra models thus saving time. 
Set the limit too small though and your training might be cut off prematurely." + ] + }, + { + "cell_type": "code", + "metadata": {}, + "source": [ + "def modelfit(alg, metric, params, label, predictors, kfold, fbest, early_stop=10):\n", + "\n", + " # Loading data split inputs providing best fold result\n", + " for k, (train, test) in enumerate(kf.split(params)):\n", + " if (k==fbest):\n", + " X_train, X_test = params.iloc[train], params.iloc[test]\n", + " y_train, y_test = label.iloc[train], label.iloc[test]\n", + "\n", + " # Defining data in terms of training variables and class label\n", + " xgb_param = alg.get_xgb_params()\n", + " data = xgb.DMatrix(params, label=label, feature_names=predictors, nthread=-1)\n", + " \n", + " # Runs timed CV on our model using early stopping based on our metric\n", + " stime = time.time()\n", + " cvresult = xgb.cv(xgb_param,\n", + " data,\n", + " num_boost_round=alg.get_params()['n_estimators'],\n", + " #nfold=cv_folds, # to use in build folding\n", + " folds=kfold, # use -> ignores nfold \n", + " metrics=metric,\n", + " early_stopping_rounds=early_stop)\n", + " alg.set_params(n_estimators=cvresult.shape[0])\n", + " print(\"\\nXGBoost early-stop folding --- %s seconds ---\" % (time.time() - stime))\n", + "\n", + " # Fitting the algorithm on the data with CV evaluation early stopping\n", + " stime = time.time()\n", + " alg.fit(X_train, y_train, eval_metric=[\"logloss\",\"error\"],\n", + " eval_set=[(X_train, y_train), (X_test, y_test)],\n", + " verbose=False, early_stopping_rounds=early_stop)\n", + " training_monitor(alg)\n", + " print(\"XGBoost early-stop limit --- %s seconds ---\" % (time.time() - stime))\n", + "\n", + " # Predicting training set:\n", + " train_predictions = alg.predict(X_train) \n", + " test_predictions = alg.predict(X_test)\n", + "\n", + " # Printing model report: \n", + " print(\"\\nModel Report : best iteration \"+str(cvresult.shape[0]))\n", + " print(\"Train Accuracy : \"+str(metrics.accuracy_score(y_train, 
train_predictions)))\n", + " print(\"Test Accuracy : \"+str(metrics.accuracy_score(y_test, test_predictions)))\n", + " return cvresult.shape[0]" + ], + "execution_count": null, + "outputs": [] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "This function incorporates the k-folding CV and early stopping, saving not only the optimal model but also the index of its training iteration. This means, in our subsequent steps, we can apply an upper limit on training for models based on the convergence of the default hyperparameters, saving us some time. " + ] + }, + { + "cell_type": "code", + "metadata": {}, + "source": [ + "# Defining model with high maximum estimators for use with early stopping\n", + "bdt_es = XGBClassifier(learning_rate = LR, n_estimators=1000,\n", + " # Default values of other hyperparameters\n", + " #max_depth=6, min_child_weight=1,\n", + " #gamma=0, subsample=0.8,\n", + " #colsample_bytree=0.8, scale_pos_weight=1,\n", + " #objective='binary:logistic', # default for binary classification\n", + " #objective='multi:softprob', num_class=3, # for multiclassifiers\n", + " seed=123, n_threads=-1)\n", + "\n", + "# Timing the CV using early stopping\n", + "stime = time.time()\n", + "estimators = modelfit(bdt_es, \"error\", X1, y1, training_columns, kf, bestfold)\n", + "print(\"\\nmodelfit(bdt_es) --- %s seconds ---\" % (time.time() - stime))\n", + "\n", + "# Saving model predictions\n", + "for df in [mc_df, bkg_df, data_df, training_data]:\n", + " df['XGBes'] = bdt_es.predict_proba(df[training_columns])[:,1]" + ], + "execution_count": null, + "outputs": [] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "This provides us with an improved model as well as a benchmark to test against in both performance and training efficiency. 
When training using new combinations of hyperparameters, the maximum number of estimators from our model report will cut off any new models improving more slowly than our default, while, for more efficient models, the early stopping will kick in." + ] + }, + { + "cell_type": "code", + "metadata": {}, + "source": [ + "# Drawing plot to compare model response for signal and background classes\n", + "plt.figure()\n", + "plot_comparision('XGBcv', mc_df, bkg_df)\n", + "plot_comparision('XGBes', mc_df, bkg_df)\n", + "\n", + "# Drawing comparison of the signal efficiency vs background rejection curve (ROC)\n", + "plt.figure()\n", + "plot_roc(bdt_cv, training_data, training_columns)\n", + "plot_roc(bdt_es, training_data, training_columns)\n", + "\n", + "# Drawing signal significance comparison as a function of minimum cut on model response\n", + "plt.figure()\n", + "bdt_cut_cv = plot_significance(bdt_cv, training_data, training_columns, \"bdt_cv\")\n", + "bdt_cut_es = plot_significance(bdt_es, training_data, training_columns, \"bdt_es\")" + ], + "execution_count": null, + "outputs": [] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Hyperparameter optimisation\n", + "\n", + "Below we provide a \"grid\" of hyperparameters, defining the structure of the trees and constraints on the learning, but there are many more values to choose from and a larger parameter space to be explored. These optimisations are very problem specific and their impact will have to be weighed against the computing resources and timeframe you have at your disposal. For the sake of expedient demonstration we are comparing the default parameters to only one predetermined variation in 2 parameters. 
" + ] + }, + { + "cell_type": "code", + "metadata": {}, + "source": [ + "# Define a function that performs a gridscan of HPs\n", + "\n", + "\n", + "def hpgridscan(alg, metric, params, label, kfold, fbest, early_stop=10):\n", + "\n", + " # Load data fold with best performance\n", + " for k, (train, test) in enumerate(kf.split(params)):\n", + " if (k==fbest):\n", + " X_train, X_test = params.iloc[train], params.iloc[test]\n", + " y_train, y_test = label.iloc[train], label.iloc[test]\n", + "\n", + " # Define a dictionary of numpy arrays for our HPs\n", + " params = {\n", + " 'max_depth':np.array([7]),\n", + " 'min_child_weight':np.array([3]),\n", + " #'max_depth':np.arange( 5, 9, 1 ),\n", + " #'min_child_weight':np.arange( 1, 5, 1 ),\n", + " ##'gamma':np.arange( 0.0, 1.0, 0.1 ),\n", + " ##'colsample_bytree':np.arange( 0.4, 1.0, 0.1 ),\n", + " ##'subsample':np.arange( 0.4, 1.0, 0.1 ),\n", + " ##'scale_pos_weight':np.arange( 0.4, 1.6, 0.1 )\n", + " }\n", + "\n", + " # Perform timed grid scan with established n_estimator cutoff and early stopping\n", + " stime = time.time()\n", + " gs = GridSearchCV(estimator=alg,\n", + " param_grid=params,\n", + " scoring=metric,\n", + " #iid=False,\n", + " cv=kf,\n", + " n_jobs=-1) \n", + " gs.fit(X_train, y_train, eval_metric=[\"logloss\",\"error\"],\n", + " eval_set=[(X_train, y_train), (X_test, y_test)],\n", + " verbose=False, early_stopping_rounds=early_stop)\n", + " print(\"XGBoost grid-scan --- %s seconds ---\" % (time.time() - stime))\n", + "\n", + " # Return suggested parameters, performance and best model\n", + " training_monitor(gs.best_estimator_)\n", + " print(\"Suggestion:\", gs.best_params_)\n", + " print(\"Accuracy:\" ,gs.best_score_)\n", + " return gs" + ], + "execution_count": null, + "outputs": [] + }, + { + "cell_type": "code", + "metadata": {}, + "source": [ + "# Running with estimators maximum for shortened training\n", + "bdt_st = XGBClassifier( learning_rate = LR, n_estimators=estimators,\n", + " seed=123, 
n_threads=-1)\n", + "\n", + "# Running timed hyperparameter gridscan\n", + "stime = time.time()\n", + "gs = hpgridscan(bdt_st, \"accuracy\", X1, y1, kf, bestfold)\n", + "bdt_gs = gs.best_estimator_\n", + "print(\"\\nhpgridscan(bdt_st) --- %s seconds ---\" % (time.time() - stime))\n", + "\n", + "# Get model predictions\n", + "for df in [mc_df, bkg_df, data_df, training_data]:\n", + " df['XGBgs'] = bdt_gs.predict_proba(df[training_columns])[:,1]" + ], + "execution_count": null, + "outputs": [] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Even this naive grid scan, using the same fold as before for fair comparison, can provide significant improvements as demonstrated above. These may be pushed further by including more hyperparameters for a trade off with processing time. However, even with parallelisation these tasks can take hours or longer and might only provide improvement of O(>1%)." + ] + }, + { + "cell_type": "code", + "metadata": {}, + "source": [ + "## We could define a model using optimal hyperparameters from our grid scan\n", + "#bdt_opt = XGBClassifier( learning_rate = LR, n_estimators=1000, \n", + "# max_depth=gs.best_params_['max_depth'],\n", + "# min_child_weight=gs.best_params_['min_child_weight'], \n", + "# seed=123, n_threads=-1 )\n", + "\n", + "## Run with CV early stopping\n", + "#stime = time.time()\n", + "#estimators = modelfit(bdt_opt, 'error', X1, y1, training_columns, kf, bestfold)\n", + "#print(\"\\nmodelfit(bdt_opt) --- %s seconds ---\" % (time.time() - stime))\n", + "\n", + "## Get model predictions\n", + "#for df in [mc_df, bkg_df, data_df, training_data]:\n", + "# df['XGBopt'] = bdt_opt.predict_proba(df[training_columns])[:,1]" + ], + "execution_count": null, + "outputs": [] + }, + { + "cell_type": "code", + "metadata": {}, + "source": [ + "# Comparing model response from the end of last session to the end of this one\n", + "plt.figure()\n", + "plot_comparision('XGB', mc_df, bkg_df)\n", + 
"plot_comparision('XGBgs', mc_df, bkg_df)\n", + "\n", + "# Comparing model performance for each level of tuning\n", + "plt.figure()\n", + "plot_roc(bdt, training_data, training_columns)\n", + "plot_roc(bdt_cv, training_data, training_columns)\n", + "plot_roc(bdt_es, training_data, training_columns)\n", + "plot_roc(bdt_gs, training_data, training_columns)\n", + "#plot_roc(bdt_opt, training_data, training_columns)" + ], + "execution_count": null, + "outputs": [] + }, + { + "cell_type": "code", + "metadata": {}, + "source": [ + "# Comparing the impact on projected performance at each stage of the tutorial\n", + "plt.figure()\n", + "bdt_cut = plot_significance(bdt, training_data, training_columns, \"bdt\")\n", + "bdt_cv_cut = plot_significance(bdt_cv, training_data, training_columns, \"bdt_cv\")\n", + "bdt_es_cut = plot_significance(bdt_es, training_data, training_columns, \"bdt_es\")\n", + "bdt_gs_cut = plot_significance(bdt_gs, training_data, training_columns, \"bdt_gs\")\n", + "#bdt_opt_cut = plot_significance(bdt_opt, training_data, training_columns, \"bdt_opt\")\n", + "\n", + "# Comparing best cuts impact on mass for original and tuned model\n", + "plt.figure()\n", + "data_bdt_cut = data_df.query('XGB > %f' %bdt_cut )\n", + "plot_mass(data_bdt_cut, label='XGB default', norm=True)\n", + "data_gs_cut = data_df.query('XGBgs > %f' %bdt_gs_cut )\n", + "plot_mass(data_gs_cut, label='XGB tuned', norm=True)\n", + "plt.legend(loc='best')" + ], + "execution_count": null, + "outputs": [] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "You can also choose a higher learning rate to perform coarse scans of your space and decrease it again to retrain your final model. If you can afford to, it might be best to include learning rate itself as a parameter in your grid. With some libraries you can specify your choice of kernel. 
Both these choices will impact your optimal maximum number of iterations, so setting it sufficiently high and using early stopping might be a good strategy.\n", + "\n", + "For less exhaustive and non-discretised methods try smart combinations of the following to perform adaptive scans or build your own: \n", + "* sklearn.model_selection.RandomizedSearchCV\n", + "* sklearn.model_selection.GridSearchCV\n", + "\n", + "Moving to higher dimensional optimisation problems may require more sophisticated solutions:\n", + "* skopt.BayesSearchCV\n", + "* hyperopt.tpe\n\n", + "Full stats plots saved here: bit.ly/LHCb_XGB_Tuning \n", + "Run with full stats by removing entry_stop at max_entries in cell 8." + ] + }, + { + "cell_type": "code", + "metadata": {}, + "source": [ + "# Final lesson time and processing time check\n", + "print(\"Notebook real time --- %s seconds ---\" % (time.time() - stt))\n", + "print(\"Notebook CPU time --- %s seconds ---\" % (time.process_time() - stc))" + ], + "execution_count": null, + "outputs": [] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.7.3" + } + }, + "nbformat": 4, + "nbformat_minor": 4 } From 358f40f11a2c52dfa2e8e7604aeeacbc95672c05 Mon Sep 17 00:00:00 2001 From: "J. V. 
Mead" Date: Mon, 13 Nov 2023 14:41:41 +0100 Subject: [PATCH 48/54] Rename 4bModelTuning.ipynb to 33ModelTuning.ipynb updating name / index to fit current scheme --- advanced-python/{4bModelTuning.ipynb => 33ModelTuning.ipynb} | 0 1 file changed, 0 insertions(+), 0 deletions(-) rename advanced-python/{4bModelTuning.ipynb => 33ModelTuning.ipynb} (100%) diff --git a/advanced-python/4bModelTuning.ipynb b/advanced-python/33ModelTuning.ipynb similarity index 100% rename from advanced-python/4bModelTuning.ipynb rename to advanced-python/33ModelTuning.ipynb From 697490a685d62d3dc3dcbd7e5f826b93934f11e0 Mon Sep 17 00:00:00 2001 From: Jonas Eschle Date: Tue, 28 Nov 2023 12:56:44 +0100 Subject: [PATCH 49/54] Update README.md --- advanced-python/README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/advanced-python/README.md b/advanced-python/README.md index 3e324eaa..2dde2d2d 100644 --- a/advanced-python/README.md +++ b/advanced-python/README.md @@ -15,9 +15,9 @@ a knowledge base that one can always come back to lock up things. 
12AdvancedClasses.ipynb 20DataAndPlotting.ipynb 30Classification.ipynb - 4bModelTuning.ipynb 31ClassificationExtension.ipynb 32BoostingToUniformity.ipynb + 33ModelTuning.ipynb 40Histograms.ipynb 45DemoReweighting.ipynb 50LikelihoodInference.ipynb From 1ab01ecf7998696242e1a7893c71336ffa0dd047 Mon Sep 17 00:00:00 2001 From: James Mead Date: Tue, 28 Nov 2023 14:28:52 +0100 Subject: [PATCH 50/54] removed ci.yaml --- .github/workflows/ci.yaml | 31 ------------------------------- 1 file changed, 31 deletions(-) delete mode 100644 .github/workflows/ci.yaml diff --git a/.github/workflows/ci.yaml b/.github/workflows/ci.yaml deleted file mode 100644 index 9c70ce08..00000000 --- a/.github/workflows/ci.yaml +++ /dev/null @@ -1,31 +0,0 @@ -name: SK Build -on: [push, pull_request] - -jobs: - build: - name: Set up Python 3.7 - runs-on: ubuntu-latest - #strategy: - # fail-fast: true - # max-parallel: -1 - # matrix: - # go: ["1.12.x", "1.13.x"] - steps: - - uses: actions/checkout@v1 - - name: Install dependencies - run: | - source ${CONDA}/etc/profile.d/conda.sh - conda config --add channels conda-forge - conda env create -f environment.yml -n my-analysis-env - conda activate my-analysis-env - conda install --yes jupyterlab - pip install git+https://github.com/chrisburr/recommonmark.git@patch-1 - pip install starterkit-ci>=0.0.12 - - name: Starterkit CI - run: | - source ${CONDA}/bin/activate my-analysis-env - starterkit_ci build - starterkit_ci check - # - name: Test - # run: | - # test $TRAVIS_PULL_REQUEST == "false" && test $TRAVIS_BRANCH == "master" && starterkit_ci deploy From 16006727657466c0e86d33c8a9d247cd727439f6 Mon Sep 17 00:00:00 2001 From: "J. V. 
Mead" Date: Tue, 28 Nov 2023 14:41:56 +0100 Subject: [PATCH 51/54] removing unnecessary changes to 10Basics.ipynb --- advanced-python/10Basics.ipynb | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/advanced-python/10Basics.ipynb b/advanced-python/10Basics.ipynb index 751e29b1..93be8a95 100644 --- a/advanced-python/10Basics.ipynb +++ b/advanced-python/10Basics.ipynb @@ -481,7 +481,7 @@ }, "outputs": [], "source": [ - "get?" + "{'a': 'b'}.get?" ] }, { From d23e76710e3cd73c8a55e123b495bea65a75e98f Mon Sep 17 00:00:00 2001 From: "J. V. Mead" Date: Tue, 28 Nov 2023 14:42:52 +0100 Subject: [PATCH 52/54] Removing unnecessary changes to 11AdvancedPython.ipynb --- advanced-python/11AdvancedPython.ipynb | 4 ---- 1 file changed, 4 deletions(-) diff --git a/advanced-python/11AdvancedPython.ipynb b/advanced-python/11AdvancedPython.ipynb index af848aa1..ea1344cb 100644 --- a/advanced-python/11AdvancedPython.ipynb +++ b/advanced-python/11AdvancedPython.ipynb @@ -380,8 +380,6 @@ "outputs": [], "source": [ "# SOLUTION\n", - "\n", - "\n", "@contextlib.contextmanager\n", "def func(x):\n", " yield x\n", @@ -621,8 +619,6 @@ "outputs": [], "source": [ "# SOLUTION\n", - "\n", - "\n", "def timed_func(func):\n", " def wrapped_func(*args, **kwargs):\n", " print(args)\n", From 4ad08cfa6000501ce7c4f743a0b015d1d7d2479d Mon Sep 17 00:00:00 2001 From: James Mead Date: Tue, 28 Nov 2023 14:53:54 +0100 Subject: [PATCH 53/54] added warning about class definitions dependance upon side bands --- advanced-python/33ModelTuning.ipynb | 196 ++++++++++++++++------------ 1 file changed, 113 insertions(+), 83 deletions(-) diff --git a/advanced-python/33ModelTuning.ipynb b/advanced-python/33ModelTuning.ipynb index f24a99ec..9eaef5a1 100644 --- a/advanced-python/33ModelTuning.ipynb +++ b/advanced-python/33ModelTuning.ipynb @@ -9,20 +9,22 @@ }, { "cell_type": "code", + "execution_count": null, "metadata": {}, + "outputs": [], "source": [ "#%store -r training_data\n", "#%store -r 
training_columns\n", "#%store -r bkg_df\n", "#%store -r mc_df\n", "#%store -r data_df" - ], - "execution_count": null, - "outputs": [] + ] }, { "cell_type": "code", + "execution_count": null, "metadata": {}, + "outputs": [], "source": [ "#@title\n", "#!pip install uproot\n", @@ -41,37 +43,37 @@ "from sklearn.model_selection import (GridSearchCV, KFold, cross_val_score,\n", " cross_validate, train_test_split)\n", "from xgboost.sklearn import XGBClassifier" - ], - "execution_count": null, - "outputs": [] + ] }, { "cell_type": "code", + "execution_count": null, "metadata": {}, + "outputs": [], "source": [ "# Time and processing check for the lesson\n", "stt = time.time()\n", "stc = time.process_time()" - ], - "execution_count": null, - "outputs": [] + ] }, { "cell_type": "code", + "execution_count": null, "metadata": {}, + "outputs": [], "source": [ "def plot_mass(df, label=\"\", norm=True):\n", " counts, bins, _ = plt.hist(df['Jpsi_M'], label=label, bins=100, range=[2.75, 3.5], histtype='step', density=norm)\n", " # You can also use LaTeX in the axis label\n", " plt.xlabel('$J/\\\\psi$ mass [GeV]')\n", " plt.xlim(bins[0], bins[-1])" - ], - "execution_count": null, - "outputs": [] + ] }, { "cell_type": "code", + "execution_count": null, "metadata": {}, + "outputs": [], "source": [ "def plot_comparision(var, mc_df, bkg_df):\n", " _, bins, _ = plt.hist(mc_df[var], bins=100, histtype='step', label='MC', density=1)\n", @@ -79,13 +81,13 @@ " plt.xlabel(var)\n", " plt.xlim(bins[0], bins[-1])\n", " plt.legend(loc='best')" - ], - "execution_count": null, - "outputs": [] + ] }, { "cell_type": "code", + "execution_count": null, "metadata": {}, + "outputs": [], "source": [ "def plot_roc(bdt, training_data, training_columns, label=None):\n", " y_score = bdt.predict_proba(training_data[training_columns])[:,1]\n", @@ -104,18 +106,18 @@ " plt.legend(loc='lower right')\n", " # We can make the plot look nicer by forcing the grid to be square\n", " plt.gca().set_aspect('equal', 
adjustable='box')" - ], - "execution_count": null, - "outputs": [] + ] }, { "cell_type": "code", + "execution_count": null, "metadata": {}, + "outputs": [], "source": [ "def plot_significance(bdt, training_data, training_columns, label):\n", " y_score = bdt.predict_proba(training_data[training_columns])[:,1]\n", " fpr, tpr, thresholds = roc_curve(training_data['catagory'], y_score)\n", - " \n", + "\n", " n_sig = 1200\n", " n_bkg = 23000\n", " S = n_sig*tpr + (n_sig*tpr==0)*1\n", @@ -133,15 +135,15 @@ " plt.axvline(x=optimal_cut, color='black', linewidth=1.0, linestyle='--')\n", "\n", " return optimal_cut" - ], - "execution_count": null, - "outputs": [] + ] }, { "cell_type": "code", + "execution_count": null, "metadata": {}, + "outputs": [], "source": [ - "#max_entries = 1000 # try running with low stats for bug fixing your changes quickly \n", + "#max_entries = 1000 # try running with low stats for bug fixing your changes quickly\n", "data_df = uproot.open('https://cern.ch/starterkit/data/advanced-python-2018/real_data.root',\n", " httpsource={'chunkbytes': 1024*1024, 'limitbytes': 33554432, 'parallel': 64}\n", " )['DecayTree'].arrays(library='pd')#,entry_stop=max_entries)\n", @@ -160,15 +162,13 @@ "training_data = pd.concat([bkg_df, mc_df], copy=True, ignore_index=True)\n", "for df in [mc_df, bkg_df, data_df, training_data]:\n", " df['IPdiff'] = np.abs(df['mum_PT'] - df['mup_PT'])\n", - " \n", + "\n", "training_columns = [\n", " 'Jpsi_PT',\n", " 'mup_PT', 'mup_eta', 'mup_ProbNNmu', 'mup_IP',\n", " 'mum_PT', 'mum_eta', 'mum_ProbNNmu', 'mum_IP',\n", "]" - ], - "execution_count": null, - "outputs": [] + ] }, { "cell_type": "markdown", @@ -179,10 +179,12 @@ }, { "cell_type": "code", + "execution_count": null, "metadata": {}, + "outputs": [], "source": [ "X1, y1 = training_data[training_columns], training_data['catagory']\n", - "X_train, X_test, y_train, y_test = train_test_split(X1, y1) \n", + "X_train, X_test, y_train, y_test = train_test_split(X1, y1)\n", "# default 
train_size = 0.25, this can be varied to suit your data\n", "\n", "LR = 0.3 # the coefficient of step size decay, eta, has alias 'learning_rate' with default 0.3\n", @@ -194,9 +196,7 @@ "\n", "for df in [mc_df, bkg_df, data_df, training_data]:\n", " df['XGB'] = bdt.predict_proba(df[training_columns])[:,1]" - ], - "execution_count": null, - "outputs": [] + ] }, { "cell_type": "markdown", @@ -209,9 +209,11 @@ }, { "cell_type": "code", + "execution_count": null, "metadata": {}, + "outputs": [], "source": [ - "def training_monitor(alg): \n", + "def training_monitor(alg):\n", "\n", " # A model trained with eval_set and eval_metric will return evals_result\n", " results = alg.evals_result()\n", @@ -221,12 +223,12 @@ " # Plotting logLoss as a function of training iteration\n", " fig, ax = plt.subplots()\n", " ax.plot(x_axis, results['validation_0']['logloss'], label='Train') # for each eval_set\n", - " if results['validation_1']: ax.plot(x_axis, results['validation_1']['logloss'], label='Test') \n", + " if results['validation_1']: ax.plot(x_axis, results['validation_1']['logloss'], label='Test')\n", " ax.legend()\n", " plt.ylabel('LogLoss')\n", " plt.title('LogLoss')\n", " plt.show()\n", - " \n", + "\n", " # Plotting classification error as a function of training iteration\n", " fig, ax = plt.subplots()\n", " ax.plot(x_axis, results['validation_0']['error'], label='Train') # for each eval_set\n", @@ -235,9 +237,7 @@ " plt.ylabel('Error')\n", " plt.title('Error')\n", " plt.show()" - ], - "execution_count": null, - "outputs": [] + ] }, { "cell_type": "markdown", @@ -248,7 +248,9 @@ }, { "cell_type": "code", + "execution_count": null, "metadata": {}, + "outputs": [], "source": [ "# Defining a model with multi-threading set to maximum\n", "bdt_cv = XGBClassifier(learning_rate = LR, n_estimators=100, seed=123, n_threads=-1)\n", @@ -263,13 +265,13 @@ "training_monitor(bdt_cv)\n", "for df in [mc_df, bkg_df, data_df, training_data]:\n", " df['XGBcv'] = 
bdt_cv.predict_proba(df[training_columns])[:,1]" - ], - "execution_count": null, - "outputs": [] + ] }, { "cell_type": "code", + "execution_count": null, "metadata": {}, + "outputs": [], "source": [ "# Drawing plot of model respone for signal and background classes\n", "plt.figure()\n", @@ -285,9 +287,7 @@ "plt.figure()\n", "bdt_cut = plot_significance(bdt, training_data, training_columns, \"bdt\")\n", "bdt_cv_cut = plot_significance(bdt_cv, training_data, training_columns, \"bdt_cv\")" - ], - "execution_count": null, - "outputs": [] + ] }, { "cell_type": "markdown", @@ -300,9 +300,11 @@ }, { "cell_type": "code", + "execution_count": null, "metadata": {}, + "outputs": [], "source": [ - "# Defining the folds with a seed to test consistently \n", + "# Defining the folds with a seed to test consistently\n", "splits = 4 # to match 0.25 value of test_train_split default though this may not be optimal\n", "kf = KFold(n_splits=splits, shuffle=True, random_state=123)\n", "\n", @@ -324,9 +326,7 @@ "print(\"-logloss: \",cv_los, \" -> best fold =\", np.argmax(cv_los) )\n", "print(\"roc_auc: \",cv_auc, \" -> best fold =\", np.argmax(cv_auc) )\n", "bestfold = np.argmax(cv_acc)" - ], - "execution_count": null, - "outputs": [] + ] }, { "cell_type": "markdown", @@ -337,7 +337,9 @@ }, { "cell_type": "code", + "execution_count": null, "metadata": {}, + "outputs": [], "source": [ "def modelfit(alg, metric, params, label, predictors, kfold, fbest, early_stop=10):\n", "\n", @@ -350,14 +352,14 @@ " # Defining data in terms of training variables and class label\n", " xgb_param = alg.get_xgb_params()\n", " data = xgb.DMatrix(params, label=label, feature_names=predictors, nthread=-1)\n", - " \n", + "\n", " # Runs timed CV on our model using early stopping based on our metric\n", " stime = time.time()\n", " cvresult = xgb.cv(xgb_param,\n", " data,\n", " num_boost_round=alg.get_params()['n_estimators'],\n", " #nfold=cv_folds, # to use in build folding\n", - " folds=kfold, # use -> ignores 
nfold \n", + " folds=kfold, # use -> ignores nfold\n", " metrics=metric,\n", " early_stopping_rounds=early_stop)\n", " alg.set_params(n_estimators=cvresult.shape[0])\n", @@ -372,17 +374,15 @@ " print(\"XGBoost early-stop limit --- %s seconds ---\" % (time.time() - stime))\n", "\n", " # Predicting training set:\n", - " train_predictions = alg.predict(X_train) \n", + " train_predictions = alg.predict(X_train)\n", " test_predictions = alg.predict(X_test)\n", "\n", - " # Printing model report: \n", + " # Printing model report:\n", " print(\"\\nModel Report : best iteration \"+str(cvresult.shape[0]))\n", " print(\"Train Accuracy : \"+str(metrics.accuracy_score(y_train, train_predictions)))\n", " print(\"Test Accuracy : \"+str(metrics.accuracy_score(y_test, test_predictions)))\n", " return cvresult.shape[0]" - ], - "execution_count": null, - "outputs": [] + ] }, { "cell_type": "markdown", @@ -393,7 +393,9 @@ }, { "cell_type": "code", + "execution_count": null, "metadata": {}, + "outputs": [], "source": [ "# Defining model with high maximum estimators for use with early stopping\n", "bdt_es = XGBClassifier(learning_rate = LR, n_estimators=1000,\n", @@ -413,9 +415,7 @@ "# Saving model predictions\n", "for df in [mc_df, bkg_df, data_df, training_data]:\n", " df['XGBes'] = bdt_es.predict_proba(df[training_columns])[:,1]" - ], - "execution_count": null, - "outputs": [] + ] }, { "cell_type": "markdown", @@ -426,7 +426,9 @@ }, { "cell_type": "code", + "execution_count": null, "metadata": {}, + "outputs": [], "source": [ "# Drawing plot to compare model response for signal and background classes\n", "plt.figure()\n", @@ -442,9 +444,7 @@ "plt.figure()\n", "bdt_cut_cv = plot_significance(bdt_cv, training_data, training_columns, \"bdt_cv\")\n", "bdt_cut_es = plot_significance(bdt_es, training_data, training_columns, \"bdt_es\")" - ], - "execution_count": null, - "outputs": [] + ] }, { "cell_type": "markdown", @@ -457,7 +457,9 @@ }, { "cell_type": "code", + "execution_count": null, 
"metadata": {}, + "outputs": [], "source": [ "# Define a function that performs a gridscan of HPs\n", "\n", @@ -489,7 +491,7 @@ " scoring=metric,\n", " #iid=False,\n", " cv=kf,\n", - " n_jobs=-1) \n", + " n_jobs=-1)\n", " gs.fit(X_train, y_train, eval_metric=[\"logloss\",\"error\"],\n", " eval_set=[(X_train, y_train), (X_test, y_test)],\n", " verbose=False, early_stopping_rounds=early_stop)\n", @@ -500,13 +502,13 @@ " print(\"Suggestion:\", gs.best_params_)\n", " print(\"Accuracy:\" ,gs.best_score_)\n", " return gs" - ], - "execution_count": null, - "outputs": [] + ] }, { "cell_type": "code", + "execution_count": null, "metadata": {}, + "outputs": [], "source": [ "# Running with estimators maximum for shortened training\n", "bdt_st = XGBClassifier( learning_rate = LR, n_estimators=estimators,\n", @@ -521,9 +523,7 @@ "# Get model predictions\n", "for df in [mc_df, bkg_df, data_df, training_data]:\n", " df['XGBgs'] = bdt_gs.predict_proba(df[training_columns])[:,1]" - ], - "execution_count": null, - "outputs": [] + ] }, { "cell_type": "markdown", @@ -534,12 +534,14 @@ }, { "cell_type": "code", + "execution_count": null, "metadata": {}, + "outputs": [], "source": [ "## We could define a model using optimal hyperparameters from our grid scan\n", - "#bdt_opt = XGBClassifier( learning_rate = LR, n_estimators=1000, \n", + "#bdt_opt = XGBClassifier( learning_rate = LR, n_estimators=1000,\n", "# max_depth=gs.best_params_['max_depth'],\n", - "# min_child_weight=gs.best_params_['min_child_weight'], \n", + "# min_child_weight=gs.best_params_['min_child_weight'],\n", "# seed=123, n_threads=-1 )\n", "\n", "## Run with CV early stopping\n", @@ -550,13 +552,13 @@ "## Get model predictions\n", "#for df in [mc_df, bkg_df, data_df, training_data]:\n", "# df['XGBopt'] = bdt_opt.predict_proba(df[training_columns])[:,1]" - ], - "execution_count": null, - "outputs": [] + ] }, { "cell_type": "code", + "execution_count": null, "metadata": {}, + "outputs": [], "source": [ "# Comapring model 
response from the end of last session to the end of this one\n", "plt.figure()\n", @@ -570,13 +572,13 @@ "plot_roc(bdt_es, training_data, training_columns)\n", "plot_roc(bdt_gs, training_data, training_columns)\n", "#plot_roc(bdt_opt, training_data, training_columns)" - ], - "execution_count": null, - "outputs": [] + ] }, { "cell_type": "code", + "execution_count": null, "metadata": {}, + "outputs": [], "source": [ "# Comparing the impact on projected performance at each stage of the tutorial\n", "plt.figure()\n", @@ -593,9 +595,31 @@ "data_gs_cut = data_df.query('XGBgs > %f' %bdt_gs_cut )\n", "plot_mass(data_gs_cut, label='XGB tuned', norm=True)\n", "plt.legend(loc='best')" - ], + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Comparing our data sample's mass plot having applied the cut optimised for $\\sigma=\\frac{S}{\\sqrt{S+B}}$ from each BDT output, we can see how the improved model reduces relative background. However, while we define our signal training sample from MC you'll remember we defined our background training sample from the data !(3.0 < JPsi_M < 3.2).\n", + "\n", + "We can see shoulders at the edges of the regions where we define our background training sample in our data's mass spectrum now. Our training and validation samples include a subset of our data sample so there's potential that our model is learning the difference between MC and data and exploiting that or demonstrating overtraining on the 'previously seen' data (remember we could see our train and test samples beginning to diverge in our validation metrics with more iterations).\n", + "\n", + "Below you can see replotting the normalised mass distribution from just the data not included in training demonstrates no significant improvement. This is not ideal and might be addressed by choosing the setup of our training more carefully. 
For example, we could train using background from same-sign muon MC across the full mass range (a common practice in LHC experiments) or use other libraries such as UGBoost to introduce a penalty in the training for introducing a dependence of efficiency on mass." + ] + }, + { + "cell_type": "code", "execution_count": null, - "outputs": [] + "metadata": {}, + "outputs": [], + "source": [ + "sig_bdt_cut = sig_df.query('XGB > %f' %bdt_cut )\n", + "plot_mass(sig_bdt_cut, label='XGB default', norm=True)\n", + "sig_gs_cut = sig_df.query('XGBgs > %f' %bdt_gs_cut )\n", + "plot_mass(sig_gs_cut, label='XGB tuned', norm=True)\n", + "plt.legend(loc='best')" + ] }, { "cell_type": "markdown", @@ -603,27 +627,33 @@ "source": [ "You can also choose a higher learning rate to perform course scans of your space and decrease it again to retrain your final model. If you can afford to, it might be best to include learning rate itself as a parameter in your grid. With some libraries you can specify your choice of kernel. Both these choices will impact your optimal maximum number of iterations, so setting it sufficiently high and using early stopping might be a good strategy.\n", "\n", - "For less exhaustive and non-discritised methods try smart combinations of the following to perform adaptive scans or build your own: \n", + "For less exhaustive and non-discretised methods try smart combinations of the following to perform adaptive scans or build your own:\n", "* sklearn.model_selection.RandomizedSearchCV\n", "* sklearn.model_selection.GridSearchCV\n", "\n", "Moving to higher dimentional optimisation problems may require more sophisticated solutions:\n", "* skopt.BayesSearchCV\n", - "* hyperopt.tpe\n\n", - "Full stats plots saved here: bit.ly/LHCb_XGB_Tuning \n", + "* hyperopt.tpe" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Full stats plots saved here: bit.ly/LHCb_XGB_Tuning\n", "Run with full stats by removing entrystop at max_events in cell 8."
] }, { "cell_type": "code", + "execution_count": null, "metadata": {}, + "outputs": [], "source": [ "# Final lesson time and processing time check\n", "print(\"Notebook real time --- %s seconds ---\" % (time.time() - stt))\n", "print(\"Notebook CPU time --- %s seconds ---\" % (time.process_time() - stc))" - ], - "execution_count": null, - "outputs": [] + ] } ], "metadata": { From d9afd6267915e1e58e441b125643a676550f3a81 Mon Sep 17 00:00:00 2001 From: James Mead Date: Tue, 5 Dec 2023 12:00:12 +0100 Subject: [PATCH 54/54] adding missing sig_df for last check --- advanced-python/33ModelTuning.ipynb | 1 + 1 file changed, 1 insertion(+) diff --git a/advanced-python/33ModelTuning.ipynb b/advanced-python/33ModelTuning.ipynb index 9eaef5a1..0ea4d465 100644 --- a/advanced-python/33ModelTuning.ipynb +++ b/advanced-python/33ModelTuning.ipynb @@ -614,6 +614,7 @@ "metadata": {}, "outputs": [], "source": [ + "sig_df = data_df.query('(3.0 < Jpsi_M < 3.2)')\n", "sig_bdt_cut = sig_df.query('XGB > %f' %bdt_cut )\n", "plot_mass(sig_bdt_cut, label='XGB default', norm=True)\n", "sig_gs_cut = sig_df.query('XGBgs > %f' %bdt_gs_cut )\n",