diff --git a/examples/boston_housing_example_Gamma.ipynb b/examples/boston_housing_example_Gamma.ipynb new file mode 100644 index 0000000..e2e9c56 --- /dev/null +++ b/examples/boston_housing_example_Gamma.ipynb @@ -0,0 +1,900 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "[![Open in Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/StatMixedML/LightGBMLSS/blob/master/examples/boston_housing_example_Gamma.ipynb)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Imports" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "ExecuteTime": { + "end_time": "2023-05-18T06:39:40.836849400Z", + "start_time": "2023-05-18T06:39:40.819009700Z" + } + }, + "outputs": [], + "source": [ + "from lightgbmlss.model import *\n", + "from lightgbmlss.distributions.Gamma import *\n", + "\n", + "from sklearn import datasets\n", + "from sklearn.datasets import make_regression\n", + "from sklearn.model_selection import train_test_split" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Data" + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "metadata": { + "ExecuteTime": { + "end_time": "2023-05-18T06:29:58.792846200Z", + "start_time": "2023-05-18T06:29:57.927953500Z" + } + }, + "outputs": [], + "source": [ + "housing_data = datasets.fetch_california_housing()\n", + "X, y = housing_data[\"data\"], housing_data[\"target\"]\n", + "feature_names = housing_data[\"feature_names\"]\n", + "\n", + "X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=123)\n", + "\n", + "dtrain = lgb.Dataset(X_train, label=y_train)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Distribution Selection" + ] + }, + { + "cell_type": "code", + "execution_count": 3, + "metadata": { + "ExecuteTime": { + "end_time": "2023-05-18T06:29:58.807805300Z", + "start_time": 
"2023-05-18T06:29:58.794840400Z" + } + }, + "outputs": [], + "source": [ + "# Specifies Gamma distribution with exp response function and option to stabilize Gradient/Hessian. Type ?Gamma for an overview.\n", + "lgblss = LightGBMLSS(\n", + " Gamma(stabilization=\"L2\", # Options are \"None\", \"MAD\", \"L2\".\n", + " response_fn=\"exp\") # Function to transform the concentration and rate parameters, e.g., \"exp\" or \"softplus\".\n", + ")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Hyper-Parameter Optimization" + ] + }, + { + "cell_type": "code", + "execution_count": 4, + "metadata": { + "ExecuteTime": { + "end_time": "2023-05-18T06:35:09.370575100Z", + "start_time": "2023-05-18T06:29:58.929480400Z" + } + }, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "\u001b[32m[I 2023-05-23 17:41:52,073]\u001b[0m A new study created in memory with name: LightGBMLSS Hyper-Parameter Optimization\u001b[0m\n", + "C:\\Users\\maerzale\\.virtualenvs\\LightGBMLSS-Dam57Fpb\\lib\\site-packages\\optuna\\progress_bar.py:56: ExperimentalWarning: Progress bar is experimental (supported from v1.2.0). The interface can change in the future.\n" + ] + }, + { + "data": { + "application/vnd.jupyter.widget-view+json": { + "model_id": "4ff99c8c7a14428bb062f25aac857776", + "version_major": 2, + "version_minor": 0 + }, + "text/plain": [ + " 0%| | 00:00/05:00" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\u001b[32m[I 2023-05-23 17:42:04,068]\u001b[0m Trial 0 finished with value: 1809.126050616011 and parameters: {'eta': 0.06556917943832136, 'max_depth': 6, 'subsample': 0.9282793681494648, 'feature_fraction': 0.9350388744635469, 'boosting': 'gbdt'}. 
Best is trial 0 with value: 1809.126050616011.\u001b[0m\n", + "\u001b[32m[I 2023-05-23 17:42:15,291]\u001b[0m Trial 1 finished with value: 3898.1266625065787 and parameters: {'eta': 0.001392277344889219, 'max_depth': 10, 'subsample': 0.26517462689142796, 'feature_fraction': 0.3760548915403128, 'boosting': 'gbdt'}. Best is trial 0 with value: 1809.126050616011.\u001b[0m\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "C:\\Users\\maerzale\\.virtualenvs\\LightGBMLSS-Dam57Fpb\\lib\\site-packages\\numpy\\core\\_methods.py:236: RuntimeWarning: invalid value encountered in subtract\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\u001b[32m[I 2023-05-23 17:42:17,880]\u001b[0m Trial 2 finished with value: 1413688.756671772 and parameters: {'eta': 0.4138998661476644, 'max_depth': 7, 'subsample': 0.27350992643803274, 'feature_fraction': 0.876987994528255, 'boosting': 'gbdt'}. Best is trial 0 with value: 1809.126050616011.\u001b[0m\n", + "\u001b[32m[I 2023-05-23 17:42:28,911]\u001b[0m Trial 3 finished with value: 4691.744909704665 and parameters: {'eta': 2.9760410905941373e-05, 'max_depth': 9, 'subsample': 0.9166666887754653, 'feature_fraction': 0.44341694358352446, 'boosting': 'gbdt'}. Best is trial 0 with value: 1809.126050616011.\u001b[0m\n", + "\u001b[32m[I 2023-05-23 17:42:40,306]\u001b[0m Trial 4 finished with value: 4179.113031775802 and parameters: {'eta': 0.0006164042978486533, 'max_depth': 10, 'subsample': 0.6444397457765347, 'feature_fraction': 0.5636495559525195, 'boosting': 'gbdt'}. Best is trial 0 with value: 1809.126050616011.\u001b[0m\n", + "\u001b[32m[I 2023-05-23 17:42:50,929]\u001b[0m Trial 5 finished with value: 2080.0764891272347 and parameters: {'eta': 0.0492308739002064, 'max_depth': 5, 'subsample': 0.6449835149718066, 'feature_fraction': 0.5607888199767759, 'boosting': 'gbdt'}. 
Best is trial 0 with value: 1809.126050616011.\u001b[0m\n", + "\u001b[32m[I 2023-05-23 17:43:02,175]\u001b[0m Trial 6 finished with value: 4470.9750815425705 and parameters: {'eta': 0.0002590272028333297, 'max_depth': 10, 'subsample': 0.3670250634122595, 'feature_fraction': 0.5588145159299327, 'boosting': 'gbdt'}. Best is trial 0 with value: 1809.126050616011.\u001b[0m\n", + "\u001b[32m[I 2023-05-23 17:43:04,960]\u001b[0m Trial 7 finished with value: 3633479.3440835355 and parameters: {'eta': 0.4618123251706421, 'max_depth': 10, 'subsample': 0.4281105144510474, 'feature_fraction': 0.9778666983947115, 'boosting': 'gbdt'}. Best is trial 0 with value: 1809.126050616011.\u001b[0m\n", + "\u001b[32m[I 2023-05-23 17:43:16,989]\u001b[0m Trial 8 finished with value: 3790.5134756520843 and parameters: {'eta': 0.0012580784268804222, 'max_depth': 7, 'subsample': 0.4660636346126495, 'feature_fraction': 0.7412377596303807, 'boosting': 'gbdt'}. Best is trial 0 with value: 1809.126050616011.\u001b[0m\n", + "\u001b[32m[I 2023-05-23 17:43:28,326]\u001b[0m Trial 9 finished with value: 4656.9590130887955 and parameters: {'eta': 5.173638291710696e-05, 'max_depth': 10, 'subsample': 0.6995675047024632, 'feature_fraction': 0.8290437292047752, 'boosting': 'gbdt'}. Best is trial 0 with value: 1809.126050616011.\u001b[0m\n", + "\u001b[32m[I 2023-05-23 17:43:38,027]\u001b[0m Trial 10 finished with value: 3288.5595406302964 and parameters: {'eta': 0.01588129042634339, 'max_depth': 2, 'subsample': 0.9991564523736327, 'feature_fraction': 0.280385719936899, 'boosting': 'gbdt'}. Best is trial 0 with value: 1809.126050616011.\u001b[0m\n", + "\u001b[32m[I 2023-05-23 17:43:48,397]\u001b[0m Trial 11 finished with value: 2489.7106354811744 and parameters: {'eta': 0.029675302954054603, 'max_depth': 4, 'subsample': 0.8146610333350044, 'feature_fraction': 0.6880668421228011, 'boosting': 'gbdt'}. 
Best is trial 0 with value: 1809.126050616011.\u001b[0m\n", + "\u001b[32m[I 2023-05-23 17:43:59,145]\u001b[0m Trial 12 finished with value: 2312.750667896554 and parameters: {'eta': 0.032955414558760374, 'max_depth': 5, 'subsample': 0.7996124096152732, 'feature_fraction': 0.6495209266473924, 'boosting': 'gbdt'}. Best is trial 0 with value: 1809.126050616011.\u001b[0m\n", + "\u001b[32m[I 2023-05-23 17:44:09,543]\u001b[0m Trial 13 finished with value: 3281.120540190617 and parameters: {'eta': 0.007140192352699519, 'max_depth': 3, 'subsample': 0.5410244693918539, 'feature_fraction': 0.9845712683874053, 'boosting': 'gbdt'}. Best is trial 0 with value: 1809.126050616011.\u001b[0m\n", + "\u001b[32m[I 2023-05-23 17:44:21,016]\u001b[0m Trial 14 finished with value: 1857.554178608031 and parameters: {'eta': 0.08423570939349966, 'max_depth': 6, 'subsample': 0.5858884452570676, 'feature_fraction': 0.7834573830641842, 'boosting': 'gbdt'}. Best is trial 0 with value: 1809.126050616011.\u001b[0m\n", + "\u001b[32m[I 2023-05-23 17:44:31,363]\u001b[0m Trial 15 finished with value: 1972.4092387977012 and parameters: {'eta': 0.09583856197356405, 'max_depth': 7, 'subsample': 0.5484196920156863, 'feature_fraction': 0.8409611215301299, 'boosting': 'gbdt'}. Best is trial 0 with value: 1809.126050616011.\u001b[0m\n", + "\u001b[32m[I 2023-05-23 17:44:38,204]\u001b[0m Trial 16 finished with value: 2391.044865610313 and parameters: {'eta': 0.16269987711661196, 'max_depth': 6, 'subsample': 0.7443945903215214, 'feature_fraction': 0.7585331713379588, 'boosting': 'gbdt'}. Best is trial 0 with value: 1809.126050616011.\u001b[0m\n", + "\u001b[32m[I 2023-05-23 17:44:40,616]\u001b[0m Trial 17 pruned. 
Trial was pruned at iteration 20.\u001b[0m\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "C:\\Users\\maerzale\\.virtualenvs\\LightGBMLSS-Dam57Fpb\\lib\\site-packages\\lightgbm\\basic.py:156: RuntimeWarning: overflow encountered in cast\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\u001b[32m[I 2023-05-23 17:44:43,200]\u001b[0m Trial 18 pruned. Trial was pruned at iteration 20.\u001b[0m\n", + "\u001b[32m[I 2023-05-23 17:44:46,025]\u001b[0m Trial 19 finished with value: 3731.6333520039675 and parameters: {'eta': 0.14900613959292786, 'max_depth': 4, 'subsample': 0.5628509386138413, 'feature_fraction': 0.9048341062849129, 'boosting': 'gbdt'}. Best is trial 0 with value: 1809.126050616011.\u001b[0m\n", + "\u001b[32m[I 2023-05-23 17:44:57,248]\u001b[0m Trial 20 finished with value: 3185.2802973283956 and parameters: {'eta': 0.005143454873520682, 'max_depth': 6, 'subsample': 0.9816723480386734, 'feature_fraction': 0.7609432112746509, 'boosting': 'gbdt'}. Best is trial 0 with value: 1809.126050616011.\u001b[0m\n", + "\u001b[32m[I 2023-05-23 17:45:08,818]\u001b[0m Trial 21 finished with value: 1944.1343126676252 and parameters: {'eta': 0.08727883996553833, 'max_depth': 7, 'subsample': 0.5858415444410444, 'feature_fraction': 0.8575950001856819, 'boosting': 'gbdt'}. Best is trial 0 with value: 1809.126050616011.\u001b[0m\n", + "\u001b[32m[I 2023-05-23 17:45:19,957]\u001b[0m Trial 22 finished with value: 1871.6793143196296 and parameters: {'eta': 0.07736892631653892, 'max_depth': 8, 'subsample': 0.6265956514093546, 'feature_fraction': 0.9989466106026821, 'boosting': 'gbdt'}. Best is trial 0 with value: 1809.126050616011.\u001b[0m\n", + "\u001b[32m[I 2023-05-23 17:45:31,130]\u001b[0m Trial 23 finished with value: 2660.7288909591007 and parameters: {'eta': 0.017089066432843497, 'max_depth': 8, 'subsample': 0.8914723238295523, 'feature_fraction': 0.9727399176682539, 'boosting': 'gbdt'}. 
Best is trial 0 with value: 1809.126050616011.\u001b[0m\n", + "\u001b[32m[I 2023-05-23 17:45:34,882]\u001b[0m Trial 24 finished with value: 3064.4199208136815 and parameters: {'eta': 0.18932918601785867, 'max_depth': 8, 'subsample': 0.6644210801121371, 'feature_fraction': 0.9168564500051085, 'boosting': 'gbdt'}. Best is trial 0 with value: 1809.126050616011.\u001b[0m\n", + "\u001b[32m[I 2023-05-23 17:45:46,188]\u001b[0m Trial 25 finished with value: 1838.7094522308296 and parameters: {'eta': 0.07588195943575674, 'max_depth': 6, 'subsample': 0.7572885690962375, 'feature_fraction': 0.9290954445559388, 'boosting': 'gbdt'}. Best is trial 0 with value: 1809.126050616011.\u001b[0m\n", + "\u001b[32m[I 2023-05-23 17:45:56,728]\u001b[0m Trial 26 finished with value: 2523.3039650554656 and parameters: {'eta': 0.02827901963428478, 'max_depth': 4, 'subsample': 0.7551341347301008, 'feature_fraction': 0.9158896503415246, 'boosting': 'gbdt'}. Best is trial 0 with value: 1809.126050616011.\u001b[0m\n", + "\u001b[32m[I 2023-05-23 17:45:59,391]\u001b[0m Trial 27 pruned. Trial was pruned at iteration 21.\u001b[0m\n", + "\u001b[32m[I 2023-05-23 17:46:10,348]\u001b[0m Trial 28 finished with value: 1936.1599988149326 and parameters: {'eta': 0.04919324334617989, 'max_depth': 6, 'subsample': 0.8556720533864639, 'feature_fraction': 0.8699611100580005, 'boosting': 'gbdt'}. Best is trial 0 with value: 1809.126050616011.\u001b[0m\n", + "\u001b[32m[I 2023-05-23 17:46:13,138]\u001b[0m Trial 29 pruned. Trial was pruned at iteration 20.\u001b[0m\n", + "\u001b[32m[I 2023-05-23 17:46:16,124]\u001b[0m Trial 30 pruned. Trial was pruned at iteration 21.\u001b[0m\n", + "\u001b[32m[I 2023-05-23 17:46:27,894]\u001b[0m Trial 31 finished with value: 1819.1402847753507 and parameters: {'eta': 0.06576306417347916, 'max_depth': 8, 'subsample': 0.7598158712683076, 'feature_fraction': 0.9954548382878625, 'boosting': 'gbdt'}. 
Best is trial 0 with value: 1809.126050616011.\u001b[0m\n", + "\u001b[32m[I 2023-05-23 17:46:39,407]\u001b[0m Trial 32 finished with value: 1931.2706679219186 and parameters: {'eta': 0.08054551651707077, 'max_depth': 9, 'subsample': 0.7555606626872196, 'feature_fraction': 0.9389622208734625, 'boosting': 'gbdt'}. Best is trial 0 with value: 1809.126050616011.\u001b[0m\n", + "\u001b[32m[I 2023-05-23 17:46:42,005]\u001b[0m Trial 33 pruned. Trial was pruned at iteration 20.\u001b[0m\n", + "\u001b[32m[I 2023-05-23 17:46:44,578]\u001b[0m Trial 34 pruned. Trial was pruned at iteration 20.\u001b[0m\n", + "\u001b[32m[I 2023-05-23 17:46:46,919]\u001b[0m Trial 35 pruned. Trial was pruned at iteration 20.\u001b[0m\n", + "\u001b[32m[I 2023-05-23 17:46:58,674]\u001b[0m Trial 36 finished with value: 1954.1143737806547 and parameters: {'eta': 0.04905209923587467, 'max_depth': 6, 'subsample': 0.681476825832805, 'feature_fraction': 0.9597957342841579, 'boosting': 'gbdt'}. Best is trial 0 with value: 1809.126050616011.\u001b[0m\n", + "\n", + "Hyper-Parameter Optimization successfully finished.\n", + " Number of finished trials: 37\n", + " Best trial:\n", + " Value: 1809.126050616011\n", + " Params: \n", + " eta: 0.06556917943832136\n", + " max_depth: 6\n", + " subsample: 0.9282793681494648\n", + " feature_fraction: 0.9350388744635469\n", + " boosting: gbdt\n", + " opt_rounds: 100\n" + ] + } + ], + "source": [ + "# Any LightGBM hyperparameter can be tuned, where the structure of the parameter dictionary needs to be as follows:\n", + "\n", + " # Float/Int sample_type\n", + " # {\"param_name\": [\"sample_type\", {\"low\": low, \"high\": high, \"log\": log}]}\n", + " # sample_type: str, Type of sampling, e.g., \"float\" or \"int\"\n", + " # low: int, Lower endpoint of the range of suggested values\n", + " # high: int, Upper endpoint of the range of suggested values\n", + " # log: bool, Flag to sample the value from the log domain or not\n", + " # Example: {\"eta\": [\"float\", {\"low\": 1e-5, \"high\": 1, \"log\": True}]}\n", + "\n", 
+ " # Categorical sample_type\n", + " # {\"param_name\": [\"sample_type\", [\"choice1\", \"choice2\", \"choice3\", \"...\"]]}\n", + " # sample_type: str, Type of sampling, i.e., \"categorical\"\n", + " # choice1, choice2, choice3, ...: str, Possible choices for the parameter\n", + " # Example: {\"boosting\": [\"categorical\", [\"gbdt\", \"dart\"]]}\n", + "\n", + " # For parameters without tunable choice (this is needed, e.g., if device_type = \"gpu\" and gpu_device_id needs to be specified)\n", + " # {\"param_name\": [\"none\", [value]]}\n", + " # param_name: str, Name of the parameter\n", + " # value: int, Value of the parameter\n", + " # Example: {\"gpu_device_id\": [\"none\", [0]]}\n", + "\n", + "param_dict = {\n", + " \"eta\": [\"float\", {\"low\": 1e-5, \"high\": 1, \"log\": True}],\n", + " \"max_depth\": [\"int\", {\"low\": 1, \"high\": 10, \"log\": False}],\n", + " \"subsample\": [\"float\", {\"low\": 0.2, \"high\": 1.0, \"log\": False}],\n", + " \"feature_fraction\": [\"float\", {\"low\": 0.2, \"high\": 1.0, \"log\": False}],\n", + " \"boosting\": [\"categorical\", [\"gbdt\"]],\n", + "}\n", + "\n", + "np.random.seed(123)\n", + "opt_param = lgblss.hyper_opt(param_dict,\n", + " dtrain,\n", + " num_boost_round=100, # Number of boosting iterations.\n", + " nfold=5, # Number of cv-folds.\n", + " early_stopping_rounds=20, # Number of early-stopping rounds\n", + " max_minutes=5, # Time budget in minutes, i.e., stop study after the given number of minutes.\n", + " n_trials=None, # The number of trials. 
If this argument is set to None, there is no limitation on the number of trials.\n", + " silence=False, # Controls the verbosity of the trial, i.e., the user can silence the outputs of the trial.\n", + " seed=123, # Seed used to generate cv-folds.\n", + " hp_seed=None # Seed for random number generator used in the Bayesian hyperparameter search.\n", + " )" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Model Training" + ] + }, + { + "cell_type": "code", + "execution_count": 5, + "metadata": { + "ExecuteTime": { + "end_time": "2023-05-18T06:35:12.260385100Z", + "start_time": "2023-05-18T06:35:09.372570200Z" + } + }, + "outputs": [ + { + "data": { + "text/plain": [ + "" + ] + }, + "execution_count": 5, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "np.random.seed(123)\n", + "\n", + "opt_params = opt_param.copy()\n", + "n_rounds = opt_params[\"opt_rounds\"]\n", + "del opt_params[\"opt_rounds\"]\n", + "\n", + "# Train Model with optimized hyperparameters\n", + "lgblss.train(opt_params,\n", + " dtrain,\n", + " num_boost_round=n_rounds\n", + " )" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Prediction" + ] + }, + { + "cell_type": "code", + "execution_count": 7, + "metadata": { + "ExecuteTime": { + "end_time": "2023-05-18T06:35:13.557706600Z", + "start_time": "2023-05-18T06:35:12.260385100Z" + } + }, + "outputs": [], + "source": [ + "# Set seed for reproducibility\n", + "torch.manual_seed(123)\n", + "\n", + "# Number of samples to draw from predicted distribution\n", + "n_samples = 1000\n", + "quant_sel = [0.05, 0.95] # Quantiles to calculate from predicted distribution\n", + "\n", + "# Sample from predicted distribution\n", + "pred_samples = lgblss.predict(X_test,\n", + " pred_type=\"samples\",\n", + " n_samples=n_samples,\n", + " seed=123)\n", + "\n", + "# Calculate quantiles from predicted distribution\n", + "pred_quantiles = lgblss.predict(X_test,\n", + " 
pred_type=\"quantiles\",\n", + " n_samples=n_samples,\n", + " quantiles=quant_sel)\n", + "\n", + "# Returns predicted distributional parameters\n", + "pred_params = lgblss.predict(X_test,\n", + " pred_type=\"parameters\")" + ] + }, + { + "cell_type": "code", + "execution_count": 8, + "metadata": { + "ExecuteTime": { + "end_time": "2023-05-18T06:35:13.588949200Z", + "start_time": "2023-05-18T06:35:13.557706600Z" + } + }, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
y_sample0y_sample1y_sample2y_sample3y_sample4y_sample5y_sample6y_sample7y_sample8y_sample9...y_sample990y_sample991y_sample992y_sample993y_sample994y_sample995y_sample996y_sample997y_sample998y_sample999
01.9570732.2108591.5398461.4739162.2380031.7815313.4252672.6060451.4540242.231545...2.2324581.5125931.6533292.4203651.8104312.1606112.7193261.7523911.5436821.977375
11.0070660.3457600.4134931.6228041.3247741.0631451.0165610.9406671.1739470.915472...1.0894990.7469150.8187130.8077951.3759440.8914391.2946450.5786771.1959481.057028
21.1305792.9132652.2563350.9310582.7104551.4589051.4499661.7322941.3342722.258079...1.5750801.4513962.4531511.4014441.1615542.0937321.7976831.5228101.2751101.698852
31.8787123.1546593.4920281.1760491.4670541.9700851.6123032.2796912.4353511.359894...0.8342301.8343182.3401711.9506442.5142842.1786652.5147531.6471261.7309292.753849
44.2092533.4164942.3774531.9222594.5828973.4096284.8221143.7512004.5392764.927194...2.0332184.4385955.2119552.4631563.4158713.8466483.0915272.7885974.2780623.400972
\n", + "

5 rows × 1000 columns

\n", + "
" + ], + "text/plain": [ + " y_sample0 y_sample1 y_sample2 y_sample3 y_sample4 y_sample5 \n", + "0 1.957073 2.210859 1.539846 1.473916 2.238003 1.781531 \\\n", + "1 1.007066 0.345760 0.413493 1.622804 1.324774 1.063145 \n", + "2 1.130579 2.913265 2.256335 0.931058 2.710455 1.458905 \n", + "3 1.878712 3.154659 3.492028 1.176049 1.467054 1.970085 \n", + "4 4.209253 3.416494 2.377453 1.922259 4.582897 3.409628 \n", + "\n", + " y_sample6 y_sample7 y_sample8 y_sample9 ... y_sample990 y_sample991 \n", + "0 3.425267 2.606045 1.454024 2.231545 ... 2.232458 1.512593 \\\n", + "1 1.016561 0.940667 1.173947 0.915472 ... 1.089499 0.746915 \n", + "2 1.449966 1.732294 1.334272 2.258079 ... 1.575080 1.451396 \n", + "3 1.612303 2.279691 2.435351 1.359894 ... 0.834230 1.834318 \n", + "4 4.822114 3.751200 4.539276 4.927194 ... 2.033218 4.438595 \n", + "\n", + " y_sample992 y_sample993 y_sample994 y_sample995 y_sample996 \n", + "0 1.653329 2.420365 1.810431 2.160611 2.719326 \\\n", + "1 0.818713 0.807795 1.375944 0.891439 1.294645 \n", + "2 2.453151 1.401444 1.161554 2.093732 1.797683 \n", + "3 2.340171 1.950644 2.514284 2.178665 2.514753 \n", + "4 5.211955 2.463156 3.415871 3.846648 3.091527 \n", + "\n", + " y_sample997 y_sample998 y_sample999 \n", + "0 1.752391 1.543682 1.977375 \n", + "1 0.578677 1.195948 1.057028 \n", + "2 1.522810 1.275110 1.698852 \n", + "3 1.647126 1.730929 2.753849 \n", + "4 2.788597 4.278062 3.400972 \n", + "\n", + "[5 rows x 1000 columns]" + ] + }, + "execution_count": 8, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "pred_samples.head()" + ] + }, + { + "cell_type": "code", + "execution_count": 9, + "metadata": { + "ExecuteTime": { + "end_time": "2023-05-18T06:35:13.604570800Z", + "start_time": "2023-05-18T06:35:13.588949200Z" + } + }, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
quant_0.05quant_0.95
01.2730173.055063
10.5455821.557167
20.9811692.519820
31.0726902.678862
42.3588466.036986
\n", + "
" + ], + "text/plain": [ + " quant_0.05 quant_0.95\n", + "0 1.273017 3.055063\n", + "1 0.545582 1.557167\n", + "2 0.981169 2.519820\n", + "3 1.072690 2.678862\n", + "4 2.358846 6.036986" + ] + }, + "execution_count": 9, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "pred_quantiles.head()" + ] + }, + { + "cell_type": "code", + "execution_count": 10, + "metadata": { + "ExecuteTime": { + "end_time": "2023-05-18T06:35:13.651435200Z", + "start_time": "2023-05-18T06:35:13.604570800Z" + } + }, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
concentrationrate
014.4612777.006943
110.97832910.965075
212.7509287.659188
313.2790867.299364
414.2400233.599187
\n", + "
" + ], + "text/plain": [ + " concentration rate\n", + "0 14.461277 7.006943\n", + "1 10.978329 10.965075\n", + "2 12.750928 7.659188\n", + "3 13.279086 7.299364\n", + "4 14.240023 3.599187" + ] + }, + "execution_count": 10, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "pred_params.head()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# SHAP Interpretability" + ] + }, + { + "cell_type": "code", + "execution_count": 11, + "metadata": { + "ExecuteTime": { + "end_time": "2023-05-18T06:35:15.172419700Z", + "start_time": "2023-05-18T06:35:13.620191500Z" + } + }, + "outputs": [ + { + "data": { + "text/html": [ + "
" + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "image/png": "", + "text/plain": [ + "
" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "# Partial Dependence Plot\n", + "pdp_df = pd.DataFrame(X_train, columns=feature_names)\n", + "lgblss.plot(pdp_df,\n", + " parameter=\"concentration\",\n", + " feature=feature_names[0],\n", + " plot_type=\"Partial_Dependence\")" + ] + }, + { + "cell_type": "code", + "execution_count": 12, + "metadata": { + "ExecuteTime": { + "end_time": "2023-05-18T06:35:16.086656500Z", + "start_time": "2023-05-18T06:35:15.174414600Z" + }, + "collapsed": false, + "jupyter": { + "outputs_hidden": false + } + }, + "outputs": [ + { + "data": { + "text/html": [ + "
" + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "image/png": "", + "text/plain": [ + "
" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "# Feature Importance\n", + "lgblss.plot(pdp_df,\n", + " parameter=\"concentration\",\n", + " plot_type=\"Feature_Importance\")" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3 (ipykernel)", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.9.0" + } + }, + "nbformat": 4, + "nbformat_minor": 4 +} diff --git a/examples/simulation_example_NegativeBinomial.ipynb b/examples/simulation_example_NegativeBinomial.ipynb new file mode 100644 index 0000000..969997d --- /dev/null +++ b/examples/simulation_example_NegativeBinomial.ipynb @@ -0,0 +1,766 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "[![Open in Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/StatMixedML/LightGBMLSS/blob/master/examples/simulation_example_NegativeBinomial.ipynb)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Imports" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "ExecuteTime": { + "end_time": "2023-05-18T06:57:02.291276100Z", + "start_time": "2023-05-18T06:57:02.177579400Z" + } + }, + "outputs": [], + "source": [ + "from lightgbmlss.model import *\n", + "from lightgbmlss.distributions.NegativeBinomial import *\n", + "\n", + "from sklearn.datasets import make_regression\n", + "from sklearn.model_selection import train_test_split" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Data" + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "metadata": { + "ExecuteTime": { + "end_time": "2023-05-18T06:51:53.913176400Z", + "start_time": 
"2023-05-18T06:51:53.050199200Z" + } + }, + "outputs": [], + "source": [ + "def custom_transform(y, constr_val):\n", + " # Apply a custom transformation to restrict y between 0 and constr_val\n", + " transformed_y = np.abs(y) # Example transformation: absolute value\n", + " constrained_y = constr_val * transformed_y / np.max(transformed_y) # Scale to desired range\n", + " int_y = constrained_y.astype(int)\n", + " return int_y\n", + "\n", + "# Generate a custom dataset\n", + "X, y = make_regression(n_samples=5000, n_features=10, n_informative=2, random_state=123)\n", + "\n", + "# Apply the custom transformation\n", + "y = custom_transform(y, 50)\n", + "\n", + "# Split into train and test\n", + "X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=123)\n", + "\n", + "dtrain = lgb.Dataset(X_train, label=y_train)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Distribution Selection" + ] + }, + { + "cell_type": "code", + "execution_count": 3, + "metadata": { + "ExecuteTime": { + "end_time": "2023-05-18T06:51:53.928136100Z", + "start_time": "2023-05-18T06:51:53.911318300Z" + } + }, + "outputs": [], + "source": [ + "# Specifies NegativeBinomial distribution with corresponding response functions and option to stabilize Gradient/Hessian. 
See ?NegativeBinomial for more information.\n", + "lgblss = LightGBMLSS(\n", + " NegativeBinomial(stabilization=\"None\", # Options are \"None\", \"MAD\", \"L2\".\n", + " response_fn_total_count=\"relu\", # Function to transform the total_count-parameter, e.g., \"exp\", \"softplus\" or \"relu\".\n", + " response_fn_probs=\"sigmoid\") # Function to transform the probs-parameter, e.g., \"sigmoid\"\n", + ")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Hyper-Parameter Optimization" + ] + }, + { + "cell_type": "code", + "execution_count": 4, + "metadata": { + "ExecuteTime": { + "end_time": "2023-05-18T06:56:59.310252600Z", + "start_time": "2023-05-18T06:51:54.046821100Z" + } + }, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "\u001b[32m[I 2023-05-23 17:29:42,931]\u001b[0m A new study created in memory with name: LightGBMLSS Hyper-Parameter Optimization\u001b[0m\n", + "C:\\Users\\maerzale\\.virtualenvs\\LightGBMLSS-Dam57Fpb\\lib\\site-packages\\optuna\\progress_bar.py:56: ExperimentalWarning: Progress bar is experimental (supported from v1.2.0). The interface can change in the future.\n" + ] + }, + { + "data": { + "application/vnd.jupyter.widget-view+json": { + "model_id": "9ad4aada40064079a36e30d27877f6d9", + "version_major": 2, + "version_minor": 0 + }, + "text/plain": [ + " 0%| | 00:00/05:00" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\u001b[32m[I 2023-05-23 17:29:45,843]\u001b[0m Trial 0 finished with value: 2362.6454823420363 and parameters: {'eta': 0.2237494745725795, 'max_depth': 8, 'num_leaves': 255, 'min_data_in_leaf': 20, 'min_gain_to_split': 21.718229204152497, 'min_sum_hessian_in_leaf': 0.41610733461582017, 'subsample': 0.5489136781162185, 'feature_fraction': 0.6795070894088597, 'boosting': 'gbdt'}. 
Best is trial 0 with value: 2362.6454823420363.\u001b[0m\n", + "\u001b[32m[I 2023-05-23 17:29:51,154]\u001b[0m Trial 1 finished with value: 2391.3637035325364 and parameters: {'eta': 0.02993587048561261, 'max_depth': 10, 'num_leaves': 255, 'min_data_in_leaf': 20, 'min_gain_to_split': 8.375534018622261, 'min_sum_hessian_in_leaf': 0.0007753281443770188, 'subsample': 0.7819790850797737, 'feature_fraction': 0.2656157036043193, 'boosting': 'gbdt'}. Best is trial 0 with value: 2362.6454823420363.\u001b[0m\n", + "\u001b[32m[I 2023-05-23 17:29:57,721]\u001b[0m Trial 2 finished with value: 2623.1299437982134 and parameters: {'eta': 0.0003193057644574507, 'max_depth': 5, 'num_leaves': 255, 'min_data_in_leaf': 20, 'min_gain_to_split': 23.97626450189067, 'min_sum_hessian_in_leaf': 3.2933540258954417, 'subsample': 0.3670045093424384, 'feature_fraction': 0.8373817369882068, 'boosting': 'gbdt'}. Best is trial 0 with value: 2362.6454823420363.\u001b[0m\n", + "\u001b[32m[I 2023-05-23 17:30:05,054]\u001b[0m Trial 3 finished with value: 2627.8631382661983 and parameters: {'eta': 0.0003943692191718305, 'max_depth': 4, 'num_leaves': 255, 'min_data_in_leaf': 20, 'min_gain_to_split': 8.339211480675464, 'min_sum_hessian_in_leaf': 0.006141165700569994, 'subsample': 0.5706046622783398, 'feature_fraction': 0.6163495990468459, 'boosting': 'gbdt'}. Best is trial 0 with value: 2362.6454823420363.\u001b[0m\n", + "\u001b[32m[I 2023-05-23 17:30:12,699]\u001b[0m Trial 4 finished with value: 2565.5259588715003 and parameters: {'eta': 0.0014262691550398058, 'max_depth': 5, 'num_leaves': 255, 'min_data_in_leaf': 20, 'min_gain_to_split': 7.219831849234432, 'min_sum_hessian_in_leaf': 1.5936701835149663, 'subsample': 0.5354838253750538, 'feature_fraction': 0.788024946379748, 'boosting': 'gbdt'}. 
Best is trial 0 with value: 2362.6454823420363.\u001b[0m\n", + "\u001b[32m[I 2023-05-23 17:30:19,514]\u001b[0m Trial 5 finished with value: 2631.346853451946 and parameters: {'eta': 0.0022084824843702066, 'max_depth': 7, 'num_leaves': 255, 'min_data_in_leaf': 20, 'min_gain_to_split': 10.86638765193509, 'min_sum_hessian_in_leaf': 103.92774184129252, 'subsample': 0.3622280593782978, 'feature_fraction': 0.2389145946470812, 'boosting': 'gbdt'}. Best is trial 0 with value: 2362.6454823420363.\u001b[0m\n", + "\u001b[32m[I 2023-05-23 17:30:26,608]\u001b[0m Trial 6 finished with value: 2612.9684412363945 and parameters: {'eta': 0.0008209854898713998, 'max_depth': 3, 'num_leaves': 255, 'min_data_in_leaf': 20, 'min_gain_to_split': 18.673462014081853, 'min_sum_hessian_in_leaf': 0.0010218570238958342, 'subsample': 0.41686832678116215, 'feature_fraction': 0.8114911457397536, 'boosting': 'gbdt'}. Best is trial 0 with value: 2362.6454823420363.\u001b[0m\n", + "\u001b[32m[I 2023-05-23 17:30:33,724]\u001b[0m Trial 7 finished with value: 2532.562604268222 and parameters: {'eta': 0.003669296449292923, 'max_depth': 8, 'num_leaves': 255, 'min_data_in_leaf': 20, 'min_gain_to_split': 23.684189735941473, 'min_sum_hessian_in_leaf': 2.7800236286073807, 'subsample': 0.6318847745762357, 'feature_fraction': 0.4719848016495097, 'boosting': 'gbdt'}. Best is trial 0 with value: 2362.6454823420363.\u001b[0m\n", + "\u001b[32m[I 2023-05-23 17:30:41,128]\u001b[0m Trial 8 finished with value: 2595.6401025229907 and parameters: {'eta': 0.0007935716589829182, 'max_depth': 10, 'num_leaves': 255, 'min_data_in_leaf': 20, 'min_gain_to_split': 14.44064799437206, 'min_sum_hessian_in_leaf': 6.703126821891454e-06, 'subsample': 0.5797709222455096, 'feature_fraction': 0.6389164300692631, 'boosting': 'gbdt'}. 
Best is trial 0 with value: 2362.6454823420363.\u001b[0m\n", + "\u001b[32m[I 2023-05-23 17:30:48,262]\u001b[0m Trial 9 finished with value: 2640.7847464386095 and parameters: {'eta': 1.1344436912426338e-05, 'max_depth': 7, 'num_leaves': 255, 'min_data_in_leaf': 20, 'min_gain_to_split': 10.222627040352023, 'min_sum_hessian_in_leaf': 3.9161873129620988, 'subsample': 0.8997126328434504, 'feature_fraction': 0.3669305529286317, 'boosting': 'gbdt'}. Best is trial 0 with value: 2362.6454823420363.\u001b[0m\n", + "\u001b[32m[I 2023-05-23 17:30:50,315]\u001b[0m Trial 10 pruned. Trial was pruned at iteration 20.\u001b[0m\n", + "\u001b[32m[I 2023-05-23 17:30:57,316]\u001b[0m Trial 11 finished with value: 2485.0299401077405 and parameters: {'eta': 0.14061339325391398, 'max_depth': 10, 'num_leaves': 255, 'min_data_in_leaf': 20, 'min_gain_to_split': 3.3176056564845595, 'min_sum_hessian_in_leaf': 0.006341526111724655, 'subsample': 0.7866760261119364, 'feature_fraction': 0.2134887893636273, 'boosting': 'gbdt'}. Best is trial 0 with value: 2362.6454823420363.\u001b[0m\n", + "\u001b[32m[I 2023-05-23 17:31:04,267]\u001b[0m Trial 12 finished with value: 2510.2544355205596 and parameters: {'eta': 0.05675290675580147, 'max_depth': 1, 'num_leaves': 255, 'min_data_in_leaf': 20, 'min_gain_to_split': 2.003583949003545, 'min_sum_hessian_in_leaf': 0.00016908985604525037, 'subsample': 0.7816141992035248, 'feature_fraction': 0.46873459066130746, 'boosting': 'gbdt'}. Best is trial 0 with value: 2362.6454823420363.\u001b[0m\n", + "\u001b[32m[I 2023-05-23 17:31:11,235]\u001b[0m Trial 13 finished with value: 2434.691068953821 and parameters: {'eta': 0.035842318641714506, 'max_depth': 9, 'num_leaves': 255, 'min_data_in_leaf': 20, 'min_gain_to_split': 15.947278462239742, 'min_sum_hessian_in_leaf': 0.07586714684559995, 'subsample': 0.7081152982246103, 'feature_fraction': 0.3586290776759057, 'boosting': 'gbdt'}. 
Best is trial 0 with value: 2362.6454823420363.\u001b[0m\n", + "\u001b[32m[I 2023-05-23 17:31:13,354]\u001b[0m Trial 14 finished with value: 2405.968047461268 and parameters: {'eta': 0.8255422981199733, 'max_depth': 9, 'num_leaves': 255, 'min_data_in_leaf': 20, 'min_gain_to_split': 39.50047511005906, 'min_sum_hessian_in_leaf': 3.2204109714104947e-05, 'subsample': 0.9694361903452793, 'feature_fraction': 0.5310595744689661, 'boosting': 'gbdt'}. Best is trial 0 with value: 2362.6454823420363.\u001b[0m\n", + "\u001b[32m[I 2023-05-23 17:31:20,360]\u001b[0m Trial 15 finished with value: 2379.9450329268884 and parameters: {'eta': 0.013851953984282158, 'max_depth': 7, 'num_leaves': 255, 'min_data_in_leaf': 20, 'min_gain_to_split': 22.714437031568835, 'min_sum_hessian_in_leaf': 0.052384031718501985, 'subsample': 0.6944191500532103, 'feature_fraction': 0.6724673155456391, 'boosting': 'gbdt'}. Best is trial 0 with value: 2362.6454823420363.\u001b[0m\n", + "\u001b[32m[I 2023-05-23 17:31:27,508]\u001b[0m Trial 16 finished with value: 2393.1921848071834 and parameters: {'eta': 0.010281869632474622, 'max_depth': 7, 'num_leaves': 255, 'min_data_in_leaf': 20, 'min_gain_to_split': 23.351756834693404, 'min_sum_hessian_in_leaf': 0.10151892027266446, 'subsample': 0.4897500323768813, 'feature_fraction': 0.6724458605458586, 'boosting': 'gbdt'}. Best is trial 0 with value: 2362.6454823420363.\u001b[0m\n", + "\u001b[32m[I 2023-05-23 17:31:29,574]\u001b[0m Trial 17 finished with value: 2480.3819878566637 and parameters: {'eta': 0.2805850135216093, 'max_depth': 6, 'num_leaves': 255, 'min_data_in_leaf': 20, 'min_gain_to_split': 28.483058842088965, 'min_sum_hessian_in_leaf': 308.68527211100513, 'subsample': 0.6592915334148823, 'feature_fraction': 0.7329451535145914, 'boosting': 'gbdt'}. 
Best is trial 0 with value: 2362.6454823420363.\u001b[0m\n", + "\u001b[32m[I 2023-05-23 17:31:32,549]\u001b[0m Trial 18 finished with value: 2377.9336991893247 and parameters: {'eta': 0.11387471034399045, 'max_depth': 6, 'num_leaves': 255, 'min_data_in_leaf': 20, 'min_gain_to_split': 20.139582479113876, 'min_sum_hessian_in_leaf': 0.06652271915542315, 'subsample': 0.46811038072509037, 'feature_fraction': 0.5628397906517216, 'boosting': 'gbdt'}. Best is trial 0 with value: 2362.6454823420363.\u001b[0m\n", + "\u001b[32m[I 2023-05-23 17:31:35,514]\u001b[0m Trial 19 finished with value: 2388.9855898958435 and parameters: {'eta': 0.16981432110206116, 'max_depth': 3, 'num_leaves': 255, 'min_data_in_leaf': 20, 'min_gain_to_split': 18.136651877041942, 'min_sum_hessian_in_leaf': 42.072023397491115, 'subsample': 0.5025399122554826, 'feature_fraction': 0.5766245570763258, 'boosting': 'gbdt'}. Best is trial 0 with value: 2362.6454823420363.\u001b[0m\n", + "\u001b[32m[I 2023-05-23 17:31:37,648]\u001b[0m Trial 20 finished with value: 2363.732245690417 and parameters: {'eta': 0.4372857275137455, 'max_depth': 6, 'num_leaves': 255, 'min_data_in_leaf': 20, 'min_gain_to_split': 14.073826823263609, 'min_sum_hessian_in_leaf': 0.26501464839624383, 'subsample': 0.44396373816325235, 'feature_fraction': 0.5522689203937352, 'boosting': 'gbdt'}. Best is trial 0 with value: 2362.6454823420363.\u001b[0m\n", + "\u001b[32m[I 2023-05-23 17:31:39,863]\u001b[0m Trial 21 finished with value: 2362.524736051979 and parameters: {'eta': 0.34556990800519055, 'max_depth': 6, 'num_leaves': 255, 'min_data_in_leaf': 20, 'min_gain_to_split': 14.255512468522362, 'min_sum_hessian_in_leaf': 0.693320417184206, 'subsample': 0.45856093769809725, 'feature_fraction': 0.5465936335854907, 'boosting': 'gbdt'}. 
Best is trial 21 with value: 2362.524736051979.\u001b[0m\n", + "\u001b[32m[I 2023-05-23 17:31:42,086]\u001b[0m Trial 22 finished with value: 2422.7315118540146 and parameters: {'eta': 0.36138700004612195, 'max_depth': 4, 'num_leaves': 255, 'min_data_in_leaf': 20, 'min_gain_to_split': 13.575809323111534, 'min_sum_hessian_in_leaf': 0.492364172045369, 'subsample': 0.424257577580033, 'feature_fraction': 0.5142555643940006, 'boosting': 'gbdt'}. Best is trial 21 with value: 2362.524736051979.\u001b[0m\n", + "\u001b[32m[I 2023-05-23 17:31:44,298]\u001b[0m Trial 23 finished with value: 2326.6489839222204 and parameters: {'eta': 0.48041844540678624, 'max_depth': 8, 'num_leaves': 255, 'min_data_in_leaf': 20, 'min_gain_to_split': 13.484345479556708, 'min_sum_hessian_in_leaf': 16.431071107898838, 'subsample': 0.5624857227813972, 'feature_fraction': 0.6123630330183538, 'boosting': 'gbdt'}. Best is trial 23 with value: 2326.6489839222204.\u001b[0m\n", + "\u001b[32m[I 2023-05-23 17:31:46,198]\u001b[0m Trial 24 pruned. Trial was pruned at iteration 20.\u001b[0m\n", + "\u001b[32m[I 2023-05-23 17:31:49,638]\u001b[0m Trial 25 finished with value: 2455.753399349311 and parameters: {'eta': 0.09274107137257977, 'max_depth': 9, 'num_leaves': 255, 'min_data_in_leaf': 20, 'min_gain_to_split': 12.037440566858024, 'min_sum_hessian_in_leaf': 415.87564211338184, 'subsample': 0.5329968439672376, 'feature_fraction': 0.6180810201843444, 'boosting': 'gbdt'}. Best is trial 23 with value: 2326.6489839222204.\u001b[0m\n", + "\u001b[32m[I 2023-05-23 17:31:51,956]\u001b[0m Trial 26 finished with value: 2337.9999985467084 and parameters: {'eta': 0.3198761042660349, 'max_depth': 8, 'num_leaves': 255, 'min_data_in_leaf': 20, 'min_gain_to_split': 17.362250870635982, 'min_sum_hessian_in_leaf': 21.916990640862718, 'subsample': 0.6307392213285038, 'feature_fraction': 0.6034895857694762, 'boosting': 'gbdt'}. 
Best is trial 23 with value: 2326.6489839222204.\u001b[0m\n", + "\u001b[32m[I 2023-05-23 17:31:54,205]\u001b[0m Trial 27 finished with value: 2342.8801951807536 and parameters: {'eta': 0.37383407365164384, 'max_depth': 9, 'num_leaves': 255, 'min_data_in_leaf': 20, 'min_gain_to_split': 16.8456939886239, 'min_sum_hessian_in_leaf': 26.35122966162377, 'subsample': 0.6227501147607917, 'feature_fraction': 0.5985785240866417, 'boosting': 'gbdt'}. Best is trial 23 with value: 2326.6489839222204.\u001b[0m\n", + "\u001b[32m[I 2023-05-23 17:31:57,606]\u001b[0m Trial 28 finished with value: 2361.158874178057 and parameters: {'eta': 0.0785487440687128, 'max_depth': 9, 'num_leaves': 255, 'min_data_in_leaf': 20, 'min_gain_to_split': 18.125916260614158, 'min_sum_hessian_in_leaf': 20.79584132909297, 'subsample': 0.6221006363725866, 'feature_fraction': 0.6066570638274086, 'boosting': 'gbdt'}. Best is trial 23 with value: 2326.6489839222204.\u001b[0m\n", + "\u001b[32m[I 2023-05-23 17:31:59,727]\u001b[0m Trial 29 finished with value: 2367.1929996325844 and parameters: {'eta': 0.21238760340231297, 'max_depth': 8, 'num_leaves': 255, 'min_data_in_leaf': 20, 'min_gain_to_split': 20.92826562733092, 'min_sum_hessian_in_leaf': 18.417894683158824, 'subsample': 0.669890431142174, 'feature_fraction': 0.7294069371245535, 'boosting': 'gbdt'}. Best is trial 23 with value: 2326.6489839222204.\u001b[0m\n", + "\u001b[32m[I 2023-05-23 17:32:02,052]\u001b[0m Trial 30 finished with value: 2394.8728840212125 and parameters: {'eta': 0.4411267672169036, 'max_depth': 9, 'num_leaves': 255, 'min_data_in_leaf': 20, 'min_gain_to_split': 5.775214524227474, 'min_sum_hessian_in_leaf': 153.1182095288567, 'subsample': 0.5975404530271493, 'feature_fraction': 0.4753151586692447, 'boosting': 'gbdt'}. 
Best is trial 23 with value: 2326.6489839222204.\u001b[0m\n", + "\u001b[32m[I 2023-05-23 17:32:05,629]\u001b[0m Trial 31 finished with value: 2347.181534280952 and parameters: {'eta': 0.07872931297026926, 'max_depth': 9, 'num_leaves': 255, 'min_data_in_leaf': 20, 'min_gain_to_split': 17.549321224257636, 'min_sum_hessian_in_leaf': 17.059907860856622, 'subsample': 0.624900603204972, 'feature_fraction': 0.6061742100950961, 'boosting': 'gbdt'}. Best is trial 23 with value: 2326.6489839222204.\u001b[0m\n", + "\u001b[32m[I 2023-05-23 17:32:08,140]\u001b[0m Trial 32 finished with value: 2328.7471642655028 and parameters: {'eta': 0.1973042598097023, 'max_depth': 10, 'num_leaves': 255, 'min_data_in_leaf': 20, 'min_gain_to_split': 16.62736748492315, 'min_sum_hessian_in_leaf': 9.125298639304418, 'subsample': 0.7202093475107586, 'feature_fraction': 0.6198991930466521, 'boosting': 'gbdt'}. Best is trial 23 with value: 2326.6489839222204.\u001b[0m\n", + "\u001b[32m[I 2023-05-23 17:32:10,998]\u001b[0m Trial 33 finished with value: 2298.6721740924013 and parameters: {'eta': 0.17803868809627144, 'max_depth': 10, 'num_leaves': 255, 'min_data_in_leaf': 20, 'min_gain_to_split': 11.180551086747291, 'min_sum_hessian_in_leaf': 5.941223084296848, 'subsample': 0.7044395005683846, 'feature_fraction': 0.6512665609282323, 'boosting': 'gbdt'}. Best is trial 33 with value: 2298.6721740924013.\u001b[0m\n", + "\u001b[32m[I 2023-05-23 17:32:15,761]\u001b[0m Trial 34 finished with value: 2288.310725119945 and parameters: {'eta': 0.04269902087327984, 'max_depth': 10, 'num_leaves': 255, 'min_data_in_leaf': 20, 'min_gain_to_split': 10.327585984239349, 'min_sum_hessian_in_leaf': 6.153929451386866, 'subsample': 0.7460618046807748, 'feature_fraction': 0.6979232506912373, 'boosting': 'gbdt'}. 
Best is trial 34 with value: 2288.310725119945.\u001b[0m\n", + "\u001b[32m[I 2023-05-23 17:32:21,568]\u001b[0m Trial 35 finished with value: 2301.1912863650805 and parameters: {'eta': 0.032750934241078125, 'max_depth': 10, 'num_leaves': 255, 'min_data_in_leaf': 20, 'min_gain_to_split': 11.968960432565158, 'min_sum_hessian_in_leaf': 3.565962093638854, 'subsample': 0.7242452532512303, 'feature_fraction': 0.676646365887166, 'boosting': 'gbdt'}. Best is trial 34 with value: 2288.310725119945.\u001b[0m\n", + "\u001b[32m[I 2023-05-23 17:32:26,398]\u001b[0m Trial 36 finished with value: 2278.9083548948374 and parameters: {'eta': 0.03760315712287438, 'max_depth': 10, 'num_leaves': 255, 'min_data_in_leaf': 20, 'min_gain_to_split': 10.31982551796113, 'min_sum_hessian_in_leaf': 1.8902616256990257, 'subsample': 0.7553375646342636, 'feature_fraction': 0.7675486539088038, 'boosting': 'gbdt'}. Best is trial 36 with value: 2278.9083548948374.\u001b[0m\n", + "\u001b[32m[I 2023-05-23 17:32:31,837]\u001b[0m Trial 37 finished with value: 2269.2143955976367 and parameters: {'eta': 0.031145173663260313, 'max_depth': 10, 'num_leaves': 255, 'min_data_in_leaf': 20, 'min_gain_to_split': 8.952859571921808, 'min_sum_hessian_in_leaf': 3.0772038834652715, 'subsample': 0.7709696860517249, 'feature_fraction': 0.7701635257519885, 'boosting': 'gbdt'}. Best is trial 37 with value: 2269.2143955976367.\u001b[0m\n", + "\u001b[32m[I 2023-05-23 17:32:33,736]\u001b[0m Trial 38 pruned. Trial was pruned at iteration 20.\u001b[0m\n", + "\u001b[32m[I 2023-05-23 17:32:38,801]\u001b[0m Trial 39 finished with value: 2316.269695587194 and parameters: {'eta': 0.04173267653504825, 'max_depth': 10, 'num_leaves': 255, 'min_data_in_leaf': 20, 'min_gain_to_split': 6.89728898162554, 'min_sum_hessian_in_leaf': 77.90819993951197, 'subsample': 0.8510010952323241, 'feature_fraction': 0.8386166008506785, 'boosting': 'gbdt'}. 
Best is trial 37 with value: 2269.2143955976367.\u001b[0m\n", + "\u001b[32m[I 2023-05-23 17:32:40,919]\u001b[0m Trial 40 pruned. Trial was pruned at iteration 20.\u001b[0m\n", + "\u001b[32m[I 2023-05-23 17:32:43,050]\u001b[0m Trial 41 pruned. Trial was pruned at iteration 20.\u001b[0m\n", + "\u001b[32m[I 2023-05-23 17:32:48,767]\u001b[0m Trial 42 finished with value: 2245.070069471738 and parameters: {'eta': 0.042173847501786106, 'max_depth': 10, 'num_leaves': 255, 'min_data_in_leaf': 20, 'min_gain_to_split': 5.6188158422858985, 'min_sum_hessian_in_leaf': 1.7629752758962243, 'subsample': 0.8016798146120535, 'feature_fraction': 0.7496454044538944, 'boosting': 'gbdt'}. Best is trial 42 with value: 2245.070069471738.\u001b[0m\n", + "\u001b[32m[I 2023-05-23 17:32:53,119]\u001b[0m Trial 43 finished with value: 2228.159596963739 and parameters: {'eta': 0.05191998792950477, 'max_depth': 10, 'num_leaves': 255, 'min_data_in_leaf': 20, 'min_gain_to_split': 4.741057189243693, 'min_sum_hessian_in_leaf': 1.271027464320476, 'subsample': 0.8219192415951626, 'feature_fraction': 0.8296163363382973, 'boosting': 'gbdt'}. Best is trial 43 with value: 2228.159596963739.\u001b[0m\n", + "\u001b[32m[I 2023-05-23 17:32:57,516]\u001b[0m Trial 44 finished with value: 2226.78036535566 and parameters: {'eta': 0.051477816066066946, 'max_depth': 9, 'num_leaves': 255, 'min_data_in_leaf': 20, 'min_gain_to_split': 4.376364800405991, 'min_sum_hessian_in_leaf': 0.8792254560052736, 'subsample': 0.8387685023131976, 'feature_fraction': 0.8535264518938385, 'boosting': 'gbdt'}. Best is trial 44 with value: 2226.78036535566.\u001b[0m\n", + "\u001b[32m[I 2023-05-23 17:32:59,568]\u001b[0m Trial 45 pruned. 
Trial was pruned at iteration 20.\u001b[0m\n", + "\u001b[32m[I 2023-05-23 17:33:07,602]\u001b[0m Trial 46 finished with value: 2075.2410532368367 and parameters: {'eta': 0.05688986586082767, 'max_depth': 9, 'num_leaves': 255, 'min_data_in_leaf': 20, 'min_gain_to_split': 0.16750442551495404, 'min_sum_hessian_in_leaf': 0.2687257222099593, 'subsample': 0.8493305271131836, 'feature_fraction': 0.878587674756591, 'boosting': 'gbdt'}. Best is trial 46 with value: 2075.2410532368367.\u001b[0m\n", + "\u001b[32m[I 2023-05-23 17:33:13,825]\u001b[0m Trial 47 finished with value: 2151.618233076929 and parameters: {'eta': 0.05716208939561728, 'max_depth': 9, 'num_leaves': 255, 'min_data_in_leaf': 20, 'min_gain_to_split': 0.6168959319814218, 'min_sum_hessian_in_leaf': 0.26236498755098553, 'subsample': 0.8864865117037826, 'feature_fraction': 0.8864687211471932, 'boosting': 'gbdt'}. Best is trial 46 with value: 2075.2410532368367.\u001b[0m\n", + "\u001b[32m[I 2023-05-23 17:33:19,091]\u001b[0m Trial 48 finished with value: 2163.8704917362998 and parameters: {'eta': 0.0661500125532209, 'max_depth': 9, 'num_leaves': 255, 'min_data_in_leaf': 20, 'min_gain_to_split': 0.9170210019308764, 'min_sum_hessian_in_leaf': 0.23950036814351627, 'subsample': 0.9009058592987782, 'feature_fraction': 0.8812469322199725, 'boosting': 'gbdt'}. Best is trial 46 with value: 2075.2410532368367.\u001b[0m\n", + "\u001b[32m[I 2023-05-23 17:33:27,158]\u001b[0m Trial 49 finished with value: 2108.266819652561 and parameters: {'eta': 0.07664872488957572, 'max_depth': 7, 'num_leaves': 255, 'min_data_in_leaf': 20, 'min_gain_to_split': 0.35411552277915437, 'min_sum_hessian_in_leaf': 0.22189629675845274, 'subsample': 0.8804800016807428, 'feature_fraction': 0.894418132133942, 'boosting': 'gbdt'}. 
Best is trial 46 with value: 2075.2410532368367.\u001b[0m\n", + "\u001b[32m[I 2023-05-23 17:33:34,898]\u001b[0m Trial 50 finished with value: 2055.6390948180738 and parameters: {'eta': 0.12500756616342865, 'max_depth': 8, 'num_leaves': 255, 'min_data_in_leaf': 20, 'min_gain_to_split': 0.21252077843699707, 'min_sum_hessian_in_leaf': 0.020986146460828595, 'subsample': 0.8964896790032578, 'feature_fraction': 0.888380440217222, 'boosting': 'gbdt'}. Best is trial 50 with value: 2055.6390948180738.\u001b[0m\n", + "\u001b[32m[I 2023-05-23 17:33:42,905]\u001b[0m Trial 51 finished with value: 2071.960617781594 and parameters: {'eta': 0.06924549825191528, 'max_depth': 7, 'num_leaves': 255, 'min_data_in_leaf': 20, 'min_gain_to_split': 0.22672170889501356, 'min_sum_hessian_in_leaf': 0.030355401129561588, 'subsample': 0.9189651731692174, 'feature_fraction': 0.9087302856711369, 'boosting': 'gbdt'}. Best is trial 50 with value: 2055.6390948180738.\u001b[0m\n", + "\u001b[32m[I 2023-05-23 17:33:51,056]\u001b[0m Trial 52 finished with value: 1984.2579611521608 and parameters: {'eta': 0.11687267868675957, 'max_depth': 7, 'num_leaves': 255, 'min_data_in_leaf': 20, 'min_gain_to_split': 0.07270388623516136, 'min_sum_hessian_in_leaf': 0.01256536556758994, 'subsample': 0.9215379815933538, 'feature_fraction': 0.8831075816856314, 'boosting': 'gbdt'}. Best is trial 52 with value: 1984.2579611521608.\u001b[0m\n", + "\u001b[32m[I 2023-05-23 17:33:55,510]\u001b[0m Trial 53 finished with value: 2146.5495247821887 and parameters: {'eta': 0.12129221695006828, 'max_depth': 7, 'num_leaves': 255, 'min_data_in_leaf': 20, 'min_gain_to_split': 0.6396267929673847, 'min_sum_hessian_in_leaf': 0.014955616333887792, 'subsample': 0.9675877305859379, 'feature_fraction': 0.914541493647696, 'boosting': 'gbdt'}. 
Best is trial 52 with value: 1984.2579611521608.\u001b[0m\n", + "\u001b[32m[I 2023-05-23 17:33:58,812]\u001b[0m Trial 54 finished with value: 2216.960939400165 and parameters: {'eta': 0.12572107433539675, 'max_depth': 7, 'num_leaves': 255, 'min_data_in_leaf': 20, 'min_gain_to_split': 2.486452625652089, 'min_sum_hessian_in_leaf': 0.022859901566637045, 'subsample': 0.9701996437587083, 'feature_fraction': 0.9435590561868864, 'boosting': 'gbdt'}. Best is trial 52 with value: 1984.2579611521608.\u001b[0m\n", + "\u001b[32m[I 2023-05-23 17:34:05,163]\u001b[0m Trial 55 finished with value: 2096.381785776171 and parameters: {'eta': 0.10418381263470201, 'max_depth': 7, 'num_leaves': 255, 'min_data_in_leaf': 20, 'min_gain_to_split': 0.2894194562530933, 'min_sum_hessian_in_leaf': 0.008628756043438867, 'subsample': 0.9227174740490002, 'feature_fraction': 0.94055089663108, 'boosting': 'gbdt'}. Best is trial 52 with value: 1984.2579611521608.\u001b[0m\n", + "\u001b[32m[I 2023-05-23 17:34:08,247]\u001b[0m Trial 56 finished with value: 2224.8738393418266 and parameters: {'eta': 0.11542910593282225, 'max_depth': 5, 'num_leaves': 255, 'min_data_in_leaf': 20, 'min_gain_to_split': 2.6649531436506826, 'min_sum_hessian_in_leaf': 0.0023058536323214427, 'subsample': 0.9331309196557765, 'feature_fraction': 0.9759845072852276, 'boosting': 'gbdt'}. Best is trial 52 with value: 1984.2579611521608.\u001b[0m\n", + "\u001b[32m[I 2023-05-23 17:34:12,207]\u001b[0m Trial 57 finished with value: 2201.1369744407652 and parameters: {'eta': 0.09500667632743191, 'max_depth': 7, 'num_leaves': 255, 'min_data_in_leaf': 20, 'min_gain_to_split': 1.7323883280785455, 'min_sum_hessian_in_leaf': 0.006867202144082953, 'subsample': 0.934558561781809, 'feature_fraction': 0.9466733475970236, 'boosting': 'gbdt'}. 
Best is trial 52 with value: 1984.2579611521608.\u001b[0m\n", + "\u001b[32m[I 2023-05-23 17:34:19,651]\u001b[0m Trial 58 finished with value: 1933.9365744624768 and parameters: {'eta': 0.18868995052477497, 'max_depth': 6, 'num_leaves': 255, 'min_data_in_leaf': 20, 'min_gain_to_split': 0.1067669789031166, 'min_sum_hessian_in_leaf': 0.02332933630758901, 'subsample': 0.9912837219065174, 'feature_fraction': 0.8774041235779472, 'boosting': 'gbdt'}. Best is trial 58 with value: 1933.9365744624768.\u001b[0m\n", + "\u001b[32m[I 2023-05-23 17:34:22,430]\u001b[0m Trial 59 finished with value: 2258.716691864054 and parameters: {'eta': 0.24750524982553787, 'max_depth': 5, 'num_leaves': 255, 'min_data_in_leaf': 20, 'min_gain_to_split': 3.1775290438903765, 'min_sum_hessian_in_leaf': 0.01911494627612403, 'subsample': 0.9925214962908524, 'feature_fraction': 0.805754592334728, 'boosting': 'gbdt'}. Best is trial 58 with value: 1933.9365744624768.\u001b[0m\n", + "\u001b[32m[I 2023-05-23 17:34:29,991]\u001b[0m Trial 60 finished with value: 1952.7434978700257 and parameters: {'eta': 0.14828485489275825, 'max_depth': 6, 'num_leaves': 255, 'min_data_in_leaf': 20, 'min_gain_to_split': 0.0589375634903722, 'min_sum_hessian_in_leaf': 0.0018433608718709821, 'subsample': 0.9262578601257356, 'feature_fraction': 0.8651514021779112, 'boosting': 'gbdt'}. Best is trial 58 with value: 1933.9365744624768.\u001b[0m\n", + "\u001b[32m[I 2023-05-23 17:34:37,730]\u001b[0m Trial 61 finished with value: 1939.7417403365591 and parameters: {'eta': 0.18111930699526543, 'max_depth': 6, 'num_leaves': 255, 'min_data_in_leaf': 20, 'min_gain_to_split': 0.10458111640097839, 'min_sum_hessian_in_leaf': 0.0022072579873266725, 'subsample': 0.9211509741532398, 'feature_fraction': 0.8647453808290021, 'boosting': 'gbdt'}. 
Best is trial 58 with value: 1933.9365744624768.\u001b[0m\n", + "\u001b[32m[I 2023-05-23 17:34:40,696]\u001b[0m Trial 62 finished with value: 2211.175665460217 and parameters: {'eta': 0.1685830542910868, 'max_depth': 6, 'num_leaves': 255, 'min_data_in_leaf': 20, 'min_gain_to_split': 1.8868049680261478, 'min_sum_hessian_in_leaf': 0.0028510296013477122, 'subsample': 0.868263507655618, 'feature_fraction': 0.8589181537801993, 'boosting': 'gbdt'}. Best is trial 58 with value: 1933.9365744624768.\u001b[0m\n", + "\u001b[32m[I 2023-05-23 17:34:43,293]\u001b[0m Trial 63 finished with value: 2269.195082428441 and parameters: {'eta': 0.5506707225683988, 'max_depth': 5, 'num_leaves': 255, 'min_data_in_leaf': 20, 'min_gain_to_split': 3.0858512762478707, 'min_sum_hessian_in_leaf': 0.0006100572331300093, 'subsample': 0.9104709717863305, 'feature_fraction': 0.9166521441326464, 'boosting': 'gbdt'}. Best is trial 58 with value: 1933.9365744624768.\u001b[0m\n", + "\n", + "Hyper-Parameter Optimization successfully finished.\n", + " Number of finished trials: 64\n", + " Best trial:\n", + " Value: 1933.9365744624768\n", + " Params: \n", + " eta: 0.18868995052477497\n", + " max_depth: 6\n", + " num_leaves: 255\n", + " min_data_in_leaf: 20\n", + " min_gain_to_split: 0.1067669789031166\n", + " min_sum_hessian_in_leaf: 0.02332933630758901\n", + " subsample: 0.9912837219065174\n", + " feature_fraction: 0.8774041235779472\n", + " boosting: gbdt\n", + " opt_rounds: 100\n" + ] + } + ], + "source": [ + "# Any LightGBM hyperparameter can be tuned, where the structure of the parameter dictionary needs to be as follows:\n", + "\n", + " # Float/Int sample_type\n", + " # {\"param_name\": [\"sample_type\", {\"low\": low, \"high\": high, \"log\": log}]}\n", + " # sample_type: str, Type of sampling, e.g., \"float\" or \"int\"\n", + " # low: int or float, Lower endpoint of the range of suggested values\n", + " # high: int or float, Upper endpoint of the range of suggested values\n", + " # log: bool, Flag to sample the value from the log domain or 
not\n", + " # Example: {\"eta\": [\"float\", {\"low\": 1e-5, \"high\": 1, \"log\": True}]}\n", + "\n", + " # Categorical sample_type\n", + " # {\"param_name\": [\"sample_type\", [\"choice1\", \"choice2\", \"choice3\", \"...\"]]}\n", + " # sample_type: str, Type of sampling, must be \"categorical\"\n", + " # choice1, choice2, choice3, ...: str, Possible choices for the parameter\n", + " # Example: {\"boosting\": [\"categorical\", [\"gbdt\", \"dart\"]]}\n", + "\n", + " # For parameters without tunable choice (this is needed, e.g., if device_type = \"gpu\" and gpu_device_id needs to be specified)\n", + " # {\"param_name\": [\"none\", [value]]},\n", + " # param_name: str, Name of the parameter\n", + " # value: int, Value of the parameter\n", + " # Example: {\"gpu_device_id\": [\"none\", [0]]}\n", + "\n", + "param_dict = {\n", + " \"eta\": [\"float\", {\"low\": 1e-5, \"high\": 1, \"log\": True}],\n", + " \"max_depth\": [\"int\", {\"low\": 1, \"high\": 10, \"log\": False}],\n", + " \"num_leaves\": [\"int\", {\"low\": 255, \"high\": 255, \"log\": False}], # set to constant for this example\n", + " \"min_data_in_leaf\": [\"int\", {\"low\": 20, \"high\": 20, \"log\": False}], # set to constant for this example\n", + " \"min_gain_to_split\": [\"float\", {\"low\": 1e-8, \"high\": 40, \"log\": False}],\n", + " \"min_sum_hessian_in_leaf\": [\"float\", {\"low\": 1e-8, \"high\": 500, \"log\": True}],\n", + " \"subsample\": [\"float\", {\"low\": 0.2, \"high\": 1.0, \"log\": False}],\n", + " \"feature_fraction\": [\"float\", {\"low\": 0.2, \"high\": 1.0, \"log\": False}],\n", + " \"boosting\": [\"categorical\", [\"gbdt\"]],\n", + "}\n", + "\n", + "np.random.seed(123)\n", + "opt_param = lgblss.hyper_opt(param_dict,\n", + " dtrain,\n", + " num_boost_round=100, # Number of boosting iterations.\n", + " nfold=5, # Number of cv-folds.\n", + " early_stopping_rounds=20, # Number of early-stopping rounds\n", + " max_minutes=5, # Time budget in minutes, i.e., stop study after the given number of minutes.\n", + " 
n_trials=None, # The number of trials. If this argument is set to None, there is no limitation on the number of trials.\n", + " silence=False, # Controls the verbosity of the trial, i.e., user can silence the outputs of the trial.\n", + " seed=123, # Seed used to generate cv-folds.\n", + " hp_seed=None # Seed for random number generator used in the Bayesian hyperparameter search.\n", + " )" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Model Training" + ] + }, + { + "cell_type": "code", + "execution_count": 5, + "metadata": { + "ExecuteTime": { + "end_time": "2023-05-18T06:57:01.202188600Z", + "start_time": "2023-05-18T06:56:59.315239400Z" + } + }, + "outputs": [ + { + "data": { + "text/plain": [ + "" + ] + }, + "execution_count": 5, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "np.random.seed(123)\n", + "\n", + "opt_params = opt_param.copy()\n", + "n_rounds = opt_params[\"opt_rounds\"]\n", + "del opt_params[\"opt_rounds\"]\n", + "\n", + "# Train Model with optimized hyperparameters\n", + "lgblss.train(opt_params,\n", + " dtrain,\n", + " num_boost_round=n_rounds\n", + " )" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Prediction" + ] + }, + { + "cell_type": "code", + "execution_count": 7, + "metadata": { + "ExecuteTime": { + "end_time": "2023-05-18T06:57:02.113749300Z", + "start_time": "2023-05-18T06:57:01.203185800Z" + } + }, + "outputs": [], + "source": [ + "# Set seed for reproducibility\n", + "torch.manual_seed(123)\n", + "\n", + "# Number of samples to draw from predicted distribution\n", + "n_samples = 1000\n", + "quant_sel = [0.05, 0.95] # Quantiles to calculate from predicted distribution\n", + "\n", + "# Sample from predicted distribution\n", + "pred_samples = lgblss.predict(X_test,\n", + " pred_type=\"samples\",\n", + " n_samples=n_samples,\n", + " seed=123)\n", + "\n", + "# Calculate quantiles from predicted distribution\n", + "pred_quantiles = 
lgblss.predict(X_test,\n", + " pred_type=\"quantiles\",\n", + " n_samples=n_samples,\n", + " quantiles=quant_sel)\n", + "\n", + "# Returns predicted distributional parameters\n", + "pred_params = lgblss.predict(X_test,\n", + " pred_type=\"parameters\")" + ] + }, + { + "cell_type": "code", + "execution_count": 8, + "metadata": { + "ExecuteTime": { + "end_time": "2023-05-18T06:57:02.163617200Z", + "start_time": "2023-05-18T06:57:02.114747Z" + } + }, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
y_sample0y_sample1y_sample2y_sample3y_sample4y_sample5y_sample6y_sample7y_sample8y_sample9...y_sample990y_sample991y_sample992y_sample993y_sample994y_sample995y_sample996y_sample997y_sample998y_sample999
033282027234528285232...52501026274910394738
125241930154220362531...10441462132517192717
21316211428181829138...811311640169151218
32016171510107109...22131955231312249
4510122696858...61108732425
\n", + "

5 rows × 1000 columns

\n", + "
" + ], + "text/plain": [ + " y_sample0 y_sample1 y_sample2 y_sample3 y_sample4 y_sample5 \n", + "0 33 28 20 27 23 45 \\\n", + "1 25 24 19 30 15 42 \n", + "2 13 16 21 14 28 18 \n", + "3 20 16 17 1 5 10 \n", + "4 5 10 12 2 6 9 \n", + "\n", + " y_sample6 y_sample7 y_sample8 y_sample9 ... y_sample990 y_sample991 \n", + "0 28 28 52 32 ... 52 50 \\\n", + "1 20 36 25 31 ... 10 44 \n", + "2 18 29 13 8 ... 8 11 \n", + "3 10 7 10 9 ... 22 13 \n", + "4 6 8 5 8 ... 6 1 \n", + "\n", + " y_sample992 y_sample993 y_sample994 y_sample995 y_sample996 \n", + "0 10 26 27 49 10 \\\n", + "1 14 62 13 25 17 \n", + "2 31 16 40 16 9 \n", + "3 19 5 5 23 13 \n", + "4 10 8 7 3 2 \n", + "\n", + " y_sample997 y_sample998 y_sample999 \n", + "0 39 47 38 \n", + "1 19 27 17 \n", + "2 15 12 18 \n", + "3 12 24 9 \n", + "4 4 2 5 \n", + "\n", + "[5 rows x 1000 columns]" + ] + }, + "execution_count": 8, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "pred_samples.head()" + ] + }, + { + "cell_type": "code", + "execution_count": 9, + "metadata": { + "ExecuteTime": { + "end_time": "2023-05-18T06:57:02.186554200Z", + "start_time": "2023-05-18T06:57:02.146661700Z" + } + }, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
quant_0.05quant_0.95
01358
1947
2636
3324
4115
\n", + "
" + ], + "text/plain": [ + " quant_0.05 quant_0.95\n", + "0 13 58\n", + "1 9 47\n", + "2 6 36\n", + "3 3 24\n", + "4 1 15" + ] + }, + "execution_count": 9, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "pred_quantiles.head()" + ] + }, + { + "cell_type": "code", + "execution_count": 10, + "metadata": { + "ExecuteTime": { + "end_time": "2023-05-18T06:57:02.233679600Z", + "start_time": "2023-05-18T06:57:02.160625600Z" + } + }, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
total_countprobs
06.7954830.826296
16.3908240.797698
25.6518810.767255
35.1142010.689337
44.3825650.614175
\n", + "
" + ], + "text/plain": [ + " total_count probs\n", + "0 6.795483 0.826296\n", + "1 6.390824 0.797698\n", + "2 5.651881 0.767255\n", + "3 5.114201 0.689337\n", + "4 4.382565 0.614175" + ] + }, + "execution_count": 10, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "pred_params.head()" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3 (ipykernel)", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.9.0" + } + }, + "nbformat": 4, + "nbformat_minor": 4 +} diff --git a/examples/simulation_example_Poisson.ipynb b/examples/simulation_example_Poisson.ipynb new file mode 100644 index 0000000..835da47 --- /dev/null +++ b/examples/simulation_example_Poisson.ipynb @@ -0,0 +1,953 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "[![Open in Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/StatMixedML/LightGBMLSS/blob/master/examples/simulation_example_Poisson.ipynb)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Imports" + ] + }, + { + "cell_type": "code", + "execution_count": 12, + "metadata": { + "ExecuteTime": { + "end_time": "2023-05-18T06:51:02.442538100Z", + "start_time": "2023-05-18T06:51:02.434559700Z" + } + }, + "outputs": [], + "source": [ + "from lightgbmlss.model import *\n", + "from lightgbmlss.distributions.Poisson import *\n", + "\n", + "from sklearn.datasets import make_regression\n", + "from sklearn.model_selection import train_test_split\n", + "import matplotlib.pyplot as plt" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Data" + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "metadata": { + 
"ExecuteTime": { + "end_time": "2023-05-18T06:43:08.745369900Z", + "start_time": "2023-05-18T06:43:07.883394Z" + } + }, + "outputs": [], + "source": [ + "def custom_transform(y, constr_val):\n", + " # Apply a custom transformation to restrict y between 0 and constr_val\n", + " transformed_y = np.abs(y) # Example transformation: logarithmic\n", + " constrained_y = constr_val * transformed_y / np.max(transformed_y) # Scale to desired range\n", + " int_y = constrained_y.astype(int)\n", + " return int_y\n", + "\n", + "# Generate a custom dataset\n", + "X, y = make_regression(n_samples=5000, n_features=10, n_informative=2, random_state=123)\n", + "\n", + "# Apply the custom transformation\n", + "y = custom_transform(y, 50)\n", + "\n", + "# Make a dataframe for visualization\n", + "X = pd.DataFrame(X, columns=[f\"x_{i+1}\" for i in range(X.shape[1])])\n", + "\n", + "# Split into train and test\n", + "X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=123)\n", + "\n", + "dtrain = lgb.Dataset(X_train, label=y_train)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Distribution Selection" + ] + }, + { + "cell_type": "code", + "execution_count": 3, + "metadata": { + "ExecuteTime": { + "end_time": "2023-05-18T06:43:08.760991300Z", + "start_time": "2023-05-18T06:43:08.745369900Z" + } + }, + "outputs": [], + "source": [ + "# Specifies Poisson distribution with relu response function and option to stabilize Gradient/Hessian.\n", + "lgblss = LightGBMLSS(\n", + " Poisson(stabilization=\"None\", # Options are \"None\", \"MAD\", \"L2\".\n", + " response_fn=\"relu\") # Function to transform the rate-parameter, e.g., \"exp\", \"softplus\" or \"relu\".\n", + ")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Hyper-Parameter Optimization" + ] + }, + { + "cell_type": "code", + "execution_count": 4, + "metadata": { + "ExecuteTime": { + "end_time": "2023-05-18T06:48:09.339445300Z", + "start_time": 
"2023-05-18T06:43:08.760991300Z" + } + }, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "\u001b[32m[I 2023-05-23 17:49:20,692]\u001b[0m A new study created in memory with name: LightGBMLSS Hyper-Parameter Optimization\u001b[0m\n", + "C:\\Users\\maerzale\\.virtualenvs\\LightGBMLSS-Dam57Fpb\\lib\\site-packages\\optuna\\progress_bar.py:56: ExperimentalWarning: Progress bar is experimental (supported from v1.2.0). The interface can change in the future.\n" + ] + }, + { + "data": { + "application/vnd.jupyter.widget-view+json": { + "model_id": "896155058995486c92e829aa03d90f2a", + "version_major": 2, + "version_minor": 0 + }, + "text/plain": [ + " 0%| | 00:00/05:00" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\u001b[32m[I 2023-05-23 17:49:24,165]\u001b[0m Trial 0 finished with value: 3980.3492107495294 and parameters: {'eta': 2.699359681957599e-05, 'max_depth': 6, 'subsample': 0.2999341545563561, 'feature_fraction': 0.5556034158427867, 'boosting': 'gbdt'}. Best is trial 0 with value: 3980.3492107495294.\u001b[0m\n", + "\u001b[32m[I 2023-05-23 17:49:26,086]\u001b[0m Trial 1 finished with value: 2518.34168110926 and parameters: {'eta': 0.45895527487414584, 'max_depth': 4, 'subsample': 0.9457458063686266, 'feature_fraction': 0.7992492274530714, 'boosting': 'gbdt'}. Best is trial 1 with value: 2518.34168110926.\u001b[0m\n", + "\u001b[32m[I 2023-05-23 17:49:28,467]\u001b[0m Trial 2 finished with value: 1973.077981251329 and parameters: {'eta': 0.1076719026212947, 'max_depth': 2, 'subsample': 0.9949902525970964, 'feature_fraction': 0.8547024470705862, 'boosting': 'gbdt'}. 
Best is trial 2 with value: 1973.077981251329.\u001b[0m\n", + "\u001b[32m[I 2023-05-23 17:49:30,875]\u001b[0m Trial 3 finished with value: 3222.7300283960785 and parameters: {'eta': 0.02108376076622654, 'max_depth': 2, 'subsample': 0.4446423511770703, 'feature_fraction': 0.6224426341684355, 'boosting': 'gbdt'}. Best is trial 2 with value: 1973.077981251329.\u001b[0m\n", + "\u001b[32m[I 2023-05-23 17:49:33,939]\u001b[0m Trial 4 finished with value: 2726.9540583834487 and parameters: {'eta': 0.12416637667735411, 'max_depth': 7, 'subsample': 0.9328713673646369, 'feature_fraction': 0.3539707901862066, 'boosting': 'gbdt'}. Best is trial 2 with value: 1973.077981251329.\u001b[0m\n", + "\u001b[32m[I 2023-05-23 17:49:37,033]\u001b[0m Trial 5 finished with value: 3835.158268058169 and parameters: {'eta': 0.005223357095351933, 'max_depth': 1, 'subsample': 0.8379109985296609, 'feature_fraction': 0.9981101544369979, 'boosting': 'gbdt'}. Best is trial 2 with value: 1973.077981251329.\u001b[0m\n", + "\u001b[32m[I 2023-05-23 17:49:40,269]\u001b[0m Trial 6 finished with value: 3817.048573437457 and parameters: {'eta': 0.0027739627706070196, 'max_depth': 5, 'subsample': 0.930286994682431, 'feature_fraction': 0.3119284237647043, 'boosting': 'gbdt'}. Best is trial 2 with value: 1973.077981251329.\u001b[0m\n", + "\u001b[32m[I 2023-05-23 17:49:43,923]\u001b[0m Trial 7 finished with value: 3189.893060214954 and parameters: {'eta': 0.011017544852965473, 'max_depth': 9, 'subsample': 0.8816085222739356, 'feature_fraction': 0.2592839499697819, 'boosting': 'gbdt'}. Best is trial 2 with value: 1973.077981251329.\u001b[0m\n", + "\u001b[32m[I 2023-05-23 17:49:45,026]\u001b[0m Trial 8 finished with value: 3529.5167951296885 and parameters: {'eta': 0.5558605312146222, 'max_depth': 6, 'subsample': 0.664453376884137, 'feature_fraction': 0.3367665412900343, 'boosting': 'gbdt'}. 
Best is trial 2 with value: 1973.077981251329.\u001b[0m\n", + "\u001b[32m[I 2023-05-23 17:49:48,690]\u001b[0m Trial 9 finished with value: 3973.1992904924773 and parameters: {'eta': 2.386367897867079e-05, 'max_depth': 7, 'subsample': 0.7436343866982718, 'feature_fraction': 0.8531334189403488, 'boosting': 'gbdt'}. Best is trial 2 with value: 1973.077981251329.\u001b[0m\n", + "\u001b[32m[I 2023-05-23 17:49:51,843]\u001b[0m Trial 10 finished with value: 1859.3452692304927 and parameters: {'eta': 0.06347344481668696, 'max_depth': 3, 'subsample': 0.5558305184948805, 'feature_fraction': 0.992559257639847, 'boosting': 'gbdt'}. Best is trial 10 with value: 1859.3452692304927.\u001b[0m\n", + "\u001b[32m[I 2023-05-23 17:49:55,019]\u001b[0m Trial 11 finished with value: 1802.3385411663435 and parameters: {'eta': 0.07269085961109838, 'max_depth': 3, 'subsample': 0.5523755165229258, 'feature_fraction': 0.9987473864842598, 'boosting': 'gbdt'}. Best is trial 11 with value: 1802.3385411663435.\u001b[0m\n", + "\u001b[32m[I 2023-05-23 17:49:58,132]\u001b[0m Trial 12 finished with value: 2008.5592603155278 and parameters: {'eta': 0.04907998209254754, 'max_depth': 3, 'subsample': 0.5502504139020136, 'feature_fraction': 0.9949493453306741, 'boosting': 'gbdt'}. Best is trial 11 with value: 1802.3385411663435.\u001b[0m\n", + "\u001b[32m[I 2023-05-23 17:49:59,092]\u001b[0m Trial 13 pruned. Trial was pruned at iteration 20.\u001b[0m\n", + "\u001b[32m[I 2023-05-23 17:50:00,108]\u001b[0m Trial 14 pruned. Trial was pruned at iteration 20.\u001b[0m\n", + "\u001b[32m[I 2023-05-23 17:50:01,159]\u001b[0m Trial 15 pruned. Trial was pruned at iteration 23.\u001b[0m\n", + "\u001b[32m[I 2023-05-23 17:50:04,608]\u001b[0m Trial 16 finished with value: 1912.255969192346 and parameters: {'eta': 0.2519424480924615, 'max_depth': 4, 'subsample': 0.6019480162652984, 'feature_fraction': 0.7139868847784355, 'boosting': 'gbdt'}. 
Best is trial 11 with value: 1802.3385411663435.\u001b[0m\n", + "\u001b[32m[I 2023-05-23 17:50:08,220]\u001b[0m Trial 17 finished with value: 1800.8913916628667 and parameters: {'eta': 0.026159728338340717, 'max_depth': 10, 'subsample': 0.23799334215586387, 'feature_fraction': 0.8942419948865727, 'boosting': 'gbdt'}. Best is trial 17 with value: 1800.8913916628667.\u001b[0m\n", + "\u001b[32m[I 2023-05-23 17:50:11,947]\u001b[0m Trial 18 finished with value: 2039.1537636382716 and parameters: {'eta': 0.012837167175775474, 'max_depth': 10, 'subsample': 0.2861282857022663, 'feature_fraction': 0.8994980050638184, 'boosting': 'gbdt'}. Best is trial 17 with value: 1800.8913916628667.\u001b[0m\n", + "\u001b[32m[I 2023-05-23 17:50:14,222]\u001b[0m Trial 19 finished with value: 2096.0079586617685 and parameters: {'eta': 0.16357712449435033, 'max_depth': 8, 'subsample': 0.24153404836673487, 'feature_fraction': 0.9064201648101173, 'boosting': 'gbdt'}. Best is trial 17 with value: 1800.8913916628667.\u001b[0m\n", + "\u001b[32m[I 2023-05-23 17:50:15,224]\u001b[0m Trial 20 pruned. Trial was pruned at iteration 20.\u001b[0m\n", + "\u001b[32m[I 2023-05-23 17:50:18,737]\u001b[0m Trial 21 finished with value: 1789.6468540051435 and parameters: {'eta': 0.03755745374776891, 'max_depth': 5, 'subsample': 0.35845869524506735, 'feature_fraction': 0.946992170413161, 'boosting': 'gbdt'}. Best is trial 21 with value: 1789.6468540051435.\u001b[0m\n", + "\u001b[32m[I 2023-05-23 17:50:19,891]\u001b[0m Trial 22 pruned. Trial was pruned at iteration 20.\u001b[0m\n", + "\u001b[32m[I 2023-05-23 17:50:22,188]\u001b[0m Trial 23 finished with value: 2405.5590005979275 and parameters: {'eta': 0.27616911056666227, 'max_depth': 7, 'subsample': 0.37414700238468257, 'feature_fraction': 0.854103557834457, 'boosting': 'gbdt'}. 
Best is trial 21 with value: 1789.6468540051435.\u001b[0m\n", + "\u001b[32m[I 2023-05-23 17:50:26,013]\u001b[0m Trial 24 finished with value: 1750.4546912365124 and parameters: {'eta': 0.04141995048299225, 'max_depth': 8, 'subsample': 0.32350560020697755, 'feature_fraction': 0.9369940306356884, 'boosting': 'gbdt'}. Best is trial 24 with value: 1750.4546912365124.\u001b[0m\n", + "\u001b[32m[I 2023-05-23 17:50:30,003]\u001b[0m Trial 25 finished with value: 1825.8962731262175 and parameters: {'eta': 0.02604108091301939, 'max_depth': 9, 'subsample': 0.20148712591697931, 'feature_fraction': 0.8054311195431573, 'boosting': 'gbdt'}. Best is trial 24 with value: 1750.4546912365124.\u001b[0m\n", + "\u001b[32m[I 2023-05-23 17:50:31,249]\u001b[0m Trial 26 pruned. Trial was pruned at iteration 20.\u001b[0m\n", + "\u001b[32m[I 2023-05-23 17:50:32,439]\u001b[0m Trial 27 pruned. Trial was pruned at iteration 20.\u001b[0m\n", + "\u001b[32m[I 2023-05-23 17:50:34,489]\u001b[0m Trial 28 finished with value: 2241.862650625325 and parameters: {'eta': 0.2091822249381417, 'max_depth': 8, 'subsample': 0.25429587380716545, 'feature_fraction': 0.8233788791342402, 'boosting': 'gbdt'}. Best is trial 24 with value: 1750.4546912365124.\u001b[0m\n", + "\u001b[32m[I 2023-05-23 17:50:35,490]\u001b[0m Trial 29 pruned. Trial was pruned at iteration 20.\u001b[0m\n", + "\u001b[32m[I 2023-05-23 17:50:36,472]\u001b[0m Trial 30 pruned. Trial was pruned at iteration 20.\u001b[0m\n", + "\u001b[32m[I 2023-05-23 17:50:39,956]\u001b[0m Trial 31 finished with value: 1688.9717224880667 and parameters: {'eta': 0.08031009016924785, 'max_depth': 5, 'subsample': 0.44664268600013013, 'feature_fraction': 0.9495697003314675, 'boosting': 'gbdt'}. 
Best is trial 31 with value: 1688.9717224880667.\u001b[0m\n", + "\u001b[32m[I 2023-05-23 17:50:43,896]\u001b[0m Trial 32 finished with value: 1748.948818393838 and parameters: {'eta': 0.04179120993400561, 'max_depth': 5, 'subsample': 0.4447991267112251, 'feature_fraction': 0.9423869685081693, 'boosting': 'gbdt'}. Best is trial 31 with value: 1688.9717224880667.\u001b[0m\n", + "\u001b[32m[I 2023-05-23 17:50:47,470]\u001b[0m Trial 33 finished with value: 1674.627830651402 and parameters: {'eta': 0.1092207671064788, 'max_depth': 5, 'subsample': 0.4381528854095832, 'feature_fraction': 0.9472757074178997, 'boosting': 'gbdt'}. Best is trial 33 with value: 1674.627830651402.\u001b[0m\n", + "\u001b[32m[I 2023-05-23 17:50:50,852]\u001b[0m Trial 34 finished with value: 1643.3783244356266 and parameters: {'eta': 0.08979529354451304, 'max_depth': 5, 'subsample': 0.4298193072212186, 'feature_fraction': 0.8659637792996855, 'boosting': 'gbdt'}. Best is trial 34 with value: 1643.3783244356266.\u001b[0m\n", + "\u001b[32m[I 2023-05-23 17:50:54,244]\u001b[0m Trial 35 finished with value: 1763.4122611204846 and parameters: {'eta': 0.10523696174612765, 'max_depth': 6, 'subsample': 0.45123286116918215, 'feature_fraction': 0.8637194427050581, 'boosting': 'gbdt'}. Best is trial 34 with value: 1643.3783244356266.\u001b[0m\n", + "\u001b[32m[I 2023-05-23 17:50:56,271]\u001b[0m Trial 36 finished with value: 2013.4887388830334 and parameters: {'eta': 0.3648803862328835, 'max_depth': 4, 'subsample': 0.475849472696358, 'feature_fraction': 0.7981758402356571, 'boosting': 'gbdt'}. Best is trial 34 with value: 1643.3783244356266.\u001b[0m\n", + "\u001b[32m[I 2023-05-23 17:50:59,783]\u001b[0m Trial 37 finished with value: 1752.0996934024934 and parameters: {'eta': 0.11894371172218482, 'max_depth': 5, 'subsample': 0.4005001405170632, 'feature_fraction': 0.8718292943515381, 'boosting': 'gbdt'}. 
Best is trial 34 with value: 1643.3783244356266.\u001b[0m\n", + "\u001b[32m[I 2023-05-23 17:51:03,403]\u001b[0m Trial 38 finished with value: 1751.9842725764497 and parameters: {'eta': 0.09947677667048685, 'max_depth': 6, 'subsample': 0.490590870502015, 'feature_fraction': 0.9565691734521639, 'boosting': 'gbdt'}. Best is trial 34 with value: 1643.3783244356266.\u001b[0m\n", + "\u001b[32m[I 2023-05-23 17:51:04,502]\u001b[0m Trial 39 pruned. Trial was pruned at iteration 20.\u001b[0m\n", + "\u001b[32m[I 2023-05-23 17:51:08,020]\u001b[0m Trial 40 finished with value: 1800.4519696660864 and parameters: {'eta': 0.16761210673059446, 'max_depth': 4, 'subsample': 0.4943410198967817, 'feature_fraction': 0.8818995342733525, 'boosting': 'gbdt'}. Best is trial 34 with value: 1643.3783244356266.\u001b[0m\n", + "\u001b[32m[I 2023-05-23 17:51:11,405]\u001b[0m Trial 41 finished with value: 1613.1536991611197 and parameters: {'eta': 0.06826235287955465, 'max_depth': 5, 'subsample': 0.420359262989728, 'feature_fraction': 0.9545025235726183, 'boosting': 'gbdt'}. Best is trial 41 with value: 1613.1536991611197.\u001b[0m\n", + "\u001b[32m[I 2023-05-23 17:51:15,007]\u001b[0m Trial 42 finished with value: 1633.7762816623977 and parameters: {'eta': 0.08921427334759761, 'max_depth': 5, 'subsample': 0.41199838466683164, 'feature_fraction': 0.9623379126458551, 'boosting': 'gbdt'}. Best is trial 41 with value: 1613.1536991611197.\u001b[0m\n", + "\u001b[32m[I 2023-05-23 17:51:17,543]\u001b[0m Trial 43 finished with value: 1810.5383652519401 and parameters: {'eta': 0.09936770667792444, 'max_depth': 7, 'subsample': 0.3915607133253813, 'feature_fraction': 0.9625447105496714, 'boosting': 'gbdt'}. 
Best is trial 41 with value: 1613.1536991611197.\u001b[0m\n", + "\u001b[32m[I 2023-05-23 17:51:21,249]\u001b[0m Trial 44 finished with value: 1723.8437483773146 and parameters: {'eta': 0.0753029355861983, 'max_depth': 6, 'subsample': 0.4077169991481429, 'feature_fraction': 0.8981938660912558, 'boosting': 'gbdt'}. Best is trial 41 with value: 1613.1536991611197.\u001b[0m\n", + "\u001b[32m[I 2023-05-23 17:51:22,631]\u001b[0m Trial 45 pruned. Trial was pruned at iteration 34.\u001b[0m\n", + "\u001b[32m[I 2023-05-23 17:51:25,939]\u001b[0m Trial 46 finished with value: 1635.1324440642884 and parameters: {'eta': 0.16197824770088326, 'max_depth': 4, 'subsample': 0.42836803379593047, 'feature_fraction': 0.9647320477310738, 'boosting': 'gbdt'}. Best is trial 41 with value: 1613.1536991611197.\u001b[0m\n", + "\u001b[32m[I 2023-05-23 17:51:26,981]\u001b[0m Trial 47 pruned. Trial was pruned at iteration 20.\u001b[0m\n", + "\u001b[32m[I 2023-05-23 17:51:28,064]\u001b[0m Trial 48 pruned. Trial was pruned at iteration 20.\u001b[0m\n", + "\u001b[32m[I 2023-05-23 17:51:30,218]\u001b[0m Trial 49 finished with value: 1906.8899110262676 and parameters: {'eta': 0.48562785804696945, 'max_depth': 3, 'subsample': 0.3876885637323186, 'feature_fraction': 0.9110497473714594, 'boosting': 'gbdt'}. Best is trial 41 with value: 1613.1536991611197.\u001b[0m\n", + "\u001b[32m[I 2023-05-23 17:51:32,911]\u001b[0m Trial 50 finished with value: 1679.5954359133714 and parameters: {'eta': 0.2365664369991201, 'max_depth': 4, 'subsample': 0.46989860524557564, 'feature_fraction': 0.976052531202607, 'boosting': 'gbdt'}. Best is trial 41 with value: 1613.1536991611197.\u001b[0m\n", + "\u001b[32m[I 2023-05-23 17:51:35,118]\u001b[0m Trial 51 finished with value: 1796.2139051378654 and parameters: {'eta': 0.28422275888650295, 'max_depth': 4, 'subsample': 0.47179866254034725, 'feature_fraction': 0.9964797219502293, 'boosting': 'gbdt'}. 
Best is trial 41 with value: 1613.1536991611197.\u001b[0m\n", + "\u001b[32m[I 2023-05-23 17:51:38,377]\u001b[0m Trial 52 finished with value: 1631.3898590226456 and parameters: {'eta': 0.15533112963610093, 'max_depth': 4, 'subsample': 0.5146305131312789, 'feature_fraction': 0.9716596394449465, 'boosting': 'gbdt'}. Best is trial 41 with value: 1613.1536991611197.\u001b[0m\n", + "\u001b[32m[I 2023-05-23 17:51:39,287]\u001b[0m Trial 53 pruned. Trial was pruned at iteration 20.\u001b[0m\n", + "\u001b[32m[I 2023-05-23 17:51:42,200]\u001b[0m Trial 54 pruned. Trial was pruned at iteration 78.\u001b[0m\n", + "\u001b[32m[I 2023-05-23 17:51:43,221]\u001b[0m Trial 55 pruned. Trial was pruned at iteration 20.\u001b[0m\n", + "\u001b[32m[I 2023-05-23 17:51:46,378]\u001b[0m Trial 56 finished with value: 1641.3046530849438 and parameters: {'eta': 0.11314103393874166, 'max_depth': 4, 'subsample': 0.5618255685654434, 'feature_fraction': 0.8945115338632039, 'boosting': 'gbdt'}. Best is trial 41 with value: 1613.1536991611197.\u001b[0m\n", + "\u001b[32m[I 2023-05-23 17:51:49,135]\u001b[0m Trial 57 finished with value: 1729.3530425734123 and parameters: {'eta': 0.1568324695034597, 'max_depth': 4, 'subsample': 0.5809283469974645, 'feature_fraction': 0.8891366422359686, 'boosting': 'gbdt'}. Best is trial 41 with value: 1613.1536991611197.\u001b[0m\n", + "\u001b[32m[I 2023-05-23 17:51:50,505]\u001b[0m Trial 58 pruned. Trial was pruned at iteration 36.\u001b[0m\n", + "\u001b[32m[I 2023-05-23 17:51:51,647]\u001b[0m Trial 59 pruned. Trial was pruned at iteration 23.\u001b[0m\n", + "\u001b[32m[I 2023-05-23 17:51:52,608]\u001b[0m Trial 60 pruned. Trial was pruned at iteration 20.\u001b[0m\n", + "\u001b[32m[I 2023-05-23 17:51:56,088]\u001b[0m Trial 61 finished with value: 1652.0812659880844 and parameters: {'eta': 0.09022372778946038, 'max_depth': 5, 'subsample': 0.464081626268277, 'feature_fraction': 0.9354757486696555, 'boosting': 'gbdt'}. 
Best is trial 41 with value: 1613.1536991611197.\u001b[0m\n", + "\u001b[32m[I 2023-05-23 17:51:58,351]\u001b[0m Trial 62 finished with value: 1844.167258844327 and parameters: {'eta': 0.23618478883265465, 'max_depth': 5, 'subsample': 0.46919565092245996, 'feature_fraction': 0.9238989507559477, 'boosting': 'gbdt'}. Best is trial 41 with value: 1613.1536991611197.\u001b[0m\n", + "\u001b[32m[I 2023-05-23 17:52:01,521]\u001b[0m Trial 63 finished with value: 1669.3322564280165 and parameters: {'eta': 0.14019823582896124, 'max_depth': 4, 'subsample': 0.5040327375480566, 'feature_fraction': 0.9649276652586276, 'boosting': 'gbdt'}. Best is trial 41 with value: 1613.1536991611197.\u001b[0m\n", + "\u001b[32m[I 2023-05-23 17:52:02,741]\u001b[0m Trial 64 pruned. Trial was pruned at iteration 20.\u001b[0m\n", + "\u001b[32m[I 2023-05-23 17:52:03,802]\u001b[0m Trial 65 pruned. Trial was pruned at iteration 20.\u001b[0m\n", + "\u001b[32m[I 2023-05-23 17:52:04,763]\u001b[0m Trial 66 pruned. Trial was pruned at iteration 20.\u001b[0m\n", + "\u001b[32m[I 2023-05-23 17:52:05,793]\u001b[0m Trial 67 pruned. Trial was pruned at iteration 20.\u001b[0m\n", + "\u001b[32m[I 2023-05-23 17:52:06,761]\u001b[0m Trial 68 pruned. Trial was pruned at iteration 20.\u001b[0m\n", + "\u001b[32m[I 2023-05-23 17:52:09,679]\u001b[0m Trial 69 finished with value: 1734.0258121726613 and parameters: {'eta': 0.19870123599007436, 'max_depth': 3, 'subsample': 0.5513675458314182, 'feature_fraction': 0.9357713039042574, 'boosting': 'gbdt'}. Best is trial 41 with value: 1613.1536991611197.\u001b[0m\n", + "\u001b[32m[I 2023-05-23 17:52:10,772]\u001b[0m Trial 70 pruned. Trial was pruned at iteration 20.\u001b[0m\n", + "\u001b[32m[I 2023-05-23 17:52:13,492]\u001b[0m Trial 71 finished with value: 1673.9021412210343 and parameters: {'eta': 0.12797428338568972, 'max_depth': 4, 'subsample': 0.504138599554762, 'feature_fraction': 0.976043129513024, 'boosting': 'gbdt'}. 
Best is trial 41 with value: 1613.1536991611197.\u001b[0m\n", + "\u001b[32m[I 2023-05-23 17:52:16,614]\u001b[0m Trial 72 finished with value: 1756.8449184912236 and parameters: {'eta': 0.145302530074595, 'max_depth': 5, 'subsample': 0.4854274605664581, 'feature_fraction': 0.9562371020572519, 'boosting': 'gbdt'}. Best is trial 41 with value: 1613.1536991611197.\u001b[0m\n", + "\u001b[32m[I 2023-05-23 17:52:17,854]\u001b[0m Trial 73 pruned. Trial was pruned at iteration 20.\u001b[0m\n", + "\u001b[32m[I 2023-05-23 17:52:21,281]\u001b[0m Trial 74 finished with value: 1654.0924045327563 and parameters: {'eta': 0.11730201675330273, 'max_depth': 4, 'subsample': 0.5120221945493588, 'feature_fraction': 0.9305843991144697, 'boosting': 'gbdt'}. Best is trial 41 with value: 1613.1536991611197.\u001b[0m\n", + "\u001b[32m[I 2023-05-23 17:52:22,315]\u001b[0m Trial 75 pruned. Trial was pruned at iteration 20.\u001b[0m\n", + "\u001b[32m[I 2023-05-23 17:52:23,349]\u001b[0m Trial 76 pruned. Trial was pruned at iteration 20.\u001b[0m\n", + "\u001b[32m[I 2023-05-23 17:52:25,375]\u001b[0m Trial 77 pruned. Trial was pruned at iteration 57.\u001b[0m\n", + "\u001b[32m[I 2023-05-23 17:52:28,578]\u001b[0m Trial 78 finished with value: 1687.9019611596116 and parameters: {'eta': 0.211987643138902, 'max_depth': 3, 'subsample': 0.37878079226941086, 'feature_fraction': 0.9246544597540306, 'boosting': 'gbdt'}. Best is trial 41 with value: 1613.1536991611197.\u001b[0m\n", + "\u001b[32m[I 2023-05-23 17:52:29,956]\u001b[0m Trial 79 pruned. Trial was pruned at iteration 20.\u001b[0m\n", + "\u001b[32m[I 2023-05-23 17:52:31,058]\u001b[0m Trial 80 pruned. Trial was pruned at iteration 20.\u001b[0m\n", + "\u001b[32m[I 2023-05-23 17:52:34,228]\u001b[0m Trial 81 finished with value: 1621.7059193985976 and parameters: {'eta': 0.1281061906912468, 'max_depth': 4, 'subsample': 0.49692426841862347, 'feature_fraction': 0.9526476982117121, 'boosting': 'gbdt'}. 
Best is trial 41 with value: 1613.1536991611197.\u001b[0m\n", + "\u001b[32m[I 2023-05-23 17:52:35,166]\u001b[0m Trial 82 pruned. Trial was pruned at iteration 20.\u001b[0m\n", + "\u001b[32m[I 2023-05-23 17:52:36,939]\u001b[0m Trial 83 pruned. Trial was pruned at iteration 45.\u001b[0m\n", + "\u001b[32m[I 2023-05-23 17:52:40,006]\u001b[0m Trial 84 finished with value: 1710.95554582482 and parameters: {'eta': 0.3103534643704688, 'max_depth': 3, 'subsample': 0.5183898062284562, 'feature_fraction': 0.9808275740627064, 'boosting': 'gbdt'}. Best is trial 41 with value: 1613.1536991611197.\u001b[0m\n", + "\u001b[32m[I 2023-05-23 17:52:42,976]\u001b[0m Trial 85 finished with value: 1649.7344422552815 and parameters: {'eta': 0.12550933264460645, 'max_depth': 4, 'subsample': 0.43755380095124585, 'feature_fraction': 0.9531127896417276, 'boosting': 'gbdt'}. Best is trial 41 with value: 1613.1536991611197.\u001b[0m\n", + "\u001b[32m[I 2023-05-23 17:52:43,964]\u001b[0m Trial 86 pruned. Trial was pruned at iteration 20.\u001b[0m\n", + "\u001b[32m[I 2023-05-23 17:52:45,208]\u001b[0m Trial 87 pruned. Trial was pruned at iteration 30.\u001b[0m\n", + "\u001b[32m[I 2023-05-23 17:52:46,252]\u001b[0m Trial 88 pruned. Trial was pruned at iteration 20.\u001b[0m\n", + "\u001b[32m[I 2023-05-23 17:52:47,333]\u001b[0m Trial 89 pruned. Trial was pruned at iteration 20.\u001b[0m\n", + "\u001b[32m[I 2023-05-23 17:52:48,385]\u001b[0m Trial 90 pruned. Trial was pruned at iteration 20.\u001b[0m\n", + "\u001b[32m[I 2023-05-23 17:52:51,433]\u001b[0m Trial 91 finished with value: 1669.1807506406974 and parameters: {'eta': 0.12972117193589527, 'max_depth': 4, 'subsample': 0.4820281670079518, 'feature_fraction': 0.9280405365663502, 'boosting': 'gbdt'}. 
Best is trial 41 with value: 1613.1536991611197.\u001b[0m\n", + "\u001b[32m[I 2023-05-23 17:52:53,977]\u001b[0m Trial 92 finished with value: 1644.3213417057705 and parameters: {'eta': 0.10335423983688893, 'max_depth': 4, 'subsample': 0.4955795867451968, 'feature_fraction': 0.9769771500848161, 'boosting': 'gbdt'}. Best is trial 41 with value: 1613.1536991611197.\u001b[0m\n", + "\u001b[32m[I 2023-05-23 17:52:57,387]\u001b[0m Trial 93 finished with value: 1644.1687325991254 and parameters: {'eta': 0.09462798289505757, 'max_depth': 5, 'subsample': 0.4778247837643533, 'feature_fraction': 0.9815195363465108, 'boosting': 'gbdt'}. Best is trial 41 with value: 1613.1536991611197.\u001b[0m\n", + "\u001b[32m[I 2023-05-23 17:53:00,588]\u001b[0m Trial 94 finished with value: 1645.0215990881638 and parameters: {'eta': 0.18272783495553896, 'max_depth': 4, 'subsample': 0.4092495638927149, 'feature_fraction': 0.9796954515130953, 'boosting': 'gbdt'}. Best is trial 41 with value: 1613.1536991611197.\u001b[0m\n", + "\u001b[32m[I 2023-05-23 17:53:03,613]\u001b[0m Trial 95 finished with value: 1653.4181130088684 and parameters: {'eta': 0.2718209965748566, 'max_depth': 3, 'subsample': 0.4901093507016783, 'feature_fraction': 0.9837081935210223, 'boosting': 'gbdt'}. Best is trial 41 with value: 1613.1536991611197.\u001b[0m\n", + "\u001b[32m[I 2023-05-23 17:53:05,852]\u001b[0m Trial 96 pruned. Trial was pruned at iteration 59.\u001b[0m\n", + "\u001b[32m[I 2023-05-23 17:53:06,809]\u001b[0m Trial 97 pruned. Trial was pruned at iteration 20.\u001b[0m\n", + "\u001b[32m[I 2023-05-23 17:53:08,638]\u001b[0m Trial 98 pruned. Trial was pruned at iteration 54.\u001b[0m\n", + "\u001b[32m[I 2023-05-23 17:53:09,675]\u001b[0m Trial 99 pruned. 
Trial was pruned at iteration 20.\u001b[0m\n", + "\u001b[32m[I 2023-05-23 17:53:13,658]\u001b[0m Trial 100 finished with value: 1718.887515642936 and parameters: {'eta': 0.1825819234671453, 'max_depth': 4, 'subsample': 0.3438060707358511, 'feature_fraction': 0.9493912434499308, 'boosting': 'gbdt'}. Best is trial 41 with value: 1613.1536991611197.\u001b[0m\n", + "\u001b[32m[I 2023-05-23 17:53:17,634]\u001b[0m Trial 101 finished with value: 1620.1600042413713 and parameters: {'eta': 0.11393636928116825, 'max_depth': 4, 'subsample': 0.43748478299369964, 'feature_fraction': 0.9566187556238437, 'boosting': 'gbdt'}. Best is trial 41 with value: 1613.1536991611197.\u001b[0m\n", + "\u001b[32m[I 2023-05-23 17:53:18,722]\u001b[0m Trial 102 pruned. Trial was pruned at iteration 20.\u001b[0m\n", + "\u001b[32m[I 2023-05-23 17:53:20,909]\u001b[0m Trial 103 pruned. Trial was pruned at iteration 59.\u001b[0m\n", + "\u001b[32m[I 2023-05-23 17:53:24,141]\u001b[0m Trial 104 finished with value: 1643.828663400553 and parameters: {'eta': 0.09978817449723257, 'max_depth': 5, 'subsample': 0.40770827862747494, 'feature_fraction': 0.9821094051425081, 'boosting': 'gbdt'}. Best is trial 41 with value: 1613.1536991611197.\u001b[0m\n", + "\u001b[32m[I 2023-05-23 17:53:25,168]\u001b[0m Trial 105 pruned. Trial was pruned at iteration 20.\u001b[0m\n", + "\u001b[32m[I 2023-05-23 17:53:26,191]\u001b[0m Trial 106 pruned. Trial was pruned at iteration 20.\u001b[0m\n", + "\u001b[32m[I 2023-05-23 17:53:27,386]\u001b[0m Trial 107 pruned. Trial was pruned at iteration 25.\u001b[0m\n", + "\u001b[32m[I 2023-05-23 17:53:28,536]\u001b[0m Trial 108 pruned. Trial was pruned at iteration 20.\u001b[0m\n", + "\u001b[32m[I 2023-05-23 17:53:29,437]\u001b[0m Trial 109 pruned. Trial was pruned at iteration 20.\u001b[0m\n", + "\u001b[32m[I 2023-05-23 17:53:30,444]\u001b[0m Trial 110 pruned. 
Trial was pruned at iteration 20.\u001b[0m\n", + "\u001b[32m[I 2023-05-23 17:53:33,864]\u001b[0m Trial 111 finished with value: 1636.9327834198057 and parameters: {'eta': 0.15628335040331998, 'max_depth': 4, 'subsample': 0.4141697969132473, 'feature_fraction': 0.9789443795063648, 'boosting': 'gbdt'}. Best is trial 41 with value: 1613.1536991611197.\u001b[0m\n", + "\u001b[32m[I 2023-05-23 17:53:35,880]\u001b[0m Trial 112 pruned. Trial was pruned at iteration 53.\u001b[0m\n", + "\u001b[32m[I 2023-05-23 17:53:39,048]\u001b[0m Trial 113 finished with value: 1666.3609944712393 and parameters: {'eta': 0.14277802397471012, 'max_depth': 4, 'subsample': 0.442274170153352, 'feature_fraction': 0.9648296704923597, 'boosting': 'gbdt'}. Best is trial 41 with value: 1613.1536991611197.\u001b[0m\n", + "\u001b[32m[I 2023-05-23 17:53:42,389]\u001b[0m Trial 114 finished with value: 1629.887737082267 and parameters: {'eta': 0.10688436516079522, 'max_depth': 4, 'subsample': 0.4161239936077784, 'feature_fraction': 0.987208383633686, 'boosting': 'gbdt'}. Best is trial 41 with value: 1613.1536991611197.\u001b[0m\n", + "\u001b[32m[I 2023-05-23 17:53:45,548]\u001b[0m Trial 115 finished with value: 1602.071680743745 and parameters: {'eta': 0.07596366590755924, 'max_depth': 5, 'subsample': 0.40986478186150205, 'feature_fraction': 0.9985802759008446, 'boosting': 'gbdt'}. Best is trial 115 with value: 1602.071680743745.\u001b[0m\n", + "\u001b[32m[I 2023-05-23 17:53:46,445]\u001b[0m Trial 116 pruned. Trial was pruned at iteration 20.\u001b[0m\n", + "\u001b[32m[I 2023-05-23 17:53:47,374]\u001b[0m Trial 117 pruned. Trial was pruned at iteration 20.\u001b[0m\n", + "\u001b[32m[I 2023-05-23 17:53:48,486]\u001b[0m Trial 118 pruned. Trial was pruned at iteration 20.\u001b[0m\n", + "\u001b[32m[I 2023-05-23 17:53:51,180]\u001b[0m Trial 119 pruned. Trial was pruned at iteration 53.\u001b[0m\n", + "\u001b[32m[I 2023-05-23 17:53:52,187]\u001b[0m Trial 120 pruned. 
Trial was pruned at iteration 20.\u001b[0m\n", + "\u001b[32m[I 2023-05-23 17:53:55,613]\u001b[0m Trial 121 finished with value: 1619.0276106717215 and parameters: {'eta': 0.07973977811944279, 'max_depth': 5, 'subsample': 0.44043347488361556, 'feature_fraction': 0.9848734325802573, 'boosting': 'gbdt'}. Best is trial 115 with value: 1602.071680743745.\u001b[0m\n", + "\u001b[32m[I 2023-05-23 17:53:59,320]\u001b[0m Trial 122 finished with value: 1629.854960635958 and parameters: {'eta': 0.0798107407342345, 'max_depth': 5, 'subsample': 0.44115896977836355, 'feature_fraction': 0.9990389448350798, 'boosting': 'gbdt'}. Best is trial 115 with value: 1602.071680743745.\u001b[0m\n", + "\u001b[32m[I 2023-05-23 17:54:00,421]\u001b[0m Trial 123 pruned. Trial was pruned at iteration 20.\u001b[0m\n", + "\u001b[32m[I 2023-05-23 17:54:01,589]\u001b[0m Trial 124 pruned. Trial was pruned at iteration 20.\u001b[0m\n", + "\u001b[32m[I 2023-05-23 17:54:03,040]\u001b[0m Trial 125 pruned. Trial was pruned at iteration 20.\u001b[0m\n", + "\u001b[32m[I 2023-05-23 17:54:04,524]\u001b[0m Trial 126 pruned. Trial was pruned at iteration 25.\u001b[0m\n", + "\u001b[32m[I 2023-05-23 17:54:07,653]\u001b[0m Trial 127 finished with value: 1626.324531133902 and parameters: {'eta': 0.12870518065106204, 'max_depth': 4, 'subsample': 0.378397726390322, 'feature_fraction': 0.9869622583067622, 'boosting': 'gbdt'}. Best is trial 115 with value: 1602.071680743745.\u001b[0m\n", + "\u001b[32m[I 2023-05-23 17:54:10,710]\u001b[0m Trial 128 finished with value: 1664.5592298730244 and parameters: {'eta': 0.2460575893891931, 'max_depth': 3, 'subsample': 0.3855893412559655, 'feature_fraction': 0.9863190039049687, 'boosting': 'gbdt'}. 
Best is trial 115 with value: 1602.071680743745.\u001b[0m\n", + "\u001b[32m[I 2023-05-23 17:54:13,732]\u001b[0m Trial 129 finished with value: 1629.2398585973865 and parameters: {'eta': 0.12621829820388744, 'max_depth': 4, 'subsample': 0.34244545316590425, 'feature_fraction': 0.9509808909504863, 'boosting': 'gbdt'}. Best is trial 115 with value: 1602.071680743745.\u001b[0m\n", + "\u001b[32m[I 2023-05-23 17:54:16,670]\u001b[0m Trial 130 finished with value: 1667.4863602629612 and parameters: {'eta': 0.20579778366142507, 'max_depth': 4, 'subsample': 0.3680384648931427, 'feature_fraction': 0.9543965285212962, 'boosting': 'gbdt'}. Best is trial 115 with value: 1602.071680743745.\u001b[0m\n", + "\u001b[32m[I 2023-05-23 17:54:20,130]\u001b[0m Trial 131 finished with value: 1633.2008555813288 and parameters: {'eta': 0.12361764760018717, 'max_depth': 4, 'subsample': 0.3295734098616875, 'feature_fraction': 0.9710973360557045, 'boosting': 'gbdt'}. Best is trial 115 with value: 1602.071680743745.\u001b[0m\n", + "\u001b[32m[I 2023-05-23 17:54:23,158]\u001b[0m Trial 132 finished with value: 1612.2962393635255 and parameters: {'eta': 0.1377234159143613, 'max_depth': 4, 'subsample': 0.3408204013726096, 'feature_fraction': 0.973446528162137, 'boosting': 'gbdt'}. 
Best is trial 115 with value: 1602.071680743745.\u001b[0m\n", + "\n", + "Hyper-Parameter Optimization successfully finished.\n", + " Number of finished trials: 133\n", + " Best trial:\n", + " Value: 1602.071680743745\n", + " Params: \n", + " eta: 0.07596366590755924\n", + " max_depth: 5\n", + " subsample: 0.40986478186150205\n", + " feature_fraction: 0.9985802759008446\n", + " boosting: gbdt\n", + " opt_rounds: 100\n" + ] + } + ], + "source": [ + "# Any LightGBM hyperparameter can be tuned, where the structure of the parameter dictionary needs to be as follows:\n", + "\n", + " # Float/Int sample_type\n", + " # {\"param_name\": [\"sample_type\", {\"low\": low, \"high\": high, \"log\": log}]}\n", + " # sample_type: str, Type of sampling, e.g., \"float\" or \"int\"\n", + " # low: int or float, Lower endpoint of the range of suggested values\n", + " # high: int or float, Upper endpoint of the range of suggested values\n", + " # log: bool, Flag to sample the value from the log domain or not\n", + " # Example: {\"eta\": [\"float\", {\"low\": 1e-5, \"high\": 1, \"log\": True}]}\n", + "\n", + " # Categorical sample_type\n", + " # {\"param_name\": [\"sample_type\", [\"choice1\", \"choice2\", \"choice3\", \"...\"]]}\n", + " # sample_type: str, Type of sampling, i.e., \"categorical\"\n", + " # choice1, choice2, choice3, ...: str, Possible choices for the parameter\n", + " # Example: {\"boosting\": [\"categorical\", [\"gbdt\", \"dart\"]]}\n", + "\n", + " # For parameters without tunable choice (this is needed, e.g., if device_type = \"gpu\" and gpu_device_id needs to be specified)\n", + " # {\"param_name\": [\"none\", [value]]}\n", + " # param_name: str, Name of the parameter\n", + " # value: int, Value of the parameter\n", + " # Example: {\"gpu_device_id\": [\"none\", [0]]}\n", + "\n", + "param_dict = {\n", + " \"eta\": [\"float\", {\"low\": 1e-5, \"high\": 1, \"log\": True}],\n", + " \"max_depth\": [\"int\", {\"low\": 1, \"high\": 10, \"log\": False}],\n", + " \"subsample\": [\"float\", {\"low\": 0.2, \"high\": 1.0, \"log\": False}],\n", + " 
\"feature_fraction\": [\"float\", {\"low\": 0.2, \"high\": 1.0, \"log\": False}],\n", + " \"boosting\": [\"categorical\", [\"gbdt\"]],\n", + "}\n", + "\n", + "np.random.seed(123)\n", + "opt_param = lgblss.hyper_opt(param_dict,\n", + " dtrain,\n", + " num_boost_round=100, # Number of boosting iterations.\n", + " nfold=5, # Number of cv-folds.\n", + " early_stopping_rounds=20, # Number of early-stopping rounds.\n", + " max_minutes=5, # Time budget in minutes, i.e., stop study after the given number of minutes.\n", + " n_trials=None, # The number of trials. If this argument is set to None, there is no limitation on the number of trials.\n", + " silence=False, # Controls the verbosity of the trial, i.e., user can silence the outputs of the trial.\n", + " seed=123, # Seed used to generate cv-folds.\n", + " hp_seed=None # Seed for random number generator used in the Bayesian hyperparameter search.\n", + " )" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Model Training" + ] + }, + { + "cell_type": "code", + "execution_count": 5, + "metadata": { + "ExecuteTime": { + "end_time": "2023-05-18T06:48:10.249194200Z", + "start_time": "2023-05-18T06:48:09.343434700Z" + } + }, + "outputs": [ + { + "data": { + "text/plain": [ + "" + ] + }, + "execution_count": 5, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "np.random.seed(123)\n", + "\n", + "opt_params = opt_param.copy()\n", + "n_rounds = opt_params[\"opt_rounds\"]\n", + "del opt_params[\"opt_rounds\"]\n", + "\n", + "# Train Model with optimized hyperparameters\n", + "lgblss.train(opt_params,\n", + " dtrain,\n", + " num_boost_round=n_rounds\n", + " )" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Prediction" + ] + }, + { + "cell_type": "code", + "execution_count": 6, + "metadata": { + "ExecuteTime": { + "end_time": "2023-05-18T06:48:10.905030600Z", + "start_time": "2023-05-18T06:48:10.254180800Z" + } + }, + "outputs": [], + "source": [ + "# 
Set seed for reproducibility\n", + "torch.manual_seed(123)\n", + "\n", + "# Number of samples to draw from predicted distribution\n", + "n_samples = 1000\n", + "quant_sel = [0.05, 0.95] # Quantiles to calculate from predicted distribution\n", + "\n", + "# Sample from predicted distribution\n", + "pred_samples = lgblss.predict(X_test,\n", + " pred_type=\"samples\",\n", + " n_samples=n_samples,\n", + " seed=123)\n", + "\n", + "# Calculate quantiles from predicted distribution\n", + "pred_quantiles = lgblss.predict(X_test,\n", + " pred_type=\"quantiles\",\n", + " n_samples=n_samples,\n", + " quantiles=quant_sel)\n", + "\n", + "# Returns predicted distributional parameters\n", + "pred_params = lgblss.predict(X_test,\n", + " pred_type=\"parameters\")" + ] + }, + { + "cell_type": "code", + "execution_count": 7, + "metadata": { + "ExecuteTime": { + "end_time": "2023-05-18T06:48:10.961878800Z", + "start_time": "2023-05-18T06:48:10.906028300Z" + } + }, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
y_sample0y_sample1y_sample2y_sample3y_sample4y_sample5y_sample6y_sample7y_sample8y_sample9...y_sample990y_sample991y_sample992y_sample993y_sample994y_sample995y_sample996y_sample997y_sample998y_sample999
029253831423036232129...24393838212227272627
127183823212717363323...25263620273121212017
21721131914202316149...14141421131112191320
371310991110989...713117811108119
46967347637...56106655697
\n", + "

5 rows × 1000 columns

\n", + "
" + ], + "text/plain": [ + " y_sample0 y_sample1 y_sample2 y_sample3 y_sample4 y_sample5 \n", + "0 29 25 38 31 42 30 \\\n", + "1 27 18 38 23 21 27 \n", + "2 17 21 13 19 14 20 \n", + "3 7 13 10 9 9 11 \n", + "4 6 9 6 7 3 4 \n", + "\n", + " y_sample6 y_sample7 y_sample8 y_sample9 ... y_sample990 y_sample991 \n", + "0 36 23 21 29 ... 24 39 \\\n", + "1 17 36 33 23 ... 25 26 \n", + "2 23 16 14 9 ... 14 14 \n", + "3 10 9 8 9 ... 7 13 \n", + "4 7 6 3 7 ... 5 6 \n", + "\n", + " y_sample992 y_sample993 y_sample994 y_sample995 y_sample996 \n", + "0 38 38 21 22 27 \\\n", + "1 36 20 27 31 21 \n", + "2 14 21 13 11 12 \n", + "3 11 7 8 11 10 \n", + "4 10 6 6 5 5 \n", + "\n", + " y_sample997 y_sample998 y_sample999 \n", + "0 27 26 27 \n", + "1 21 20 17 \n", + "2 19 13 20 \n", + "3 8 11 9 \n", + "4 6 9 7 \n", + "\n", + "[5 rows x 1000 columns]" + ] + }, + "execution_count": 7, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "pred_samples.head()" + ] + }, + { + "cell_type": "code", + "execution_count": 8, + "metadata": { + "ExecuteTime": { + "end_time": "2023-05-18T06:48:10.994790600Z", + "start_time": "2023-05-18T06:48:10.937943900Z" + } + }, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
quant_0.05quant_0.95
02240
11734
21124
3514
4311
\n", + "
" + ], + "text/plain": [ + " quant_0.05 quant_0.95\n", + "0 22 40\n", + "1 17 34\n", + "2 11 24\n", + "3 5 14\n", + "4 3 11" + ] + }, + "execution_count": 8, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "pred_quantiles.head()" + ] + }, + { + "cell_type": "code", + "execution_count": 9, + "metadata": { + "ExecuteTime": { + "end_time": "2023-05-18T06:48:10.994790600Z", + "start_time": "2023-05-18T06:48:10.955897700Z" + } + }, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
rate
031.458658
125.565571
216.973503
39.111122
46.517088
\n", + "
" + ], + "text/plain": [ + " rate\n", + "0 31.458658\n", + "1 25.565571\n", + "2 16.973503\n", + "3 9.111122\n", + "4 6.517088" + ] + }, + "execution_count": 9, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "pred_params.head()" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "collapsed": false, + "jupyter": { + "outputs_hidden": false + } + }, + "source": [ + "# SHAP Interpretability" + ] + }, + { + "cell_type": "code", + "execution_count": 10, + "metadata": { + "ExecuteTime": { + "end_time": "2023-05-18T06:48:11.803053500Z", + "start_time": "2023-05-18T06:48:10.974844500Z" + }, + "collapsed": false, + "jupyter": { + "outputs_hidden": false + } + }, + "outputs": [ + { + "data": { + "text/html": [ + "
" + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "image/png": "", + "text/plain": [ + "
" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "# Feature Importance of rate parameter\n", + "lgblss.plot(X_test,\n", + " parameter=\"rate\",\n", + " plot_type=\"Feature_Importance\")" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "collapsed": false, + "jupyter": { + "outputs_hidden": false + } + }, + "source": [ + "# Plot Predicted vs. True" + ] + }, + { + "cell_type": "code", + "execution_count": 11, + "metadata": { + "ExecuteTime": { + "end_time": "2023-05-18T06:48:12.022498800Z", + "start_time": "2023-05-18T06:48:11.803053500Z" + }, + "collapsed": false, + "jupyter": { + "outputs_hidden": false + } + }, + "outputs": [ + { + "data": { + "image/png": "", + "text/plain": [ + "
" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "plt.scatter(pred_params[\"rate\"], y_test)\n", + "plt.title(\"Predicted vs. True\")\n", + "plt.show()" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3 (ipykernel)", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.9.0" + } + }, + "nbformat": 4, + "nbformat_minor": 4 +}