-
Notifications
You must be signed in to change notification settings - Fork 0
/
xgboost_exec.py
100 lines (74 loc) · 3.29 KB
/
xgboost_exec.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
import pickle
import time

import numpy as np
import xgboost as xgb
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import GridSearchCV
# Random seed passed to every XGBRegressor created in this script.
SEED = 500

# Load data from pickle. The pickled object is unpacked as a 4-tuple:
# (X_train, y_train, X_test, y_test).
# NOTE(security): pickle.load can execute arbitrary code embedded in the file;
# only load pickles that come from a trusted source.
with open(r"../data-x-li-data/df_merged_train_test_05p.pickle", "rb") as input_file:
    X_train, y_train, X_test, y_test = pickle.load(input_file)
# Instantiate a xgb.XGBRegressor with hand-picked hyperparameters.
# Alternative, much smaller configuration kept for reference:
# gbm0 = xgb.XGBRegressor(n_estimators=50, learning_rate=0.1,
#                         objective='reg:squarederror', random_state=SEED)
gbm0 = xgb.XGBRegressor(
    n_estimators=14000,
    learning_rate=0.28,
    max_depth=8,
    objective='reg:squarederror',
    random_state=SEED,  # documented scikit-learn-API name for the seed
)

# Fit XGBoost through its scikit-learn estimator interface.
gbm0.fit(X_train, y_train)

# Predict the test set labels 'y_pred0'.
y_pred0 = gbm0.predict(X_test)

# Evaluate the test set RMSE. The `squared=False` argument of
# mean_squared_error was deprecated in scikit-learn 1.4 and removed in 1.6,
# so take the square root explicitly.
# NOTE(review): identical to the old result for a single-target regression;
# confirm y_test is 1-D / single-column.
rmse_test0 = np.sqrt(mean_squared_error(y_test, y_pred0))
print(rmse_test0)
# Evaluate using MAPE
def mean_absolute_percentage_error(y_true, y_pred):
    """Return the mean absolute percentage error (MAPE), in percent.

    Computes ``mean(|(y_true - y_pred) / y_true|) * 100`` over all elements.

    Args:
        y_true: Ground-truth target values (any array-like).
        y_pred: Predicted values with the same number of elements as
            ``y_true``.

    Returns:
        The MAPE as a float percentage (``10.0`` means 10 %).

    Note:
        Entries where ``y_true`` is zero make the result ``inf`` or ``nan``,
        because each error is expressed relative to ``y_true``.
    """
    # Flatten both inputs to 1-D float arrays so that plain lists work and a
    # column vector of shape (n, 1) is compared element by element with a
    # flat prediction array instead of broadcasting to an (n, n) matrix.
    y_true = np.asarray(y_true, dtype=float).ravel()
    y_pred = np.asarray(y_pred, dtype=float).ravel()
    return np.mean(np.abs((y_true - y_pred) / y_true)) * 100
# Evaluate the test set MAPE (in percent) for the gbm0 predictions
MAPE_test0 = mean_absolute_percentage_error(y_true=y_test, y_pred=y_pred0)
print(MAPE_test0)
##########################
#### Grid optimization ###
##########################
# Hyperparameter search space: every combination of the listed values is tried
param_grid = dict(
    # alias eta: step size shrinkage used in each update to prevent overfitting
    learning_rate=[0.1, 0.2, 0.3],
    n_estimators=[10000, 15000, 20000],
    # subsample ratio of the training instances
    subsample=[1],
    max_depth=[6, 8, 10, 12],
    # subsample ratio of columns when constructing each tree; the
    # subsampling occurs once for every tree constructed
    colsample_bytree=[1],
)
# Instantiate the base XGBRegressor whose hyperparameters are searched.
# random_state is the documented scikit-learn-API name for the seed.
gbm = xgb.XGBRegressor(random_state=SEED, objective='reg:squarederror')

# Grid search over param_grid with 3-fold cross-validation.
# The scorer is the *negated* MSE (larger is better), so best_score_ will be
# a non-positive number. n_jobs=-1 asks for all available processors.
grid_mse = GridSearchCV(
    estimator=gbm,
    param_grid=param_grid,
    scoring='neg_mean_squared_error',
    cv=3,
    verbose=1,
    n_jobs=-1,
)
# Fit GridSearchCV and record how long the whole search takes.
tic = time.perf_counter()  # begin timing
grid_mse.fit(X_train, y_train)
time_fit_cv = time.perf_counter() - tic  # elapsed seconds for the search

print("Best parameters found: ", grid_mse.best_params_)
# best_score_ is a negated MSE; abs + sqrt converts it back to an RMSE.
print("Lowest RMSE found: ", np.sqrt(np.abs(grid_mse.best_score_)))

# Extract the estimator selected by the search.
gbm_ins = grid_mse.best_estimator_

# Predict the test set labels 'y_pred'.
y_pred = gbm_ins.predict(X_test)

# Evaluate the test set RMSE. The `squared=False` argument of
# mean_squared_error was deprecated in scikit-learn 1.4 and removed in 1.6,
# so take the square root explicitly.
# NOTE(review): identical to the old result for a single-target regression;
# confirm y_test is 1-D / single-column.
rmse_test = np.sqrt(mean_squared_error(y_test, y_pred))
print(rmse_test)
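# NOTE(review): everything below is commented-out code that references names
# never defined in this script (`grid`, `lgbm_list`, `FIMP_list`,
# `X_train_valid`, ...); it appears to be left over from a different
# experiment and cannot be re-enabled as-is.
#best_fit_order = np.argpartition(grid['MAPE'], 1)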
#best_fit_no = np.argmin(grid['MAPE'])
#sec_best_fit_no = best_fit_order[1]
#low_MAPE = grid.iloc[best_fit_no]['MAPE']
#best_params = grid.iloc[best_fit_no].drop('MAPE').drop('FIMP').astype(str).to_dict()
#train_size = X_train_valid.shape[0]
#columns = str(X_train_valid.columns.values)
#fimp = grid.iloc[best_fit_no]['FIMP']
#y_pred = lgbm_list[best_fit_no].predict(X_test)
#MAPE_test_set = mean_absolute_percentage_error(y_test, y_pred)
#sec_best_params = grid.iloc[sec_best_fit_no].drop('MAPE').drop('FIMP').astype(str).to_dict()
#y_pred = lgbm_list[sec_best_fit_no].predict(X_test_valid)
#sec_best_MAPE = mean_absolute_percentage_error(y_test_valid, y_pred)
#y_pred = lgbm_list[best_fit_no].predict(X_train_valid)
#MAPE_train_set = mean_absolute_percentage_error(y_train_valid, y_pred)
#print(low_MAPE)
#print(best_params)
#print(FIMP_list[best_fit_no])