import pandas as pd
import numpy as np
from sklearn.linear_model import Lasso
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import RobustScaler
from sklearn.base import BaseEstimator, TransformerMixin, RegressorMixin, clone
from sklearn.model_selection import KFold, cross_val_score, train_test_split
import xgboost as xgb
import lightgbm as lgb
## Train and predict with a stacking approach; the final submission file is stacking_submit.csv
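# Pipeline overview (as implemented below):
#   1. Load the training / test sets and split the features from the FLAG label.
#   2. Score each single model (Lasso, GradientBoosting, XGBoost, LightGBM) with 5-fold CV AUC.
#   3. Stack the three tree models via out-of-fold predictions, with Lasso as the meta-model.
#   4. Predict on the test set and write the result to stacking_submit.csv.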
all_train = pd.read_csv('all_train.csv', sep='\t')
test_set = pd.read_csv('test_set.csv', sep='\t')
result_name = test_set[['USRID']].copy()  # keep USRID for the submission file
train = all_train.drop(['USRID', 'FLAG'], axis=1)
y_train = all_train['FLAG'].values
test = test_set.drop(['USRID'], axis=1)
# Offline cross-validation helper: returns the per-fold ROC-AUC scores.
# Pass the KFold object itself to cross_val_score so that shuffle/random_state take effect.
n_folds = 5
def auc_cv(model):
    kf = KFold(n_splits=n_folds, shuffle=True, random_state=42)
    auc = cross_val_score(model, train.values, y_train, scoring="roc_auc", cv=kf)
    return auc
lasso = make_pipeline(RobustScaler(), Lasso(max_iter=1000, alpha=0.0005, fit_intercept=True, random_state=1))
GBoost = GradientBoostingRegressor(n_estimators=500, learning_rate=0.01,
                                   max_depth=18, max_features='sqrt',
                                   min_samples_leaf=16, min_samples_split=10,
                                   random_state=5)
model_xgb = xgb.XGBRegressor(colsample_bytree=0.9, objective='binary:logistic',
                             learning_rate=0.02, max_depth=6, eval_metric='auc',
                             min_child_weight=10, n_estimators=842,
                             subsample=0.7, silent=1,
                             random_state=0, nthread=-1)
model_lgb = lgb.LGBMRegressor(objective='binary', metric='auc', num_leaves=35,
                              learning_rate=0.01, n_estimators=842,
                              max_bin=55, bagging_fraction=0.8,
                              bagging_freq=3, feature_fraction=0.9,
                              feature_fraction_seed=9, bagging_seed=9,
                              min_data_in_leaf=370, min_sum_hessian_in_leaf=11)
# Offline CV scores of the single models
score_lasso = auc_cv(lasso)
print("\nLasso score: {:.4f} ({:.4f})\n".format(score_lasso.mean(), score_lasso.std()))
score_GBoost = auc_cv(GBoost)
print("Gradient Boosting score: {:.4f} ({:.4f})\n".format(score_GBoost.mean(), score_GBoost.std()))
score_lgb = auc_cv(model_lgb)
print("LightGBM score: {:.4f} ({:.4f})\n".format(score_lgb.mean(), score_lgb.std()))
score_xgb = auc_cv(model_xgb)
print("XGBoost score: {:.4f} ({:.4f})\n".format(score_xgb.mean(), score_xgb.std()))
## Define the stacking class
class StackingAveragedModels(BaseEstimator, RegressorMixin, TransformerMixin):
    def __init__(self, base_models, meta_model, n_folds=5):
        self.base_models = base_models
        self.meta_model = meta_model
        self.n_folds = n_folds

    def fit(self, X, y):
        self.base_models_ = [list() for x in self.base_models]
        self.meta_model_ = clone(self.meta_model)
        kfold = KFold(n_splits=self.n_folds, shuffle=True, random_state=156)
        out_of_fold_predictions = np.zeros((X.shape[0], len(self.base_models)))  # initialize the out-of-fold prediction matrix
        for i, model in enumerate(self.base_models):
            for train_index, holdout_index in kfold.split(X, y):
                instance = clone(model)
                self.base_models_[i].append(instance)  # with n_folds-fold CV, each base model gets n_folds fitted instances
                instance.fit(X[train_index], y[train_index])
                y_pred = instance.predict(X[holdout_index])
                out_of_fold_predictions[holdout_index, i] = y_pred
        self.meta_model_.fit(out_of_fold_predictions, y)
        return self

    def predict(self, X):
        meta_features = np.column_stack([
            np.column_stack([model.predict(X) for model in base_models]).mean(axis=1)
            for base_models in self.base_models_])
        return self.meta_model_.predict(meta_features)
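# Note on the class above: out_of_fold_predictions has shape (n_samples, n_base_models);
# column i holds the out-of-fold predictions of base model i, and the meta-model is trained
# on that matrix. At predict time, the n_folds fitted instances of each base model are
# averaged before being fed to the meta-model.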
stacked_averaged_models = StackingAveragedModels(base_models=(GBoost, model_xgb, model_lgb),
                                                 meta_model=lasso)
# Offline CV score of the stacking model
score = auc_cv(stacked_averaged_models)
print("Stacking Averaged models score: {:.4f} ({:.4f})".format(score.mean(), score.std()))
stacked_averaged_models.fit(train.values, y_train)
stacked_pred = stacked_averaged_models.predict(test.values)
result_name['RST'] = stacked_pred
result_name.to_csv('stacking_submit.csv', index=False, sep='\t')
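# Optional sanity check (a minimal sketch; it only assumes the submission file written just above):
# read the file back and confirm there is one RST score per USRID row.
submit = pd.read_csv('stacking_submit.csv', sep='\t')
print(submit.shape)
print(submit['RST'].describe())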