# example_model_advanced.py
# Forked from numerai/example-scripts.
import pandas as pd
from lightgbm import LGBMRegressor
import gc
from numerapi import NumerAPI
from pathlib import Path
from utils import (
save_model,
load_model,
neutralize,
get_biggest_change_features,
get_time_series_cross_val_splits,
validation_metrics,
load_model_config,
save_model_config,
save_prediction,
TARGET_COL,
)
# Column names used throughout the script.
EXAMPLE_PREDS_COL = "example_preds"
ERA_COL = "era"

# LightGBM hyper-parameters shared by every model trained in this script.
# Ideal params would be more like 20000, 0.001, 6, 2**6, 0.1, but this is slow enough as it is
model_params = dict(
    n_estimators=2000,
    learning_rate=0.01,
    max_depth=5,
    num_leaves=2 ** 5,
    colsample_bytree=0.1,
)

# Row downsampling factors used to speed up cross validation and the full train:
#   1  -> no downsampling
#   10 -> use every 10th row
downsample_cross_val = 20
downsample_full_train = 2

# model_selection_loop=True: get OOS performance for training_data and use
# that to select the best model.
# model_selection_loop=False: just predict on tournament data using the
# existing models and model config.
model_selection_loop = True
model_config_name = "advanced_example_model"
# Numerai API client, constructed without explicit credentials.
napi = NumerAPI()
# The round number is used at the end of the script to name the saved prediction files.
current_round = napi.get_current_round()

# parents=False: only ./v4 itself is created; exist_ok=True makes reruns safe.
Path("./v4").mkdir(parents=False, exist_ok=True)

# Download every dataset file that the script reads later on. Previously only
# train.parquet and features.json were fetched, so the reads of the validation,
# live and example-prediction files failed when ./v4 was freshly created.
# NOTE(review): confirm that download_dataset refreshes an existing
# v4/live.parquet left over from an earlier round.
for dataset_name in (
    "v4/train.parquet",
    "v4/features.json",
    "v4/validation.parquet",
    "v4/live.parquet",
    "v4/validation_example_preds.parquet",
    "v4/live_example_preds.parquet",
):
    napi.download_dataset(dataset_name)
# Either run the full model selection (cross validation, model choice, full
# train, config save) or load the config written by a previous run.
if model_selection_loop:
    # Announce the loop only when it actually runs; this message used to be
    # printed even when model_selection_loop was False.
    print("Entering model selection loop. This may take awhile.")
    model_config = {}
    print('reading training_data')
    training_data = pd.read_parquet('v4/train.parquet')

    # keep track of some prediction columns
    ensemble_cols = set()
    pred_cols = set()

    # every "target_*" column in the training data (not used below; the
    # targets are hand-picked on the next statement)
    possible_targets = [c for c in training_data.columns if c.startswith("target_")]
    # pick a handful of targets, one model is trained per target
    # this can be vastly improved
    targets = ["target", "target_nomi_v4_60", "target_jerome_v4_20"]

    # all the possible features to train on
    feature_cols = [c for c in training_data if c.startswith("feature_")]

    # ---- do cross val to get out of sample training preds ----
    cv = 3
    train_test_zip = get_time_series_cross_val_splits(training_data, cv=cv, embargo=12)
    # get out of sample training preds via embargoed time series cross validation
    # optionally downsample training data to speed up this section.
    print("entering time series cross validation loop")
    for split, train_test_split in enumerate(train_test_zip):
        gc.collect()
        print(f"doing split {split+1} out of {cv}")
        train_split, test_split = train_test_split
        # boolean row masks selecting the eras of each side of the split
        train_split_index = training_data[ERA_COL].isin(train_split)
        test_split_index = training_data[ERA_COL].isin(test_split)
        # index labels of every downsample_cross_val-th training row
        downsampled_train_split_index = train_split_index[train_split_index].index[::downsample_cross_val]

        # getting the per era correlation of each feature vs the primary target across the training split
        print("getting feature correlations over time and identifying riskiest features")
        all_feature_corrs_split = training_data.loc[downsampled_train_split_index, :].groupby(ERA_COL).apply(
            lambda d: d[feature_cols].corrwith(d[TARGET_COL]))
        # find the riskiest features by comparing their correlation vs the target in half 1 and half 2 of training data
        # there are probably more clever ways to do this
        riskiest_features_split = get_biggest_change_features(all_feature_corrs_split, 50)

        print(f"entering model training loop for split {split+1}")
        for target in targets:
            model_name = f"model_{target}"
            print(f"model: {model_name}")

            # train a model on the training split (and save it for future use)
            split_model_name = f"model_{target}_split{split+1}cv{cv}downsample{downsample_cross_val}"
            # presumably load_model returns a falsy value when nothing is saved
            # under this name - verify against utils.load_model
            split_model = load_model(split_model_name)
            if not split_model:
                print(f"training model: {model_name}")
                split_model = LGBMRegressor(**model_params)
                # y is passed as a Series (not a one-column DataFrame) so this
                # call matches the full-train fit further down.
                split_model.fit(training_data.loc[downsampled_train_split_index, feature_cols],
                                training_data.loc[downsampled_train_split_index, target])
                save_model(split_model, split_model_name)

            # now we can predict on the test part of the split
            model_expected_features = split_model.booster_.feature_name()
            if set(model_expected_features) != set(feature_cols):
                print(f"New features are available! Might want to retrain model {split_model_name}.")
            print(f"predicting {model_name}")
            training_data.loc[test_split_index, f"preds_{model_name}"] = \
                split_model.predict(training_data.loc[test_split_index, model_expected_features])

            # do neutralization
            print("doing neutralization to riskiest features")
            training_data.loc[test_split_index, f"preds_{model_name}_neutral_riskiest_50"] = neutralize(
                df=training_data.loc[test_split_index, :],
                columns=[f"preds_{model_name}"],
                neutralizers=riskiest_features_split,
                proportion=1.0,
                normalize=True,
                era_col=ERA_COL)[f"preds_{model_name}"]

            # remember that we made all of these different pred columns
            pred_cols.add(f"preds_{model_name}")
            pred_cols.add(f"preds_{model_name}_neutral_riskiest_50")

    print("creating ensembles")
    # ranking per era for all of our pred cols so we can combine safely on the same scales.
    # GroupBy.rank returns a frame on training_data's own index, so the
    # assignment aligns on every pandas version. The previous
    # groupby(...).apply(...) form gets the era prepended to its index on
    # pandas >= 2.0 and then no longer lines up with training_data.
    pred_col_list = list(pred_cols)
    training_data[pred_col_list] = training_data.groupby(ERA_COL)[pred_col_list].rank(pct=True)
    # do ensembles
    training_data["ensemble_neutral_riskiest_50"] = sum(
        [training_data[pred_col] for pred_col in pred_cols if pred_col.endswith("neutral_riskiest_50")]).rank(
        pct=True)
    training_data["ensemble_not_neutral"] = sum(
        [training_data[pred_col] for pred_col in pred_cols if "neutral" not in pred_col]).rank(pct=True)
    training_data["ensemble_all"] = sum([training_data[pred_col] for pred_col in pred_cols]).rank(pct=True)

    ensemble_cols.add("ensemble_neutral_riskiest_50")
    ensemble_cols.add("ensemble_not_neutral")
    ensemble_cols.add("ensemble_all")

    # ---- Now get some stats and pick our favorite model ----
    print("gathering validation metrics for out of sample training results")
    all_model_cols = list(pred_cols) + list(ensemble_cols)
    # use example_col preds_model_target as an estimates since no example preds provided for training
    # fast_mode=True so that we skip some of the stats that are slower to calculate
    training_stats = validation_metrics(training_data, all_model_cols, example_col="preds_model_target",
                                        fast_mode=True, target_col=TARGET_COL)
    print(training_stats[["mean", "sharpe"]].sort_values(by="sharpe", ascending=False).to_markdown())

    # pick the model that has the highest correlation sharpe
    best_pred_col = training_stats.sort_values(by="sharpe", ascending=False).head(1).index[0]
    print(f"selecting model {best_pred_col} as our highest sharpe model in validation")

    # ---- Now do a full train ----
    print("entering full training section")
    # getting the per era correlation of each feature vs the target across all of training data
    print("getting feature correlations with target and identifying riskiest features")
    all_feature_corrs = training_data.groupby(ERA_COL).apply(
        lambda d: d[feature_cols].corrwith(d[TARGET_COL]))
    # find the riskiest features by comparing their correlation vs the target in half 1 and half 2 of training data
    riskiest_features = get_biggest_change_features(all_feature_corrs, 50)

    for target in targets:
        gc.collect()
        model_name = f"model_{target}_downsample{downsample_full_train}"
        model = load_model(model_name)
        if not model:
            print(f"training {model_name}")
            model = LGBMRegressor(**model_params)
            # train on all of train (every downsample_full_train-th row);
            # prediction on val and tournament happens later in the script
            model.fit(training_data.iloc[::downsample_full_train].loc[:, feature_cols],
                      training_data.iloc[::downsample_full_train][target])
            save_model(model, model_name)
        gc.collect()

    # persist everything the prediction section needs, so a later run with
    # model_selection_loop=False can skip straight to predicting
    model_config["feature_cols"] = feature_cols
    model_config["targets"] = targets
    model_config["best_pred_col"] = best_pred_col
    model_config["riskiest_features"] = riskiest_features
    print(f"saving model config for {model_config_name}")
    save_model_config(model_config, model_config_name)
else:
    # load model config from previous model selection loop
    print(f"loading model config for {model_config_name}")
    model_config = load_model_config(model_config_name)
    feature_cols = model_config["feature_cols"]
    targets = model_config["targets"]
    best_pred_col = model_config["best_pred_col"]
    riskiest_features = model_config["riskiest_features"]
""" Things that we always do even if we've already trained """
gc.collect()

# Load the tournament (live) rows, the validation rows and both example-prediction files.
print("reading tournament_data")
live_data = pd.read_parquet('v4/live.parquet')
print("reading validation_data")
validation_data = pd.read_parquet('v4/validation.parquet')
print("reading example_predictions")
example_preds = pd.read_parquet('v4/live_example_preds.parquet')
print("reading example_validaton_predictions")
validation_example_preds = pd.read_parquet('v4/validation_example_preds.parquet')

# attach the example predictions to the validation rows (aligned on the index)
validation_data[EXAMPLE_PREDS_COL] = validation_example_preds["prediction"]

# look for missing feature values in the live data and patch them with 0.5
print("checking for nans in the tournament data")
cols_w_nan = live_data.loc[:, feature_cols].isna().sum()  # NaN count per feature column
if not cols_w_nan.sum():
    print("No nans in the features this week!")
else:
    total_rows = len(live_data)
    print(f"Number of nans per column this week: {cols_w_nan[cols_w_nan > 0]}")
    print(f"out of {total_rows} total rows")
    print("filling nans with 0.5")
    live_data.loc[:, feature_cols] = live_data.loc[:, feature_cols].fillna(0.5)
# Names of the prediction / ensemble columns created from here on.
pred_cols = set()
ensemble_cols = set()

# For every target: load its fully-trained model, predict on validation and
# live data, then neutralize those predictions to the riskiest features.
for target in targets:
    gc.collect()
    model_name = f"model_{target}_downsample{downsample_full_train}"
    print(f"loading {model_name}")
    model = load_model(model_name)
    if not model:
        raise ValueError(f"{model_name} is not trained yet!")

    # predict with the features the saved model was trained on, and warn when
    # the configured feature set differs from them
    model_expected_features = model.booster_.feature_name()
    if set(model_expected_features) != set(feature_cols):
        print(f"New features are available! Might want to retrain model {model_name}.")

    raw_col = f"preds_{model_name}"
    neutral_col = f"preds_{model_name}_neutral_riskiest_50"

    print(f"predicting tournament and validation for {model_name}")
    for frame in (validation_data, live_data):
        frame.loc[:, raw_col] = model.predict(frame.loc[:, model_expected_features])

    # do different neutralizations
    # neutralize our predictions to the riskiest features only
    print("neutralizing to riskiest_50 for validation and tournament")
    for frame in (validation_data, live_data):
        frame[neutral_col] = neutralize(
            df=frame,
            columns=[raw_col],
            neutralizers=riskiest_features,
            proportion=1.0,
            normalize=True,
            era_col=ERA_COL,
        )[raw_col]

    # remember both columns for the ranking / ensembling step
    pred_cols.update((raw_col, neutral_col))
# rank per era for each prediction column so that we can combine safely.
# GroupBy.rank returns a frame on the original index, so the assignment aligns
# on every pandas version. The previous groupby(...).apply(...) form gets the
# era prepended to its index on pandas >= 2.0 and then no longer lines up with
# the frame it is assigned back to.
pred_col_list = list(pred_cols)
for frame in (validation_data, live_data):
    frame[pred_col_list] = frame.groupby(ERA_COL)[pred_col_list].rank(pct=True)

# make ensembles for val and tournament: sum the (already per-era ranked)
# member columns, then rank the sum over the whole frame to get back to 0-1
print('creating ensembles for tournament and validation')
neutral_members = [pred_col for pred_col in pred_cols if pred_col.endswith("neutral_riskiest_50")]
raw_members = [pred_col for pred_col in pred_cols if "neutral" not in pred_col]
all_members = list(pred_cols)
for frame in (validation_data, live_data):
    frame["ensemble_neutral_riskiest_50"] = sum(
        [frame[pred_col] for pred_col in neutral_members]).rank(pct=True)
    frame["ensemble_not_neutral"] = sum(
        [frame[pred_col] for pred_col in raw_members]).rank(pct=True)
    frame["ensemble_all"] = sum(
        [frame[pred_col] for pred_col in all_members]).rank(pct=True)

# remember the ensemble columns so they are included in the final validation stats
ensemble_cols.add("ensemble_neutral_riskiest_50")
ensemble_cols.add("ensemble_not_neutral")
ensemble_cols.add("ensemble_all")
gc.collect()

# Score every prediction and ensemble column on the validation data.
print("getting final validation stats")
final_model_cols = [*pred_cols, *ensemble_cols]
validation_stats = validation_metrics(
    validation_data,
    final_model_cols,
    example_col=EXAMPLE_PREDS_COL,
    fast_mode=False,  # compute the full set of stats for the final report
    target_col=TARGET_COL,
)
print(validation_stats.to_markdown())

# The chosen model's column becomes "prediction", ranked from 0 to 1 to meet
# diagnostic/submission file requirements.
outputs = (
    (validation_data, f"validation_predictions_{current_round}"),
    (live_data, f"live_data_{current_round}"),
)
for frame, _ in outputs:
    frame["prediction"] = frame[best_pred_col].rank(pct=True)
for frame, prediction_name in outputs:
    save_prediction(frame["prediction"], prediction_name)