-
Notifications
You must be signed in to change notification settings - Fork 0
/
create_model_AutoML.py
82 lines (72 loc) · 2.99 KB
/
create_model_AutoML.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
import datetime
import time
import pandas as pd
import data_parser
from h2o.automl import H2OAutoML
import h2o.estimators
# Train an H2O AutoML model once per seed on the parsed data set, score each
# leader on a held-out split, optionally save it, and print summary metrics.

# Load the pickled data and apply the parser's column selection, NA handling
# and column renaming.
DataParser = data_parser.DataParser()
df = DataParser.load_complete_data_from_pickle()
df = df[DataParser.selected_cols_v2]
df = DataParser.preprocess_dropna(df)
df = DataParser.rename_columns_df(df)

opt_save = True  # set to False to skip writing model files to disk
seeds = [42]
r2s = []
start = time.time()

# Start (or connect to) the H2O cluster once. Calling h2o.init() again for
# every seed is unnecessary, so it is no longer repeated inside the loop.
h2o.init(nthreads=-1, min_mem_size="8g")

# Upload the pandas frame to the cluster with the parser's column types.
h2o_data = h2o.H2OFrame(
    df,
    destination_frame="CatNum",
    column_types=DataParser.col_dtypes_renamed,
)

# AutoML options
max_models = 10

# Predictors are every column except the target; this does not depend on the
# seed, so it is computed once up front.
y = DataParser.target
X = h2o_data.columns
X.remove(y)

# One dict of test metrics per seed; turned into a DataFrame after the loop.
rows = []

# NOTE(review): the loop is taken to cover one full train/score/save cycle per
# seed, with the summary printed once afterwards. With a single seed any other
# loop extent behaves the same -- confirm before adding more seeds.
for seed in seeds:
    # Split the dataset into a train and a test set.
    train, test = h2o_data.split_frame(ratios=DataParser.ratios, seed=seed)
    train.frame_id = "Train"
    test.frame_id = "Test"

    aml = H2OAutoML(
        max_models=max_models,
        seed=seed,
        stopping_metric='AUTO',
        nfolds=5,
        keep_cross_validation_predictions=True,
    )
    print("Training")
    # NOTE(review): the pauses around training are kept from the original;
    # their purpose is not visible here (presumably console output) -- confirm.
    time.sleep(2)
    aml.train(x=X, y=y, training_frame=train)
    time.sleep(5)
    best_model = aml.get_best_model(criterion="deviance")

    # Score the held-out frame once and read every metric from that result.
    test_perf = best_model.model_performance(test_data=test)
    r2 = test_perf['r2']
    mae = test_perf['mae']
    mrd = test_perf['mean_residual_deviance']

    data = {'seed': seed, 'r2': r2, 'mae': mae, 'mrd': mrd}
    rows.append(data)

    # The metrics are formatted to four decimals for the file name.
    now = datetime.datetime.now().strftime("%y%m%d%H%M")
    r2, mae, mrd = f"{r2:.04f}", f"{mae:.04f}", f"{mrd:.04f}"
    if opt_save:
        h2o.save_model(
            best_model,
            path="temp/AutoML_model",
            filename=f"AutoML_{now}_{seed}_{r2}_{mae}_{mrd}",
            force=True,
        )
    # r2s deliberately holds the 4-decimal rounded value, not the raw metric.
    r2s.append(float(r2))
    time.sleep(2)

# Build the per-seed results table in one go.
df_data = pd.DataFrame(rows)
print(df_data)

# Summary over all seeds. The train/test gap compares the last trained model's
# own r2() with the mean test R^2.
# NOTE(review): with more than one seed this mixes one model with an average
# over all of them -- confirm that is intended.
mean_r2 = df_data['r2'].mean()
mean_mae = df_data['mae'].mean()
mean_mrd = df_data['mrd'].mean()
diff = f"{best_model.r2() - mean_r2:.04f}"
r2, mae, mrd = f"{mean_r2:.04f}", f"{mean_mae:.04f}", f"{mean_mrd:.04f}"
print("Mean residual deviance: ", mrd)
print("Mean absolut error: ", mae)
print("Pearson Coefficient R^2: ", r2)
print("Difference of r^2 between test and train: ", diff)

# Save the last model again, this time labelled with the mean metrics.
now = datetime.datetime.now().strftime("%y%m%d%H%M")
if opt_save:
    h2o.save_model(
        best_model,
        path="temp/AutoML_model",
        filename=f"AutoML_{now}_{seed}_{r2}_{mae}_{mrd}",
        force=True,
    )
print("Elapsed {:.04f} minutes".format((time.time() - start) / 60))

# base_models is specific to stacked ensembles; guard the access so a
# different kind of leader does not crash the script at the very end.
base_models = getattr(best_model, "base_models", None)
if base_models is not None:
    print(base_models)
else:
    print("Best model exposes no base_models (not a stacked ensemble)")
print("Elapsed {:.04f} minutes".format((time.time() - start) / 60))

df_r2 = pd.DataFrame(r2s)
print('Mean all r2s', df_r2.mean())
print(df_r2)