-
Notifications
You must be signed in to change notification settings - Fork 8
/
sklearn_job.py
40 lines (36 loc) · 1.47 KB
/
sklearn_job.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
from dagster import job
from ops.sklearn_ops import (
fetch_titanic_dataset,
feature_selection,
split_into_train_test,
get_features_columns,
get_target_column,
encode_features,
logregression,
fit_model,
predict,
get_accuracy_score,
)
# NOTE: When the same op is invoked multiple times, the names of every
# invocation after the first are suffixed ("_2", "_3", etc.).
# For example, the 2nd invocation of get_features_columns would be shown in
# Dagit as "get_features_columns_2", and the 2nd invocation of
# get_target_column would be shown in Dagit as "get_target_column_2".
# With .alias(), we can give each invocation a more descriptive name instead
# of relying on the "_2" suffix naming convention.
# Documentation:
# https://docs.dagster.io/concepts/solids-pipelines/pipelines#aliases-and-tags
@job
def sklearn_job():
    # Composes the imported ops into one job graph. The values handed from
    # one op call to the next define the dependencies between the ops.
    dataset = fetch_titanic_dataset()
    selected = feature_selection(dataset)
    train_df, test_df = split_into_train_test(selected)

    # get_features_columns and get_target_column are each used twice (once
    # for the train split, once for the test split), so every use gets an
    # explicit alias.
    train_features_op = get_features_columns.alias("get_features_from_train")
    test_features_op = get_features_columns.alias("get_features_from_test")
    train_target_op = get_target_column.alias("get_target_from_train")
    test_target_op = get_target_column.alias("get_target_from_test")

    train_features = train_features_op(train_df)
    test_features = test_features_op(test_df)
    train_target = train_target_op(train_df)
    test_target = test_target_op(test_df)

    # fit_model receives the train features/target plus the outputs of
    # encode_features and logregression, in that positional order.
    fitted = fit_model(
        train_features,
        train_target,
        encode_features(),
        logregression(),
    )

    # The fitted model's predictions on the test features are compared with
    # the test target by get_accuracy_score.
    get_accuracy_score(test_target, predict(test_features, fitted))