Added logging with mlflow
TopCoder2K committed Nov 30, 2023
1 parent d561ed4 commit a5c7bcf
Showing 11 changed files with 736 additions and 4 deletions.
1 change: 1 addition & 0 deletions .gitignore
@@ -161,3 +161,4 @@ cython_debug/
checkpoints
predictions
catboost_info
mlruns
3 changes: 3 additions & 0 deletions README.md
@@ -79,6 +79,9 @@ The available models are `rf` (Random Forest from the `scikit-learn` library) an
poetry run python3 commands.py train --config_name cb_config
```

_N.B. Do not forget to set `logging.mlflow.tracking_uri` before launching the training.
By default, the logs are saved in the `mlruns` directory._
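
_For local runs, a tracking server matching the default config can be started, for
example, with `mlflow server --host 127.0.0.1 --port 5000`._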

### Evaluation

If you want to infer a previously trained model, make sure you've placed the checkpoint in
9 changes: 8 additions & 1 deletion configs/cb_config.yaml
@@ -9,10 +9,17 @@ model:
learning_rate: 0.3
l2_leaf_reg: 3
loss_function: RMSE
metric_period: 100
metric_period: 10
logging_level: Verbose

training:
checkpoint_name: cb_model.p

inference:
checkpoint_name: cb_model.p

logging:
commit_id: None # Adding new fields from a script is prohibited by default
mlflow:
exp_name: MLOps hw2
tracking_uri: http://127.0.0.1:5000
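
The `commit_id: None` placeholder is declared here because Hydra composes its configs in struct mode, where assigning a key that is not already present in the YAML raises an error. A minimal sketch of that behaviour (the config contents below are illustrative, not the full file):

```python
from omegaconf import OmegaConf

# Illustrative stand-in for the `logging` section of the composed config
cfg = OmegaConf.create({"logging": {"commit_id": None}})
OmegaConf.set_struct(cfg, True)  # Hydra-composed configs behave like this by default

cfg.logging.commit_id = "abc123"  # OK: the field is pre-declared in the YAML
# cfg.logging.extra_field = 1     # would raise an error: unknown key in struct mode
```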
10 changes: 9 additions & 1 deletion configs/rf_config.yaml
@@ -1,11 +1,19 @@
model:
name: rf
hyperparams:
n_estimators: 500
n_estimators: 100
random_state: 0
verbose: 0
n_jobs: -1

training:
checkpoint_name: rf_model.p

inference:
checkpoint_name: rf_model.p

logging:
commit_id: None # Adding new fields from a script is prohibited by default
mlflow:
exp_name: MLOps hw2
tracking_uri: http://127.0.0.1:5000
4 changes: 4 additions & 0 deletions mlopscourse/models/base.py
@@ -35,3 +35,7 @@ def __call__(self, X_sample: pd.DataFrame) -> pd.Series:
def save_checkpoint(self, path: str) -> None:
with open(path + self.cfg.training.checkpoint_name, "wb") as f:
pickle.dump(self, f)

@abstractmethod
def log_fis_and_metrics(self, exp_id: str) -> None:
raise NotImplementedError()
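
The two subclasses below override this hook with model-specific extra arguments (the column names for CatBoost, the full training data for the random forest), so the abstract signature is only a rough contract. A possible uniform declaration, shown here only as a sketch and not part of this commit, would accept variadic arguments:

```python
from abc import ABC, abstractmethod
from typing import Any


class BaseModel(ABC):  # simplified: the real class has more members
    @abstractmethod
    def log_fis_and_metrics(self, exp_id: str, *args: Any, **kwargs: Any) -> None:
        """Log feature importances and training metrics of the model to MLflow."""
        raise NotImplementedError()
```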
24 changes: 24 additions & 0 deletions mlopscourse/models/catboost.py
@@ -1,5 +1,6 @@
from typing import List, Optional

import mlflow
import numpy as np
import pandas as pd
from catboost import CatBoostRegressor, Pool
@@ -68,3 +69,26 @@ def __call__(self, X_sample: pd.DataFrame) -> np.ndarray:
feature_names=list(X_sample.columns),
)
return self.model.predict(sample_data)

def log_fis_and_metrics(self, exp_id: str, col_names: List[str]) -> None:
with mlflow.start_run(
experiment_id=exp_id, run_name=f"training-{self.cfg.model.name}"
):
# Log the model's hyperparameters and the code version
mlflow.log_params(self.cfg.model.hyperparams)
mlflow.log_param("commit_id", self.cfg.logging.commit_id)
# Log feature importances
mlflow.log_metrics(
{
f"fi_of_{col_name}": self.model.feature_importances_[i]
for i, col_name in enumerate(col_names)
}
)
r2_scores = self.model.evals_result_["learn"]["R2"]
rmse_scores = self.model.evals_result_["learn"]["RMSE"]
assert len(r2_scores) == len(rmse_scores), "Something wrong with metrics!"
for i in range(len(r2_scores)):
mlflow.log_metrics(
{"R2_metric": r2_scores[i], "RMSE_loss": rmse_scores[i]},
step=i * self.cfg.model.hyperparams.metric_period,
)
37 changes: 37 additions & 0 deletions mlopscourse/models/random_forest.py
@@ -1,12 +1,15 @@
from typing import List, Optional

import mlflow
import numpy as np
import pandas as pd
from omegaconf import DictConfig
from sklearn.compose import ColumnTransformer
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import OrdinalEncoder
from tqdm import tqdm

from .base import BaseModel

@@ -52,3 +55,37 @@ def eval(self, X_test: pd.DataFrame, y_test: pd.Series) -> pd.Series:

def __call__(self, X_sample: pd.DataFrame) -> np.ndarray:
return self.model.predict(X_sample)

def log_fis_and_metrics(
self, exp_id: str, X_train: pd.DataFrame, y_train: pd.Series
) -> None:
with mlflow.start_run(
experiment_id=exp_id, run_name=f"training-{self.cfg.model.name}"
):
# Log the model's hyperparameters and the code version
mlflow.log_params(self.cfg.model.hyperparams)
mlflow.log_param("commit_id", self.cfg.logging.commit_id)
# Log feature importances
mlflow.log_metrics(
{
f"fi_of_{col_name}": self.model.named_steps[
"randomforestregressor"
].feature_importances_[i]
for i, col_name in enumerate(X_train.columns)
}
)
# Log R2 and RMSE metrics
for i in tqdm(range(0, self.cfg.model.hyperparams.n_estimators)):
model_i = make_pipeline(
self.preprocessor, RandomForestRegressor(**self.cfg.model.hyperparams)
)
model_i.named_steps["randomforestregressor"].n_estimators = i + 1
model_i.fit(X_train, y_train)
y_pred = model_i.predict(X_train)
mlflow.log_metrics(
{
"R2_metric": r2_score(y_train, y_pred),
"RMSE_metric": mean_squared_error(y_train, y_pred, squared=False),
},
step=i,
)
17 changes: 16 additions & 1 deletion mlopscourse/train.py
@@ -1,11 +1,13 @@
import os

import fire
import mlflow
from hydra import compose
from omegaconf import DictConfig, OmegaConf

from .data.prepare_dataset import load_dataset
from .models.models_zoo import prepare_model
from .utils import get_git_revision_hash


class Trainer:
@@ -23,6 +25,7 @@ def __init__(self, config_name: str, **kwargs: dict) -> None:
self.cfg: DictConfig = compose(
config_name=config_name, overrides=[f"{k}={v}" for k, v in kwargs.items()]
)
self.cfg.logging.commit_id = get_git_revision_hash()
print(OmegaConf.to_yaml(self.cfg))

def train(self) -> None:
@@ -40,7 +43,19 @@ def train(self) -> None:

os.makedirs("checkpoints", exist_ok=True)
model.save_checkpoint("checkpoints/")
print("The training was finished successfully!")
print("The training was finished successfully!\nCollecting logs...")

# Since there is no easy way to log metrics as functions of time during
# the training, they should be collected after it.
mlflow_cfg = self.cfg.logging.mlflow
mlflow.set_tracking_uri(mlflow_cfg.tracking_uri)
exp_id = mlflow.set_experiment(mlflow_cfg.exp_name).experiment_id
# Unfortunately, logging is model dependent, at least because
# RandomForestRegressor doesn't provide the target metric progress.
if self.cfg.model.name == "cb":
model.log_fis_and_metrics(exp_id, X_train.columns)
else:
model.log_fis_and_metrics(exp_id, X_train, y_train)


if __name__ == "__main__":
6 changes: 6 additions & 0 deletions mlopscourse/utils.py
@@ -0,0 +1,6 @@
import subprocess


# Credits to https://stackoverflow.com/a/21901260/12187881
def get_git_revision_hash() -> str:
return subprocess.check_output(["git", "rev-parse", "HEAD"]).decode("ascii").strip()