Added logging with mlflow
TopCoder2K committed Nov 30, 2023
1 parent d561ed4 commit a5c7bcf
Showing 11 changed files with 736 additions and 4 deletions.
1 change: 1 addition & 0 deletions .gitignore
@@ -161,3 +161,4 @@ cython_debug/
checkpoints
predictions
catboost_info
mlruns
3 changes: 3 additions & 0 deletions README.md
@@ -79,6 +79,9 @@ The available models are `rf` (Random Forest from the `scikit-learn` library) an
poetry run python3 commands.py train --config_name cb_config
```

_N.B. Do not forget to set `logging.mlflow.tracking_uri` before launching the training.
By default, the logs are saved in the `mlruns` directory._
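
_For local runs, a tracking server matching the default config can be started, for
example, with `mlflow server --host 127.0.0.1 --port 5000`._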

### Evaluation

If you want to infer a previously trained model, make sure you've placed the checkpoint in
9 changes: 8 additions & 1 deletion configs/cb_config.yaml
@@ -9,10 +9,17 @@ model:
learning_rate: 0.3
l2_leaf_reg: 3
loss_function: RMSE
metric_period: 100
metric_period: 10
logging_level: Verbose

training:
checkpoint_name: cb_model.p

inference:
checkpoint_name: cb_model.p

logging:
commit_id: None # Adding new fields from a script is prohibited by default
mlflow:
exp_name: MLOps hw2
tracking_uri: http://127.0.0.1:5000
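
The `commit_id: None` placeholder is declared here because Hydra composes its configs in struct mode, where assigning a key that is not already present in the YAML raises an error. A minimal sketch of that behaviour (the config contents below are illustrative, not the full file):

```python
from omegaconf import OmegaConf

# Illustrative stand-in for the `logging` section of the composed config
cfg = OmegaConf.create({"logging": {"commit_id": None}})
OmegaConf.set_struct(cfg, True)  # Hydra-composed configs behave like this by default

cfg.logging.commit_id = "abc123"  # OK: the field is pre-declared in the YAML
# cfg.logging.extra_field = 1     # would raise an error: unknown key in struct mode
```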
10 changes: 9 additions & 1 deletion configs/rf_config.yaml
@@ -1,11 +1,19 @@
model:
name: rf
hyperparams:
n_estimators: 500
n_estimators: 100
random_state: 0
verbose: 0
n_jobs: -1

training:
checkpoint_name: rf_model.p

inference:
checkpoint_name: rf_model.p

logging:
commit_id: None # Adding new fields from a script is prohibited by default
mlflow:
exp_name: MLOps hw2
tracking_uri: http://127.0.0.1:5000
4 changes: 4 additions & 0 deletions mlopscourse/models/base.py
@@ -35,3 +35,7 @@ def __call__(self, X_sample: pd.DataFrame) -> pd.Series:
def save_checkpoint(self, path: str) -> None:
with open(path + self.cfg.training.checkpoint_name, "wb") as f:
pickle.dump(self, f)

@abstractmethod
def log_fis_and_metrics(self, exp_id: str) -> None:
raise NotImplementedError()
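
The two subclasses below override this hook with model-specific extra arguments (the column names for CatBoost, the full training data for the random forest), so the abstract signature is only a rough contract. A possible uniform declaration, shown here only as a sketch and not part of this commit, would accept variadic arguments:

```python
from abc import ABC, abstractmethod
from typing import Any


class BaseModel(ABC):  # simplified: the real class has more members
    @abstractmethod
    def log_fis_and_metrics(self, exp_id: str, *args: Any, **kwargs: Any) -> None:
        """Log feature importances and training metrics of the model to MLflow."""
        raise NotImplementedError()
```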
24 changes: 24 additions & 0 deletions mlopscourse/models/catboost.py
@@ -1,5 +1,6 @@
from typing import List, Optional

import mlflow
import numpy as np
import pandas as pd
from catboost import CatBoostRegressor, Pool
@@ -68,3 +69,26 @@ def __call__(self, X_sample: pd.DataFrame) -> np.ndarray:
feature_names=list(X_sample.columns),
)
return self.model.predict(sample_data)

def log_fis_and_metrics(self, exp_id: str, col_names: List[str]) -> None:
with mlflow.start_run(
experiment_id=exp_id, run_name=f"training-{self.cfg.model.name}"
):
# Log the model's hyperparameters and the code version
mlflow.log_params(self.cfg.model.hyperparams)
mlflow.log_param("commit_id", self.cfg.logging.commit_id)
# Log feature importances
mlflow.log_metrics(
{
f"fi_of_{col_name}": self.model.feature_importances_[i]
for i, col_name in enumerate(col_names)
}
)
r2_scores = self.model.evals_result_["learn"]["R2"]
rmse_scores = self.model.evals_result_["learn"]["RMSE"]
assert len(r2_scores) == len(rmse_scores), "Something wrong with metrics!"
for i in range(len(r2_scores)):
mlflow.log_metrics(
{"R2_metric": r2_scores[i], "RMSE_loss": rmse_scores[i]},
step=i * self.cfg.model.hyperparams.metric_period,
)
37 changes: 37 additions & 0 deletions mlopscourse/models/random_forest.py
@@ -1,12 +1,15 @@
from typing import List, Optional

import mlflow
import numpy as np
import pandas as pd
from omegaconf import DictConfig
from sklearn.compose import ColumnTransformer
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import OrdinalEncoder
from tqdm import tqdm

from .base import BaseModel

@@ -52,3 +55,37 @@ def eval(self, X_test: pd.DataFrame, y_test: pd.Series) -> pd.Series:

def __call__(self, X_sample: pd.DataFrame) -> np.ndarray:
return self.model.predict(X_sample)

def log_fis_and_metrics(
self, exp_id: str, X_train: pd.DataFrame, y_train: pd.Series
) -> None:
with mlflow.start_run(
experiment_id=exp_id, run_name=f"training-{self.cfg.model.name}"
):
# Log the model's hyperparameters and the code version
mlflow.log_params(self.cfg.model.hyperparams)
mlflow.log_param("commit_id", self.cfg.logging.commit_id)
# Log feature importances
mlflow.log_metrics(
{
f"fi_of_{col_name}": self.model.named_steps[
"randomforestregressor"
].feature_importances_[i]
for i, col_name in enumerate(X_train.columns)
}
)
# Log R2 and RMSE metrics
for i in tqdm(range(0, self.cfg.model.hyperparams.n_estimators)):
model_i = make_pipeline(
self.preprocessor, RandomForestRegressor(**self.cfg.model.hyperparams)
)
model_i.named_steps["randomforestregressor"].n_estimators = i + 1
model_i.fit(X_train, y_train)
y_pred = model_i.predict(X_train)
mlflow.log_metrics(
{
"R2_metric": r2_score(y_train, y_pred),
"RMSE_metric": mean_squared_error(y_train, y_pred, squared=False),
},
step=i,
)
17 changes: 16 additions & 1 deletion mlopscourse/train.py
@@ -1,11 +1,13 @@
import os

import fire
import mlflow
from hydra import compose
from omegaconf import DictConfig, OmegaConf

from .data.prepare_dataset import load_dataset
from .models.models_zoo import prepare_model
from .utils import get_git_revision_hash


class Trainer:
@@ -23,6 +25,7 @@ def __init__(self, config_name: str, **kwargs: dict) -> None:
self.cfg: DictConfig = compose(
config_name=config_name, overrides=[f"{k}={v}" for k, v in kwargs.items()]
)
self.cfg.logging.commit_id = get_git_revision_hash()
print(OmegaConf.to_yaml(self.cfg))

def train(self) -> None:
@@ -40,7 +43,19 @@ def train(self) -> None:

os.makedirs("checkpoints", exist_ok=True)
model.save_checkpoint("checkpoints/")
print("The training was finished successfully!")
print("The training was finished successfully!\nCollecting logs...")

# Since there is no easy way to log metrics as functions of time during
# the training, they should be collected after it.
mlflow_cfg = self.cfg.logging.mlflow
mlflow.set_tracking_uri(mlflow_cfg.tracking_uri)
exp_id = mlflow.set_experiment(mlflow_cfg.exp_name).experiment_id
# Unfortunately, logging is model dependent, at least because
# RandomForestRegressor doesn't provide the target metric progress.
if self.cfg.model.name == "cb":
model.log_fis_and_metrics(exp_id, X_train.columns)
else:
model.log_fis_and_metrics(exp_id, X_train, y_train)


if __name__ == "__main__":
6 changes: 6 additions & 0 deletions mlopscourse/utils.py
@@ -0,0 +1,6 @@
import subprocess


# Credits to https://stackoverflow.com/a/21901260/12187881
def get_git_revision_hash() -> str:
return subprocess.check_output(["git", "rev-parse", "HEAD"]).decode("ascii").strip()