From d92cbc77ee0f30d9a26d4ac0d7f2b632c63c3a67 Mon Sep 17 00:00:00 2001
From: Svyatoslav <pchelintsev.sv@phystech.edu>
Date: Tue, 28 Nov 2023 12:21:53 +0300
Subject: [PATCH] Hydra was added. Dataclasses weren't added because they seem
 to make things messier

---
 README.md                           | 20 ++++++++----
 commands.py                         | 48 +++++++++++++++++++++--------
 mlopscourse/configs/cb_config.yaml  | 18 +++++++++++
 mlopscourse/configs/rf_config.yaml  | 11 +++++++
 mlopscourse/infer.py                | 27 ++++++++--------
 mlopscourse/models/base.py          | 14 +++++++--
 mlopscourse/models/catboost.py      | 28 +++++------------
 mlopscourse/models/models_zoo.py    | 14 +++++----
 mlopscourse/models/random_forest.py | 15 +++++----
 mlopscourse/train.py                | 20 +++++++-----
 poetry.lock                         |  2 +-
 pyproject.toml                      |  1 +
 12 files changed, 141 insertions(+), 77 deletions(-)
 create mode 100644 mlopscourse/configs/cb_config.yaml
 create mode 100644 mlopscourse/configs/rf_config.yaml

diff --git a/README.md b/README.md
index 8a6fded..20fa00b 100644
--- a/README.md
+++ b/README.md
@@ -1,5 +1,8 @@
 # MLOpsCourse
 
+This repository is dedicated to easy-to-understand practice of standard MLOps
+techniques.
+
 ## Task Description
 
 The "Bike Rentals" dataset is used for scripts in this repository. This dataset contains
@@ -40,7 +43,7 @@ To setup only the necessary dependencies, run the following:
 poetry install --without dev
 ```
 
-If you want to use `pre-commit`, install all the dependencies:
+If you want to use `pre-commit` and `dvc`, install all the dependencies:
 
 ```
 poetry install
@@ -61,20 +64,25 @@ The command should download two .csv files from my GDrive and place them inside
 
 ### Training
 
-If you want to train the chosen model and save it afterwards, run:
+If you want to train the chosen model and save it afterwards, place its configuration file
+in the `mlopscourse/configs` directory and run:
 
 ```
-poetry run python3 commands.py train --model_type [chosen_model]
+poetry run python3 commands.py train --config_name [config_name_without_extension]
 ```
 
 The available models are `rf` (Random Forest from the `scikit-learn` library) and `cb`
-(Yandex's CatBoost).
+(Yandex's CatBoost), so an example with CatBoost would be the following:
+
+```
+poetry run python3 commands.py train --config_name cb_config
+```
 
 ### Evaluation
 
 If you want to infer a previously trained model, make sure you've placed the checkpoint in
-`checkpoints/` and then run
+`checkpoints/` and the configuration file in `mlopscourse/configs`, then run:
 
 ```
-poetry run python3 commands.py infer --model_type [chosen_model] --ckpt [checkpoint_filename_with_extension]
+poetry run python3 commands.py infer --config_name [config_name_without_extension]
 ```
diff --git a/commands.py b/commands.py
index 4fa6568..1734ff3 100644
--- a/commands.py
+++ b/commands.py
@@ -1,36 +1,58 @@
 import fire
+from hydra import initialize
 
 from mlopscourse.infer import Inferencer
 from mlopscourse.train import Trainer
 
 
-def train(model_type: str) -> None:
+def train(
+    config_name: str,
+    config_path: str = "mlopscourse/configs/",
+    hydra_version_base: str = "1.3",
+    **kwargs: dict,
+) -> None:
     """
     Trains the chosen model on the train split of the dataset and saves the checkpoint.
 
     Parameters
     ----------
-    model_type : str
-        The type of model for training. Should be "rf" for RandomForest and "cb"
-        for CatBoost.
+    config_name : str
+        The name of the configuration file to use for model, training and inference
+        hyperparameters.
+    config_path : str
+        The path to the configuration files.
+    hydra_version_base : str
+        The compatibility level of hydra to use.
+    **kwargs : dict, optional
+        Values of the configuration file to override.
     """
-    Trainer(model_type).train()
+    with initialize(config_path=config_path, version_base=hydra_version_base):
+        Trainer(config_name, **kwargs).train()
 
 
-def infer(model_type: str, ckpt: str) -> None:
+def infer(
+    config_name: str,
+    config_path: str = "mlopscourse/configs/",
+    hydra_version_base: str = "1.3",
+    **kwargs: dict,
+) -> None:
     """
     Runs the chosen model on the test set of the dataset and calculates the R^2 metric.
 
     Parameters
     ----------
-    model_type : str
-        The type of model that was used for training. Should be "rf" for RandomForest
-        and "cb" for CatBoost.
-    ckpt : str
-        The filename inside 'checkpoint/' to load the model from. Should also contain the
-        the filename extension.
+    config_name : str
+        The name of the configuration file to use for model, training and inference
+        hyperparameters.
+    config_path : str
+        The path to the configuration files.
+    hydra_version_base : str
+        The compatibility level of hydra to use.
+    **kwargs : dict, optional
+        Values of the configuration file to override.
     """
-    Inferencer(model_type, ckpt).infer()
+    with initialize(config_path=config_path, version_base=hydra_version_base):
+        Inferencer(config_name, **kwargs).infer()
 
 
 if __name__ == "__main__":
diff --git a/mlopscourse/configs/cb_config.yaml b/mlopscourse/configs/cb_config.yaml
new file mode 100644
index 0000000..9f37ea4
--- /dev/null
+++ b/mlopscourse/configs/cb_config.yaml
@@ -0,0 +1,18 @@
+model:
+  name: cb
+  hyperparams:
+    depth: 6
+    n_estimators: 1000
+    eval_metric: R2
+    task_type: CPU
+    random_seed: 0
+    learning_rate: 0.3
+    l2_leaf_reg: 3
+    loss_function: RMSE
+    metric_period: 100
+
+training:
+  checkpoint_name: cb_model.p
+
+inference:
+  checkpoint_name: cb_model.p
diff --git a/mlopscourse/configs/rf_config.yaml b/mlopscourse/configs/rf_config.yaml
new file mode 100644
index 0000000..82e4efd
--- /dev/null
+++ b/mlopscourse/configs/rf_config.yaml
@@ -0,0 +1,11 @@
+model:
+  name: rf
+  hyperparams:
+    n_estimators: 500
+    random_state: 0
+
+training:
+  checkpoint_name: rf_model.p
+
+inference:
+  checkpoint_name: rf_model.p
diff --git a/mlopscourse/infer.py b/mlopscourse/infer.py
index 48c31a1..fbc9961 100644
--- a/mlopscourse/infer.py
+++ b/mlopscourse/infer.py
@@ -2,6 +2,8 @@
 import pickle
 
 import fire
+from hydra import compose
+from omegaconf import DictConfig, OmegaConf
 
 from .data.prepare_dataset import load_dataset
 
@@ -12,17 +14,16 @@ class Inferencer:
 
     Attributes
     ----------
-    model_type : str
-        The type of model that was used for training. Should be "rf" for RandomForest
-        and "cb" for CatBoost.
-    ckpt : str
-        The filename inside 'checkpoint/' to load the model from. Should also contain the
-        the filename extension.
+    cfg : omegaconf.DictConfig
+        The configuration containing the model type and hyperparameters, training and
+        inference parameters.
     """
 
-    def __init__(self, model_type: str, ckpt: str) -> None:
-        self.model_type = model_type
-        self.ckpt = ckpt
+    def __init__(self, config_name: str, **kwargs: dict) -> None:
+        self.cfg: DictConfig = compose(
+            config_name=config_name, overrides=[f"{k}={v}" for k, v in kwargs.items()]
+        )
+        print(OmegaConf.to_yaml(self.cfg))
 
     def infer(self) -> None:
         (
@@ -32,14 +33,14 @@ def infer(self) -> None:
             _,
         ) = load_dataset(split="test")
 
-        with open(f"checkpoints/{self.ckpt}", "rb") as f:
+        with open(f"checkpoints/{self.cfg.inference.checkpoint_name}", "rb") as f:
             model = pickle.load(f)
-        print(f"Evaluating the {self.model_type} model...")
+        print(f"Evaluating the {self.cfg.model.name} model...")
         y_preds = model.eval(X_test, y_test)
 
         os.makedirs("predictions", exist_ok=True)
-        preds_name = self.ckpt.split(".")[0]
-        y_preds.to_csv(f"predictions/{preds_name}_preds.csv")
+        ckpt_name = self.cfg.inference.checkpoint_name.split(".")[0]
+        y_preds.to_csv(f"predictions/{ckpt_name}_preds.csv")
 
 
 if __name__ == "__main__":
diff --git a/mlopscourse/models/base.py b/mlopscourse/models/base.py
index 134780e..07b0f4a 100644
--- a/mlopscourse/models/base.py
+++ b/mlopscourse/models/base.py
@@ -1,15 +1,20 @@
+import pickle
+from abc import ABCMeta, abstractmethod
 from typing import Optional
 
 import pandas as pd
+from omegaconf import DictConfig
 
 
-class BaseModel:
+class BaseModel(metaclass=ABCMeta):
     """Represents an interface that any model used must implement."""
 
-    def __init__(self) -> None:
+    def __init__(self, cfg: DictConfig) -> None:
+        self.cfg = cfg
         self.preprocessor = None
         self.model = None
 
+    @abstractmethod
     def train(
         self,
         X_train: pd.DataFrame,
@@ -19,11 +24,14 @@ def train(
     ) -> None:
         raise NotImplementedError()
 
+    @abstractmethod
     def eval(self, X_test: pd.DataFrame, y_test: pd.Series) -> pd.Series:
         raise NotImplementedError()
 
+    @abstractmethod
     def __call__(self, X_sample: pd.DataFrame) -> pd.Series:
         raise NotImplementedError()
 
     def save_checkpoint(self, path: str) -> None:
-        raise NotImplementedError()
+        with open(path + self.cfg.training.checkpoint_name, "wb") as f:
+            pickle.dump(self, f)
diff --git a/mlopscourse/models/catboost.py b/mlopscourse/models/catboost.py
index c54047f..d8de95c 100644
--- a/mlopscourse/models/catboost.py
+++ b/mlopscourse/models/catboost.py
@@ -1,34 +1,26 @@
-import pickle
 from typing import List, Optional
 
 import numpy as np
 import pandas as pd
 from catboost import CatBoostRegressor, Pool
+from omegaconf import DictConfig
 from sklearn.metrics import r2_score
 
 from .base import BaseModel
 
 
 class CatboostModel(BaseModel):
-    """A basic Random Forest model from sklearn."""
+    """The Yandex's CatBoost."""
 
     def __init__(
-        self, numerical_features: List[str], categorical_features: List[str]
+        self,
+        cfg: DictConfig,
+        numerical_features: List[str],
+        categorical_features: List[str],
     ) -> None:
-        super().__init__()
+        super().__init__(cfg)
 
-        self.hyperparams = {
-            "learning_rate": 0.3,
-            "depth": 6,
-            "l2_leaf_reg": 3,
-            "loss_function": "RMSE",
-            "eval_metric": "R2",
-            "random_seed": 0,
-            "task_type": "CPU",
-            "n_estimators": 1000,
-            "metric_period": 100,
-        }
-        self.model = CatBoostRegressor(**self.hyperparams)
+        self.model = CatBoostRegressor(**cfg.model.hyperparams)
         self.numerical_features = numerical_features
         self.categorical_features = categorical_features
 
@@ -76,7 +68,3 @@ def __call__(self, X_sample: pd.DataFrame) -> np.ndarray:
             feature_names=list(X_sample.columns),
         )
         return self.model.predict(sample_data)
-
-    def save_checkpoint(self, path: str) -> None:
-        with open(path + "model_cb.p", "wb") as f:
-            pickle.dump(self, f)
diff --git a/mlopscourse/models/models_zoo.py b/mlopscourse/models/models_zoo.py
index 5eb5de4..c442f13 100644
--- a/mlopscourse/models/models_zoo.py
+++ b/mlopscourse/models/models_zoo.py
@@ -1,18 +1,20 @@
 from typing import List
 
+from omegaconf import DictConfig
+
 from .base import BaseModel
 from .catboost import CatboostModel
 from .random_forest import RandomForest
 
 
 def prepare_model(
-    model_type: str,
+    cfg: DictConfig,
     numerical_features: List[str],
     categorical_features: List[str],
 ) -> BaseModel:
-    if model_type == "rf":
-        return RandomForest(numerical_features, categorical_features)
-    elif model_type == "cb":
-        return CatboostModel(numerical_features, categorical_features)
+    if cfg.model.name == "rf":
+        return RandomForest(cfg, numerical_features, categorical_features)
+    elif cfg.model.name == "cb":
+        return CatboostModel(cfg, numerical_features, categorical_features)
     else:
-        raise AssertionError(f"Unknown model name: {model_type}")
+        raise AssertionError(f"Unknown model name: {cfg.model.name}")
diff --git a/mlopscourse/models/random_forest.py b/mlopscourse/models/random_forest.py
index cb533c9..97c16c0 100644
--- a/mlopscourse/models/random_forest.py
+++ b/mlopscourse/models/random_forest.py
@@ -1,8 +1,8 @@
-import pickle
 from typing import List, Optional
 
 import numpy as np
 import pandas as pd
+from omegaconf import DictConfig
 from sklearn.compose import ColumnTransformer
 from sklearn.ensemble import RandomForestRegressor
 from sklearn.pipeline import make_pipeline
@@ -15,9 +15,12 @@ class RandomForest(BaseModel):
     """A basic Random Forest model from sklearn."""
 
     def __init__(
-        self, numerical_features: List[str], categorical_features: List[str]
+        self,
+        cfg: DictConfig,
+        numerical_features: List[str],
+        categorical_features: List[str],
     ) -> None:
-        super().__init__()
+        super().__init__(cfg)
 
         self.preprocessor = ColumnTransformer(
             transformers=[
@@ -28,7 +31,7 @@ def __init__(
             verbose_feature_names_out=False,
         ).set_output(transform="pandas")
         self.model = make_pipeline(
-            self.preprocessor, RandomForestRegressor(random_state=0)
+            self.preprocessor, RandomForestRegressor(**cfg.model.hyperparams)
         )
 
     def train(
@@ -49,7 +52,3 @@ def eval(self, X_test: pd.DataFrame, y_test: pd.Series) -> pd.Series:
 
     def __call__(self, X_sample: pd.DataFrame) -> np.ndarray:
         return self.model.predict(X_sample)
-
-    def save_checkpoint(self, path: str) -> None:
-        with open(path + "model_rf.p", "wb") as f:
-            pickle.dump(self, f)
diff --git a/mlopscourse/train.py b/mlopscourse/train.py
index ec2ace8..b83d4b7 100644
--- a/mlopscourse/train.py
+++ b/mlopscourse/train.py
@@ -1,6 +1,8 @@
 import os
 
 import fire
+from hydra import compose
+from omegaconf import DictConfig, OmegaConf
 
 from .data.prepare_dataset import load_dataset
 from .models.models_zoo import prepare_model
@@ -12,13 +14,16 @@ class Trainer:
 
     Attributes
     ----------
-    model_type : str
-        The type of model for training. Should be "rf" for RandomForest and "cb"
-        for CatBoost.
+    cfg : omegaconf.DictConfig
+        The configuration containing the model type and hyperparameters, training and
+        inference parameters.
     """
 
-    def __init__(self, model_type: str) -> None:
-        self.model_type = model_type
+    def __init__(self, config_name: str, **kwargs: dict) -> None:
+        self.cfg: DictConfig = compose(
+            config_name=config_name, overrides=[f"{k}={v}" for k, v in kwargs.items()]
+        )
+        print(OmegaConf.to_yaml(self.cfg))
 
     def train(self) -> None:
         (
@@ -27,9 +32,10 @@ def train(self) -> None:
             numerical_features,
             categorical_features,
         ) = load_dataset(split="train")
-        model = prepare_model(self.model_type, numerical_features, categorical_features)
 
-        print(f"Training the {self.model_type} model...")
+        model = prepare_model(self.cfg, numerical_features, categorical_features)
+
+        print(f"Training the {self.cfg.model.name} model...")
         model.train(X_train, y_train)
 
         os.makedirs("checkpoints", exist_ok=True)
diff --git a/poetry.lock b/poetry.lock
index 2f2afe4..d7eacb3 100644
--- a/poetry.lock
+++ b/poetry.lock
@@ -3529,4 +3529,4 @@ testing = ["big-O", "jaraco.functools", "jaraco.itertools", "more-itertools", "p
 [metadata]
 lock-version = "2.0"
 python-versions = ">= 3.9, < 3.13"
-content-hash = "9a89905b4eaada199038f60bf33876eac300737e58b619c00427afc8ec57644f"
+content-hash = "0342bc00755934e3708f20a4477a4ca42fc3a8f0834da73ea92d401b38a6e251"
diff --git a/pyproject.toml b/pyproject.toml
index 9d6f3e0..f46551e 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -13,6 +13,7 @@ numpy = "^1.26.0"
 scikit-learn = "^1.3.1"
 catboost = "^1.2.2"
 fire = "^0.5.0"
+hydra-core = "^1.3.2"
 
 [tool.poetry.group.dev.dependencies]
 pre-commit = "^3.4.0"