
Commit

Hydra was added. Dataclasses weren't added because they seem to make things messier.
TopCoder2K committed Nov 28, 2023
1 parent f0dd799 commit d92cbc7
Showing 12 changed files with 141 additions and 77 deletions.
20 changes: 14 additions & 6 deletions README.md
@@ -1,5 +1,8 @@
# MLOpsCourse

This repository is dedicated to easy-to-understand practice of standard MLOps
techniques.

## Task Description

The "Bike Rentals" dataset is used for scripts in this repository. This dataset contains
@@ -40,7 +43,7 @@ To setup only the necessary dependencies, run the following:
poetry install --without dev
```

If you want to use `pre-commit`, install all the dependencies:
If you want to use `pre-commit` and `dvc`, install all the dependencies:

```
poetry install
@@ -61,20 +64,25 @@ The command should download two .csv files from my GDrive and place them inside

### Training

If you want to train the chosen model and save it afterwards, run:
If you want to train the chosen model and save it afterwards, place its configuration file
in the `mlopscourse/configs` directory and run:

```
poetry run python3 commands.py train --model_type [chosen_model]
poetry run python3 commands.py train --config_name [config_name_without_extension]
```

The available models are `rf` (Random Forest from the `scikit-learn` library) and `cb`
(Yandex's CatBoost).
(Yandex's CatBoost), so an example with CatBoost would be the following:

```
poetry run python3 commands.py train --config_name cb_config
```

### Evaluation

If you want to infer a previously trained model, make sure you've placed the checkpoint in
`checkpoints/` and then run
`checkpoints/` and the configuration file in `mlopscourse/configs`, then run

```
poetry run python3 commands.py infer --model_type [chosen_model] --ckpt [checkpoint_filename_with_extension]
poetry run python3 commands.py infer --config_name [config_name_without_extension]
```
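
For example, assuming the CatBoost training run above left `cb_model.p` in `checkpoints/` (the default `inference.checkpoint_name` in `cb_config.yaml`), the evaluation call would be:

```
poetry run python3 commands.py infer --config_name cb_config
```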
48 changes: 35 additions & 13 deletions commands.py
@@ -1,36 +1,58 @@
import fire
from hydra import initialize

from mlopscourse.infer import Inferencer
from mlopscourse.train import Trainer


def train(model_type: str) -> None:
def train(
config_name: str,
config_path: str = "mlopscourse/configs/",
hydra_version_base: str = "1.3",
**kwargs: dict,
) -> None:
"""
Trains the chosen model on the train split of the dataset and saves the checkpoint.
Parameters
----------
model_type : str
The type of model for training. Should be "rf" for RandomForest and "cb"
for CatBoost.
config_name : str
The name of the configuration file to use for model, training and inference
hyperparameters.
config_path : str
The path to the configuration files.
hydra_version_base : str
The compatibility level of Hydra to use.
**kwargs : dict, optional
Values of the configuration file to override.
"""
Trainer(model_type).train()
with initialize(config_path=config_path, version_base=hydra_version_base):
Trainer(config_name, **kwargs).train()


def infer(model_type: str, ckpt: str) -> None:
def infer(
config_name: str,
config_path: str = "mlopscourse/configs/",
hydra_version_base: str = "1.3",
**kwargs: dict,
) -> None:
"""
Runs the chosen model on the test set of the dataset and calculates the R^2 metric.
Parameters
----------
model_type : str
The type of model that was used for training. Should be "rf" for RandomForest
and "cb" for CatBoost.
ckpt : str
The filename inside 'checkpoint/' to load the model from. Should also contain the
the filename extension.
config_name : str
The name of the configuration file to use for model, training and inference
hyperparameters.
config_path : str
The path to the configuration files.
hydra_version_base : str
The compatibility level of Hydra to use.
**kwargs : dict, optional
Values of the configuration file to override.
"""
Inferencer(model_type, ckpt).infer()
with initialize(config_path=config_path, version_base=hydra_version_base):
Inferencer(config_name, **kwargs).infer()


if __name__ == "__main__":
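
Since the extra `**kwargs` are forwarded into `Trainer` and `Inferencer`, which convert them into Hydra override strings (see `mlopscourse/infer.py` below), individual config values can also be overridden programmatically. A minimal sketch, assuming `Trainer` builds its overrides the same way `Inferencer` does; the depth value is arbitrary:

```
from hydra import initialize

from mlopscourse.train import Trainer

# Each keyword becomes a "key=value" Hydra override, so nested fields use dot
# notation; this replaces the depth defined in cb_config.yaml for a single run.
with initialize(config_path="mlopscourse/configs/", version_base="1.3"):
    Trainer("cb_config", **{"model.hyperparams.depth": 8}).train()
```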
18 changes: 18 additions & 0 deletions mlopscourse/configs/cb_config.yaml
@@ -0,0 +1,18 @@
model:
name: cb
hyperparams:
depth: 6
n_estimators: 1000
eval_metric: R2
task_type: CPU
random_seed: 0
learning_rate: 0.3
l2_leaf_reg: 3
loss_function: RMSE
metric_period: 100

training:
checkpoint_name: cb_model.p

inference:
checkpoint_name: cb_model.p
11 changes: 11 additions & 0 deletions mlopscourse/configs/rf_config.yaml
@@ -0,0 +1,11 @@
model:
name: rf
hyperparams:
n_estimators: 500
random_state: 0

training:
checkpoint_name: rf_model.p

inference:
checkpoint_name: rf_model.p
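
As a quick sanity check, either of the configs above can be composed and inspected outside the training code. A minimal sketch, assuming it is run from a script in the repository root so that the relative `config_path` matches the one used in `commands.py`:

```
from hydra import compose, initialize
from omegaconf import OmegaConf

with initialize(config_path="mlopscourse/configs/", version_base="1.3"):
    cfg = compose(config_name="rf_config")

# Prints the model/training/inference sections exactly as written in the YAML.
print(OmegaConf.to_yaml(cfg))
print(cfg.model.hyperparams.n_estimators)  # 500
```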
27 changes: 14 additions & 13 deletions mlopscourse/infer.py
@@ -2,6 +2,8 @@
import pickle

import fire
from hydra import compose
from omegaconf import DictConfig, OmegaConf

from .data.prepare_dataset import load_dataset

@@ -12,17 +14,16 @@ class Inferencer:
Attributes
----------
model_type : str
The type of model that was used for training. Should be "rf" for RandomForest
and "cb" for CatBoost.
ckpt : str
The filename inside 'checkpoint/' to load the model from. Should also contain the
the filename extension.
cfg : omegaconf.DictConfig
The configuration containing the model type and hyperparameters, training and
inference parameters.
"""

def __init__(self, model_type: str, ckpt: str) -> None:
self.model_type = model_type
self.ckpt = ckpt
def __init__(self, config_name: str, **kwargs: dict) -> None:
self.cfg: DictConfig = compose(
config_name=config_name, overrides=[f"{k}={v}" for k, v in kwargs.items()]
)
print(OmegaConf.to_yaml(self.cfg))

def infer(self) -> None:
(
@@ -32,14 +33,14 @@ def infer(self) -> None:
_,
) = load_dataset(split="test")

with open(f"checkpoints/{self.ckpt}", "rb") as f:
with open(f"checkpoints/{self.cfg.inference.checkpoint_name}", "rb") as f:
model = pickle.load(f)
print(f"Evaluating the {self.model_type} model...")
print(f"Evaluating the {self.cfg.model.name} model...")
y_preds = model.eval(X_test, y_test)

os.makedirs("predictions", exist_ok=True)
preds_name = self.ckpt.split(".")[0]
y_preds.to_csv(f"predictions/{preds_name}_preds.csv")
ckpt_name = self.cfg.inference.checkpoint_name.split(".")[0]
y_preds.to_csv(f"predictions/{ckpt_name}_preds.csv")


if __name__ == "__main__":
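
Because `Inferencer` composes its config with overrides built from the extra keyword arguments, a differently named checkpoint can be evaluated without editing the YAML. A small sketch; the checkpoint file name below is hypothetical and would have to exist in `checkpoints/`:

```
from hydra import initialize

from mlopscourse.infer import Inferencer

# "inference.checkpoint_name" is a field of cb_config.yaml; "cb_model_v2.p"
# is a made-up file name used only for illustration.
with initialize(config_path="mlopscourse/configs/", version_base="1.3"):
    Inferencer("cb_config", **{"inference.checkpoint_name": "cb_model_v2.p"}).infer()
```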
14 changes: 11 additions & 3 deletions mlopscourse/models/base.py
@@ -1,15 +1,20 @@
import pickle
from abc import ABCMeta, abstractmethod
from typing import Optional

import pandas as pd
from omegaconf import DictConfig


class BaseModel:
class BaseModel(metaclass=ABCMeta):
"""Represents an interface that any model used must implement."""

def __init__(self) -> None:
def __init__(self, cfg: DictConfig) -> None:
self.cfg = cfg
self.preprocessor = None
self.model = None

@abstractmethod
def train(
self,
X_train: pd.DataFrame,
@@ -19,11 +24,14 @@ ) -> None:
) -> None:
raise NotImplementedError()

@abstractmethod
def eval(self, X_test: pd.DataFrame, y_test: pd.Series) -> pd.Series:
raise NotImplementedError()

@abstractmethod
def __call__(self, X_sample: pd.DataFrame) -> pd.Series:
raise NotImplementedError()

def save_checkpoint(self, path: str) -> None:
raise NotImplementedError()
with open(path + self.cfg.training.checkpoint_name, "wb") as f:
pickle.dump(self, f)
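
With `BaseModel` now an abstract base class, a new model must override `train`, `eval`, and `__call__`, while `save_checkpoint` is inherited and pickles the whole object under `cfg.training.checkpoint_name`. A toy conforming subclass, not part of the commit (the exact `train` signature is elided in the diff, so extra arguments are absorbed generically):

```
from typing import Any, Optional

import pandas as pd
from omegaconf import DictConfig

from mlopscourse.models.base import BaseModel


class MeanRegressor(BaseModel):
    """A hypothetical baseline that always predicts the training-set mean."""

    def __init__(self, cfg: DictConfig) -> None:
        super().__init__(cfg)
        self.mean: Optional[float] = None

    def train(
        self, X_train: pd.DataFrame, y_train: pd.Series, *args: Any, **kwargs: Any
    ) -> None:
        self.mean = float(y_train.mean())

    def eval(self, X_test: pd.DataFrame, y_test: pd.Series) -> pd.Series:
        return self(X_test)

    def __call__(self, X_sample: pd.DataFrame) -> pd.Series:
        # Broadcast the stored mean over the index of the incoming samples.
        return pd.Series(self.mean, index=X_sample.index)
```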
28 changes: 8 additions & 20 deletions mlopscourse/models/catboost.py
@@ -1,34 +1,26 @@
import pickle
from typing import List, Optional

import numpy as np
import pandas as pd
from catboost import CatBoostRegressor, Pool
from omegaconf import DictConfig
from sklearn.metrics import r2_score

from .base import BaseModel


class CatboostModel(BaseModel):
"""A basic Random Forest model from sklearn."""
"""The Yandex's CatBoost."""

def __init__(
self, numerical_features: List[str], categorical_features: List[str]
self,
cfg: DictConfig,
numerical_features: List[str],
categorical_features: List[str],
) -> None:
super().__init__()
super().__init__(cfg)

self.hyperparams = {
"learning_rate": 0.3,
"depth": 6,
"l2_leaf_reg": 3,
"loss_function": "RMSE",
"eval_metric": "R2",
"random_seed": 0,
"task_type": "CPU",
"n_estimators": 1000,
"metric_period": 100,
}
self.model = CatBoostRegressor(**self.hyperparams)
self.model = CatBoostRegressor(**cfg.model.hyperparams)
self.numerical_features = numerical_features
self.categorical_features = categorical_features

@@ -76,7 +68,3 @@ def __call__(self, X_sample: pd.DataFrame) -> np.ndarray:
feature_names=list(X_sample.columns),
)
return self.model.predict(sample_data)

def save_checkpoint(self, path: str) -> None:
with open(path + "model_cb.p", "wb") as f:
pickle.dump(self, f)
14 changes: 8 additions & 6 deletions mlopscourse/models/models_zoo.py
@@ -1,18 +1,20 @@
from typing import List

from omegaconf import DictConfig

from .base import BaseModel
from .catboost import CatboostModel
from .random_forest import RandomForest


def prepare_model(
model_type: str,
cfg: DictConfig,
numerical_features: List[str],
categorical_features: List[str],
) -> BaseModel:
if model_type == "rf":
return RandomForest(numerical_features, categorical_features)
elif model_type == "cb":
return CatboostModel(numerical_features, categorical_features)
if cfg.model.name == "rf":
return RandomForest(cfg, numerical_features, categorical_features)
elif cfg.model.name == "cb":
return CatboostModel(cfg, numerical_features, categorical_features)
else:
raise AssertionError(f"Unknown model name: {model_type}")
raise AssertionError(f"Unknown model name: {cfg.model.name}")
15 changes: 7 additions & 8 deletions mlopscourse/models/random_forest.py
@@ -1,8 +1,8 @@
import pickle
from typing import List, Optional

import numpy as np
import pandas as pd
from omegaconf import DictConfig
from sklearn.compose import ColumnTransformer
from sklearn.ensemble import RandomForestRegressor
from sklearn.pipeline import make_pipeline
@@ -15,9 +15,12 @@ class RandomForest(BaseModel):
"""A basic Random Forest model from sklearn."""

def __init__(
self, numerical_features: List[str], categorical_features: List[str]
self,
cfg: DictConfig,
numerical_features: List[str],
categorical_features: List[str],
) -> None:
super().__init__()
super().__init__(cfg)

self.preprocessor = ColumnTransformer(
transformers=[
@@ -28,7 +31,7 @@ def __init__(
verbose_feature_names_out=False,
).set_output(transform="pandas")
self.model = make_pipeline(
self.preprocessor, RandomForestRegressor(random_state=0)
self.preprocessor, RandomForestRegressor(**cfg.model.hyperparams)
)

def train(
@@ -49,7 +52,3 @@ def eval(self, X_test: pd.DataFrame, y_test: pd.Series) -> pd.Series:

def __call__(self, X_sample: pd.DataFrame) -> np.ndarray:
return self.model.predict(X_sample)

def save_checkpoint(self, path: str) -> None:
with open(path + "model_rf.p", "wb") as f:
pickle.dump(self, f)