From d92cbc77ee0f30d9a26d4ac0d7f2b632c63c3a67 Mon Sep 17 00:00:00 2001 From: Svyatoslav Date: Tue, 28 Nov 2023 12:21:53 +0300 Subject: [PATCH] Hydra was added. Dataclasses weren't added because they seem to make things messier --- README.md | 20 ++++++++---- commands.py | 48 +++++++++++++++++++++-------- mlopscourse/configs/cb_config.yaml | 18 +++++++++++ mlopscourse/configs/rf_config.yaml | 11 +++++++ mlopscourse/infer.py | 27 ++++++++-------- mlopscourse/models/base.py | 14 +++++++-- mlopscourse/models/catboost.py | 28 +++++------------ mlopscourse/models/models_zoo.py | 14 +++++---- mlopscourse/models/random_forest.py | 15 +++++---- mlopscourse/train.py | 20 +++++++----- poetry.lock | 2 +- pyproject.toml | 1 + 12 files changed, 141 insertions(+), 77 deletions(-) create mode 100644 mlopscourse/configs/cb_config.yaml create mode 100644 mlopscourse/configs/rf_config.yaml diff --git a/README.md b/README.md index 8a6fded..20fa00b 100644 --- a/README.md +++ b/README.md @@ -1,5 +1,8 @@ # MLOpsCourse +This repository is dedicated to the easy-to-understand practice of the standard MLOps +techniques. + ## Task Description The "Bike Rentals" dataset is used for scripts in this repository. 
This dataset contains @@ -40,7 +43,7 @@ To setup only the necessary dependencies, run the following: poetry install --without dev ``` -If you want to use `pre-commit`, install all the dependencies: +If you want to use `pre-commit` and `dvc`, install all the dependencies: ``` poetry install @@ -61,20 +64,25 @@ The command should download two .csv files from my GDrive and place them inside ### Training -If you want to train the chosen model and save it afterwards, run: +If you want to train the chosen model and save it afterwards, place its configuration file +in the `mlopscourse/configs` directory and run: ``` -poetry run python3 commands.py train --model_type [chosen_model] +poetry run python3 commands.py train --config_name [config_name_without_extension] ``` The available models are `rf` (Random Forest from the `scikit-learn` library) and `cb` -(Yandex's CatBoost). +(Yandex's CatBoost), so an example with the CatBoost would be the following: + +``` +poetry run python3 commands.py train --config_name cb_config +``` ### Evaluation If you want to infer a previously trained model, make sure you've placed the checkpoint in -`checkpoints/` and then run +`checkpoints/` and the configuration file in `mlopscourse/configs` then run ``` -poetry run python3 commands.py infer --model_type [chosen_model] --ckpt [checkpoint_filename_with_extension] +poetry run python3 commands.py infer --config_name [config_name_without_extension] ``` diff --git a/commands.py b/commands.py index 4fa6568..1734ff3 100644 --- a/commands.py +++ b/commands.py @@ -1,36 +1,58 @@ import fire +from hydra import initialize from mlopscourse.infer import Inferencer from mlopscourse.train import Trainer -def train(model_type: str) -> None: +def train( + config_name: str, + config_path: str = "mlopscourse/configs/", + hydra_version_base: str = "1.3", + **kwargs: dict, +) -> None: """ Trains the chosen model on the train split of the dataset and saves the checkpoint. 
Parameters ---------- - model_type : str - The type of model for training. Should be "rf" for RandomForest and "cb" - for CatBoost. + config_name : str + The name of the configuration file to use for model, training and inference + hyperparameters. + config_path : str + The path to the configuration files. + hydra_version_base : str + The compatibility level of hydra to use. + **kwargs : dict, optional + Values of the configuration file to override. """ - Trainer(model_type).train() + with initialize(config_path=config_path, version_base=hydra_version_base): + Trainer(config_name, **kwargs).train() -def infer(model_type: str, ckpt: str) -> None: +def infer( + config_name: str, + config_path: str = "mlopscourse/configs/", + hydra_version_base: str = "1.3", + **kwargs: dict, +) -> None: """ Runs the chosen model on the test set of the dataset and calculates the R^2 metric. Parameters ---------- - model_type : str - The type of model that was used for training. Should be "rf" for RandomForest - and "cb" for CatBoost. - ckpt : str - The filename inside 'checkpoint/' to load the model from. Should also contain the - the filename extension. + config_name : str + The name of the configuration file to use for model, training and inference + hyperparameters. + config_path : str + The path to the configuration files. + hydra_version_base : str + The compatibility level of hydra to use. + **kwargs : dict, optional + Values of the configuration file to override. 
""" - Inferencer(model_type, ckpt).infer() + with initialize(config_path=config_path, version_base=hydra_version_base): + Inferencer(config_name, **kwargs).infer() if __name__ == "__main__": diff --git a/mlopscourse/configs/cb_config.yaml b/mlopscourse/configs/cb_config.yaml new file mode 100644 index 0000000..9f37ea4 --- /dev/null +++ b/mlopscourse/configs/cb_config.yaml @@ -0,0 +1,18 @@ +model: + name: cb + hyperparams: + depth: 6 + n_estimators: 1000 + eval_metric: R2 + task_type: CPU + random_seed: 0 + learning_rate: 0.3 + l2_leaf_reg: 3 + loss_function: RMSE + metric_period: 100 + +training: + checkpoint_name: cb_model.p + +inference: + checkpoint_name: cb_model.p diff --git a/mlopscourse/configs/rf_config.yaml b/mlopscourse/configs/rf_config.yaml new file mode 100644 index 0000000..82e4efd --- /dev/null +++ b/mlopscourse/configs/rf_config.yaml @@ -0,0 +1,11 @@ +model: + name: rf + hyperparams: + n_estimators: 500 + random_state: 0 + +training: + checkpoint_name: rf_model.p + +inference: + checkpoint_name: rf_model.p diff --git a/mlopscourse/infer.py b/mlopscourse/infer.py index 48c31a1..fbc9961 100644 --- a/mlopscourse/infer.py +++ b/mlopscourse/infer.py @@ -2,6 +2,8 @@ import pickle import fire +from hydra import compose +from omegaconf import DictConfig, OmegaConf from .data.prepare_dataset import load_dataset @@ -12,17 +14,16 @@ class Inferencer: Attributes ---------- - model_type : str - The type of model that was used for training. Should be "rf" for RandomForest - and "cb" for CatBoost. - ckpt : str - The filename inside 'checkpoint/' to load the model from. Should also contain the - the filename extension. + cfg : omegaconf.DictConfig + The configuration containing the model type and hyperparameters, training and + inference parameters. 
""" - def __init__(self, model_type: str, ckpt: str) -> None: - self.model_type = model_type - self.ckpt = ckpt + def __init__(self, config_name: str, **kwargs: dict) -> None: + self.cfg: DictConfig = compose( + config_name=config_name, overrides=[f"{k}={v}" for k, v in kwargs.items()] + ) + print(OmegaConf.to_yaml(self.cfg)) def infer(self) -> None: ( @@ -32,14 +33,14 @@ def infer(self) -> None: _, ) = load_dataset(split="test") - with open(f"checkpoints/{self.ckpt}", "rb") as f: + with open(f"checkpoints/{self.cfg.inference.checkpoint_name}", "rb") as f: model = pickle.load(f) - print(f"Evaluating the {self.model_type} model...") + print(f"Evaluating the {self.cfg.model.name} model...") y_preds = model.eval(X_test, y_test) os.makedirs("predictions", exist_ok=True) - preds_name = self.ckpt.split(".")[0] - y_preds.to_csv(f"predictions/{preds_name}_preds.csv") + ckpt_name = self.cfg.inference.checkpoint_name.split(".")[0] + y_preds.to_csv(f"predictions/{ckpt_name}_preds.csv") if __name__ == "__main__": diff --git a/mlopscourse/models/base.py b/mlopscourse/models/base.py index 134780e..07b0f4a 100644 --- a/mlopscourse/models/base.py +++ b/mlopscourse/models/base.py @@ -1,15 +1,20 @@ +import pickle +from abc import ABCMeta, abstractmethod from typing import Optional import pandas as pd +from omegaconf import DictConfig -class BaseModel: +class BaseModel(metaclass=ABCMeta): """Represents an interface that any model used must implement.""" - def __init__(self) -> None: + def __init__(self, cfg: DictConfig) -> None: + self.cfg = cfg self.preprocessor = None self.model = None + @abstractmethod def train( self, X_train: pd.DataFrame, @@ -19,11 +24,14 @@ def train( ) -> None: raise NotImplementedError() + @abstractmethod def eval(self, X_test: pd.DataFrame, y_test: pd.Series) -> pd.Series: raise NotImplementedError() + @abstractmethod def __call__(self, X_sample: pd.DataFrame) -> pd.Series: raise NotImplementedError() def save_checkpoint(self, path: str) -> None: - raise 
NotImplementedError() + with open(path + self.cfg.training.checkpoint_name, "wb") as f: + pickle.dump(self, f) diff --git a/mlopscourse/models/catboost.py b/mlopscourse/models/catboost.py index c54047f..d8de95c 100644 --- a/mlopscourse/models/catboost.py +++ b/mlopscourse/models/catboost.py @@ -1,34 +1,26 @@ -import pickle from typing import List, Optional import numpy as np import pandas as pd from catboost import CatBoostRegressor, Pool +from omegaconf import DictConfig from sklearn.metrics import r2_score from .base import BaseModel class CatboostModel(BaseModel): - """A basic Random Forest model from sklearn.""" + """The Yandex's CatBoost.""" def __init__( - self, numerical_features: List[str], categorical_features: List[str] + self, + cfg: DictConfig, + numerical_features: List[str], + categorical_features: List[str], ) -> None: - super().__init__() + super().__init__(cfg) - self.hyperparams = { - "learning_rate": 0.3, - "depth": 6, - "l2_leaf_reg": 3, - "loss_function": "RMSE", - "eval_metric": "R2", - "random_seed": 0, - "task_type": "CPU", - "n_estimators": 1000, - "metric_period": 100, - } - self.model = CatBoostRegressor(**self.hyperparams) + self.model = CatBoostRegressor(**cfg.model.hyperparams) self.numerical_features = numerical_features self.categorical_features = categorical_features @@ -76,7 +68,3 @@ def __call__(self, X_sample: pd.DataFrame) -> np.ndarray: feature_names=list(X_sample.columns), ) return self.model.predict(sample_data) - - def save_checkpoint(self, path: str) -> None: - with open(path + "model_cb.p", "wb") as f: - pickle.dump(self, f) diff --git a/mlopscourse/models/models_zoo.py b/mlopscourse/models/models_zoo.py index 5eb5de4..c442f13 100644 --- a/mlopscourse/models/models_zoo.py +++ b/mlopscourse/models/models_zoo.py @@ -1,18 +1,20 @@ from typing import List +from omegaconf import DictConfig + from .base import BaseModel from .catboost import CatboostModel from .random_forest import RandomForest def prepare_model( - model_type: 
str, + cfg: DictConfig, numerical_features: List[str], categorical_features: List[str], ) -> BaseModel: - if model_type == "rf": - return RandomForest(numerical_features, categorical_features) - elif model_type == "cb": - return CatboostModel(numerical_features, categorical_features) + if cfg.model.name == "rf": + return RandomForest(cfg, numerical_features, categorical_features) + elif cfg.model.name == "cb": + return CatboostModel(cfg, numerical_features, categorical_features) else: - raise AssertionError(f"Unknown model name: {model_type}") + raise AssertionError(f"Unknown model name: {cfg.model.name}") diff --git a/mlopscourse/models/random_forest.py b/mlopscourse/models/random_forest.py index cb533c9..97c16c0 100644 --- a/mlopscourse/models/random_forest.py +++ b/mlopscourse/models/random_forest.py @@ -1,8 +1,8 @@ -import pickle from typing import List, Optional import numpy as np import pandas as pd +from omegaconf import DictConfig from sklearn.compose import ColumnTransformer from sklearn.ensemble import RandomForestRegressor from sklearn.pipeline import make_pipeline @@ -15,9 +15,12 @@ class RandomForest(BaseModel): """A basic Random Forest model from sklearn.""" def __init__( - self, numerical_features: List[str], categorical_features: List[str] + self, + cfg: DictConfig, + numerical_features: List[str], + categorical_features: List[str], ) -> None: - super().__init__() + super().__init__(cfg) self.preprocessor = ColumnTransformer( transformers=[ @@ -28,7 +31,7 @@ def __init__( verbose_feature_names_out=False, ).set_output(transform="pandas") self.model = make_pipeline( - self.preprocessor, RandomForestRegressor(random_state=0) + self.preprocessor, RandomForestRegressor(**cfg.model.hyperparams) ) def train( @@ -49,7 +52,3 @@ def eval(self, X_test: pd.DataFrame, y_test: pd.Series) -> pd.Series: def __call__(self, X_sample: pd.DataFrame) -> np.ndarray: return self.model.predict(X_sample) - - def save_checkpoint(self, path: str) -> None: - with open(path + 
"model_rf.p", "wb") as f: - pickle.dump(self, f) diff --git a/mlopscourse/train.py b/mlopscourse/train.py index ec2ace8..b83d4b7 100644 --- a/mlopscourse/train.py +++ b/mlopscourse/train.py @@ -1,6 +1,8 @@ import os import fire +from hydra import compose +from omegaconf import DictConfig, OmegaConf from .data.prepare_dataset import load_dataset from .models.models_zoo import prepare_model @@ -12,13 +14,16 @@ class Trainer: Attributes ---------- - model_type : str - The type of model for training. Should be "rf" for RandomForest and "cb" - for CatBoost. + cfg : omegaconf.DictConfig + The configuration containing the model type and hyperparameters, training and + inference parameters. """ - def __init__(self, model_type: str) -> None: - self.model_type = model_type + def __init__(self, config_name: str, **kwargs: dict) -> None: + self.cfg: DictConfig = compose( + config_name=config_name, overrides=[f"{k}={v}" for k, v in kwargs.items()] + ) + print(OmegaConf.to_yaml(self.cfg)) def train(self) -> None: ( @@ -27,9 +32,10 @@ def train(self) -> None: numerical_features, categorical_features, ) = load_dataset(split="train") - model = prepare_model(self.model_type, numerical_features, categorical_features) - print(f"Training the {self.model_type} model...") + model = prepare_model(self.cfg, numerical_features, categorical_features) + + print(f"Training the {self.cfg.model.name} model...") model.train(X_train, y_train) os.makedirs("checkpoints", exist_ok=True) diff --git a/poetry.lock b/poetry.lock index 2f2afe4..d7eacb3 100644 --- a/poetry.lock +++ b/poetry.lock @@ -3529,4 +3529,4 @@ testing = ["big-O", "jaraco.functools", "jaraco.itertools", "more-itertools", "p [metadata] lock-version = "2.0" python-versions = ">= 3.9, < 3.13" -content-hash = "9a89905b4eaada199038f60bf33876eac300737e58b619c00427afc8ec57644f" +content-hash = "0342bc00755934e3708f20a4477a4ca42fc3a8f0834da73ea92d401b38a6e251" diff --git a/pyproject.toml b/pyproject.toml index 9d6f3e0..f46551e 100644 --- 
a/pyproject.toml +++ b/pyproject.toml @@ -13,6 +13,7 @@ numpy = "^1.26.0" scikit-learn = "^1.3.1" catboost = "^1.2.2" fire = "^0.5.0" +hydra-core = "^1.3.2" [tool.poetry.group.dev.dependencies] pre-commit = "^3.4.0"