
Commit

Hydra was added. Dataclasses weren't added because they seem to make things messier.
TopCoder2K committed Nov 28, 2023
1 parent f0dd799 commit d92cbc7
Showing 12 changed files with 141 additions and 77 deletions.
20 changes: 14 additions & 6 deletions README.md
@@ -1,5 +1,8 @@
# MLOpsCourse

This repository is dedicated to easy-to-understand practice of standard MLOps
techniques.

## Task Description

The "Bike Rentals" dataset is used for scripts in this repository. This dataset contains
@@ -40,7 +43,7 @@ To setup only the necessary dependencies, run the following:
poetry install --without dev
```

If you want to use `pre-commit`, install all the dependencies:
If you want to use `pre-commit` and `dvc`, install all the dependencies:

```
poetry install
@@ -61,20 +64,25 @@ The command should download two .csv files from my GDrive and place them inside

### Training

If you want to train the chosen model and save it afterwards, run:
If you want to train the chosen model and save it afterwards, place its configuration file
in the `mlopscourse/configs` directory and run:

```
poetry run python3 commands.py train --model_type [chosen_model]
poetry run python3 commands.py train --config_name [config_name_without_extension]
```

The available models are `rf` (Random Forest from the `scikit-learn` library) and `cb`
(Yandex's CatBoost).
(Yandex's CatBoost), so an example with CatBoost would be the following:

```
poetry run python3 commands.py train --config_name cb_config
```

### Evaluation

If you want to infer a previously trained model, make sure you've placed the checkpoint in
`checkpoints/` and then run
`checkpoints/` and the configuration file in `mlopscourse/configs`, then run

```
poetry run python3 commands.py infer --model_type [chosen_model] --ckpt [checkpoint_filename_with_extension]
poetry run python3 commands.py infer --config_name [config_name_without_extension]
```
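
For example, assuming the CatBoost training run above left `cb_model.p` in `checkpoints/` (the default `inference.checkpoint_name` in `cb_config.yaml`), the evaluation call would be:

```
poetry run python3 commands.py infer --config_name cb_config
```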
48 changes: 35 additions & 13 deletions commands.py
@@ -1,36 +1,58 @@
import fire
from hydra import initialize

from mlopscourse.infer import Inferencer
from mlopscourse.train import Trainer


def train(model_type: str) -> None:
def train(
config_name: str,
config_path: str = "mlopscourse/configs/",
hydra_version_base: str = "1.3",
**kwargs: dict,
) -> None:
"""
Trains the chosen model on the train split of the dataset and saves the checkpoint.
Parameters
----------
model_type : str
The type of model for training. Should be "rf" for RandomForest and "cb"
for CatBoost.
config_name : str
The name of the configuration file to use for model, training and inference
hyperparameters.
config_path : str
The path to the configuration files.
hydra_version_base : str
The compatibility level of Hydra to use.
**kwargs : dict, optional
Values of the configuration file to override.
"""
Trainer(model_type).train()
with initialize(config_path=config_path, version_base=hydra_version_base):
Trainer(config_name, **kwargs).train()


def infer(model_type: str, ckpt: str) -> None:
def infer(
config_name: str,
config_path: str = "mlopscourse/configs/",
hydra_version_base: str = "1.3",
**kwargs: dict,
) -> None:
"""
Runs the chosen model on the test set of the dataset and calculates the R^2 metric.
Parameters
----------
model_type : str
The type of model that was used for training. Should be "rf" for RandomForest
and "cb" for CatBoost.
ckpt : str
The filename inside 'checkpoint/' to load the model from. Should also contain the
the filename extension.
config_name : str
The name of the configuration file to use for model, training and inference
hyperparameters.
config_path : str
The path to the configuration files.
hydra_version_base : str
The compatibility level of Hydra to use.
**kwargs : dict, optional
Values of the configuration file to override.
"""
Inferencer(model_type, ckpt).infer()
with initialize(config_path=config_path, version_base=hydra_version_base):
Inferencer(config_name, **kwargs).infer()


if __name__ == "__main__":
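
Since the extra `**kwargs` are forwarded into `Trainer` and `Inferencer`, which convert them into Hydra override strings (see `mlopscourse/infer.py` below), individual config values can also be overridden programmatically. A minimal sketch, assuming `Trainer` builds its overrides the same way `Inferencer` does; the depth value is arbitrary:

```
from hydra import initialize

from mlopscourse.train import Trainer

# Each keyword becomes a "key=value" Hydra override, so nested fields use dot
# notation; this replaces the depth defined in cb_config.yaml for a single run.
with initialize(config_path="mlopscourse/configs/", version_base="1.3"):
    Trainer("cb_config", **{"model.hyperparams.depth": 8}).train()
```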
18 changes: 18 additions & 0 deletions mlopscourse/configs/cb_config.yaml
@@ -0,0 +1,18 @@
model:
name: cb
hyperparams:
depth: 6
n_estimators: 1000
eval_metric: R2
task_type: CPU
random_seed: 0
learning_rate: 0.3
l2_leaf_reg: 3
loss_function: RMSE
metric_period: 100

training:
checkpoint_name: cb_model.p

inference:
checkpoint_name: cb_model.p
11 changes: 11 additions & 0 deletions mlopscourse/configs/rf_config.yaml
@@ -0,0 +1,11 @@
model:
name: rf
hyperparams:
n_estimators: 500
random_state: 0

training:
checkpoint_name: rf_model.p

inference:
checkpoint_name: rf_model.p
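
As a quick sanity check, either of the configs above can be composed and inspected outside the training code. A minimal sketch, assuming it is run from a script in the repository root so that the relative `config_path` matches the one used in `commands.py`:

```
from hydra import compose, initialize
from omegaconf import OmegaConf

with initialize(config_path="mlopscourse/configs/", version_base="1.3"):
    cfg = compose(config_name="rf_config")

# Prints the model/training/inference sections exactly as written in the YAML.
print(OmegaConf.to_yaml(cfg))
print(cfg.model.hyperparams.n_estimators)  # 500
```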
27 changes: 14 additions & 13 deletions mlopscourse/infer.py
@@ -2,6 +2,8 @@
import pickle

import fire
from hydra import compose
from omegaconf import DictConfig, OmegaConf

from .data.prepare_dataset import load_dataset

@@ -12,17 +14,16 @@ class Inferencer:
Attributes
----------
model_type : str
The type of model that was used for training. Should be "rf" for RandomForest
and "cb" for CatBoost.
ckpt : str
The filename inside 'checkpoint/' to load the model from. Should also contain the
the filename extension.
cfg : omegaconf.DictConfig
The configuration containing the model type and hyperparameters, training and
inference parameters.
"""

def __init__(self, model_type: str, ckpt: str) -> None:
self.model_type = model_type
self.ckpt = ckpt
def __init__(self, config_name: str, **kwargs: dict) -> None:
self.cfg: DictConfig = compose(
config_name=config_name, overrides=[f"{k}={v}" for k, v in kwargs.items()]
)
print(OmegaConf.to_yaml(self.cfg))

def infer(self) -> None:
(
@@ -32,14 +33,14 @@ def infer(self) -> None:
_,
) = load_dataset(split="test")

with open(f"checkpoints/{self.ckpt}", "rb") as f:
with open(f"checkpoints/{self.cfg.inference.checkpoint_name}", "rb") as f:
model = pickle.load(f)
print(f"Evaluating the {self.model_type} model...")
print(f"Evaluating the {self.cfg.model.name} model...")
y_preds = model.eval(X_test, y_test)

os.makedirs("predictions", exist_ok=True)
preds_name = self.ckpt.split(".")[0]
y_preds.to_csv(f"predictions/{preds_name}_preds.csv")
ckpt_name = self.cfg.inference.checkpoint_name.split(".")[0]
y_preds.to_csv(f"predictions/{ckpt_name}_preds.csv")


if __name__ == "__main__":
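
Because `Inferencer` composes its config with overrides built from the extra keyword arguments, a differently named checkpoint can be evaluated without editing the YAML. A small sketch; the checkpoint file name below is hypothetical and would have to exist in `checkpoints/`:

```
from hydra import initialize

from mlopscourse.infer import Inferencer

# "inference.checkpoint_name" is a field of cb_config.yaml; "cb_model_v2.p"
# is a made-up file name used only for illustration.
with initialize(config_path="mlopscourse/configs/", version_base="1.3"):
    Inferencer("cb_config", **{"inference.checkpoint_name": "cb_model_v2.p"}).infer()
```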
14 changes: 11 additions & 3 deletions mlopscourse/models/base.py
@@ -1,15 +1,20 @@
import pickle
from abc import ABCMeta, abstractmethod
from typing import Optional

import pandas as pd
from omegaconf import DictConfig


class BaseModel:
class BaseModel(metaclass=ABCMeta):
"""Represents an interface that any model used must implement."""

def __init__(self) -> None:
def __init__(self, cfg: DictConfig) -> None:
self.cfg = cfg
self.preprocessor = None
self.model = None

@abstractmethod
def train(
self,
X_train: pd.DataFrame,
@@ -19,11 +24,14 @@ ) -> None:
) -> None:
raise NotImplementedError()

@abstractmethod
def eval(self, X_test: pd.DataFrame, y_test: pd.Series) -> pd.Series:
raise NotImplementedError()

@abstractmethod
def __call__(self, X_sample: pd.DataFrame) -> pd.Series:
raise NotImplementedError()

def save_checkpoint(self, path: str) -> None:
raise NotImplementedError()
with open(path + self.cfg.training.checkpoint_name, "wb") as f:
pickle.dump(self, f)
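
With `BaseModel` now an abstract base class, a new model must override `train`, `eval`, and `__call__`, while `save_checkpoint` is inherited and pickles the whole object under `cfg.training.checkpoint_name`. A toy conforming subclass, not part of the commit (the exact `train` signature is elided in the diff, so extra arguments are absorbed generically):

```
from typing import Any, Optional

import pandas as pd
from omegaconf import DictConfig

from mlopscourse.models.base import BaseModel


class MeanRegressor(BaseModel):
    """A hypothetical baseline that always predicts the training-set mean."""

    def __init__(self, cfg: DictConfig) -> None:
        super().__init__(cfg)
        self.mean: Optional[float] = None

    def train(
        self, X_train: pd.DataFrame, y_train: pd.Series, *args: Any, **kwargs: Any
    ) -> None:
        self.mean = float(y_train.mean())

    def eval(self, X_test: pd.DataFrame, y_test: pd.Series) -> pd.Series:
        return self(X_test)

    def __call__(self, X_sample: pd.DataFrame) -> pd.Series:
        # Broadcast the stored mean over the index of the incoming samples.
        return pd.Series(self.mean, index=X_sample.index)
```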
28 changes: 8 additions & 20 deletions mlopscourse/models/catboost.py
@@ -1,34 +1,26 @@
import pickle
from typing import List, Optional

import numpy as np
import pandas as pd
from catboost import CatBoostRegressor, Pool
from omegaconf import DictConfig
from sklearn.metrics import r2_score

from .base import BaseModel


class CatboostModel(BaseModel):
"""A basic Random Forest model from sklearn."""
"""The Yandex's CatBoost."""

def __init__(
self, numerical_features: List[str], categorical_features: List[str]
self,
cfg: DictConfig,
numerical_features: List[str],
categorical_features: List[str],
) -> None:
super().__init__()
super().__init__(cfg)

self.hyperparams = {
"learning_rate": 0.3,
"depth": 6,
"l2_leaf_reg": 3,
"loss_function": "RMSE",
"eval_metric": "R2",
"random_seed": 0,
"task_type": "CPU",
"n_estimators": 1000,
"metric_period": 100,
}
self.model = CatBoostRegressor(**self.hyperparams)
self.model = CatBoostRegressor(**cfg.model.hyperparams)
self.numerical_features = numerical_features
self.categorical_features = categorical_features

@@ -76,7 +68,3 @@ def __call__(self, X_sample: pd.DataFrame) -> np.ndarray:
feature_names=list(X_sample.columns),
)
return self.model.predict(sample_data)

def save_checkpoint(self, path: str) -> None:
with open(path + "model_cb.p", "wb") as f:
pickle.dump(self, f)
14 changes: 8 additions & 6 deletions mlopscourse/models/models_zoo.py
@@ -1,18 +1,20 @@
from typing import List

from omegaconf import DictConfig

from .base import BaseModel
from .catboost import CatboostModel
from .random_forest import RandomForest


def prepare_model(
model_type: str,
cfg: DictConfig,
numerical_features: List[str],
categorical_features: List[str],
) -> BaseModel:
if model_type == "rf":
return RandomForest(numerical_features, categorical_features)
elif model_type == "cb":
return CatboostModel(numerical_features, categorical_features)
if cfg.model.name == "rf":
return RandomForest(cfg, numerical_features, categorical_features)
elif cfg.model.name == "cb":
return CatboostModel(cfg, numerical_features, categorical_features)
else:
raise AssertionError(f"Unknown model name: {model_type}")
raise AssertionError(f"Unknown model name: {cfg.model.name}")
15 changes: 7 additions & 8 deletions mlopscourse/models/random_forest.py
@@ -1,8 +1,8 @@
import pickle
from typing import List, Optional

import numpy as np
import pandas as pd
from omegaconf import DictConfig
from sklearn.compose import ColumnTransformer
from sklearn.ensemble import RandomForestRegressor
from sklearn.pipeline import make_pipeline
@@ -15,9 +15,12 @@ class RandomForest(BaseModel):
"""A basic Random Forest model from sklearn."""

def __init__(
self, numerical_features: List[str], categorical_features: List[str]
self,
cfg: DictConfig,
numerical_features: List[str],
categorical_features: List[str],
) -> None:
super().__init__()
super().__init__(cfg)

self.preprocessor = ColumnTransformer(
transformers=[
@@ -28,7 +31,7 @@ def __init__(
verbose_feature_names_out=False,
).set_output(transform="pandas")
self.model = make_pipeline(
self.preprocessor, RandomForestRegressor(random_state=0)
self.preprocessor, RandomForestRegressor(**cfg.model.hyperparams)
)

def train(
@@ -49,7 +52,3 @@ def eval(self, X_test: pd.DataFrame, y_test: pd.Series) -> pd.Series:

def __call__(self, X_sample: pd.DataFrame) -> np.ndarray:
return self.model.predict(X_sample)

def save_checkpoint(self, path: str) -> None:
with open(path + "model_rf.p", "wb") as f:
pickle.dump(self, f)