From df859690de9cc8d23a1e16cba133ddfc79361b18 Mon Sep 17 00:00:00 2001
From: Svyatoslav
Date: Mon, 2 Oct 2023 22:49:48 +0300
Subject: [PATCH] train.py and infer.py were added

---
 .gitignore                          |  3 ++
 README.md                           | 28 ++++++++++++++--
 main.py                             |  2 +-
 mlopscourse/data/prepare_dataset.py |  9 +++---
 mlopscourse/infer.py                | 50 +++++++++++++++++++++++++++++
 mlopscourse/models/base.py          |  2 +-
 mlopscourse/models/catboost.py      | 17 +++++++---
 mlopscourse/models/random_forest.py | 10 +++---
 mlopscourse/train.py                | 43 +++++++++++++++++++++++++
 9 files changed, 146 insertions(+), 18 deletions(-)
 create mode 100644 mlopscourse/infer.py
 create mode 100644 mlopscourse/train.py

diff --git a/.gitignore b/.gitignore
index c8c0adc..3c54aaf 100644
--- a/.gitignore
+++ b/.gitignore
@@ -1,4 +1,7 @@
 .vscode
 **__pycache__
+
 checkpoints
+predictions
+
 catboost_info
diff --git a/README.md b/README.md
index 794dbf4..59ffebd 100644
--- a/README.md
+++ b/README.md
@@ -2,16 +2,40 @@
 
 ## Setup
 
-To setup the all necessary dependencies, run the following:
+To set up only the necessary dependencies, run the following:
 
 ```
 poetry install --without dev
 ```
 
+If you want to use `pre-commit`, install all the dependencies:
+
+```
+poetry install
+```
+
 ## Run experiments
 
-To train and evaluate a chosen model, run:
+To train and evaluate the chosen model, run:
 
 ```
 poetry run python3 main.py --model [chosen_model]
 ```
+
+Note that the `--model` argument is optional. By default, the scripts use Random Forest.
+
+If you only want to train the chosen model and save it afterwards, run:
+
+```
+poetry run python3 mlopscourse/train.py --model [chosen_model]
+```
+
+If you only want to run inference with a previously trained model, make sure you've
+placed the checkpoint in `checkpoints/` and then run:
+
+```
+poetry run python3 mlopscourse/infer.py --model [chosen_model] --ckpt [checkpoint_name]
+```
+
+The `--ckpt` argument is also optional. The script uses `model_[model_name_from_args].p`
+as the default filename. _But if you set `--model`, do not forget to set `--ckpt` as well!_
diff --git a/main.py b/main.py
index 5f5e5dc..1c9e308 100644
--- a/main.py
+++ b/main.py
@@ -11,7 +11,7 @@ def parse_args() -> argparse.Namespace:
     parser.add_argument(
         "--model",
         type=str,
-        required=True,
+        default="rf",
         choices=["rf", "cb"],
         help="Type of model used for training",
     )
diff --git a/mlopscourse/data/prepare_dataset.py b/mlopscourse/data/prepare_dataset.py
index 4601109..d50ffe5 100644
--- a/mlopscourse/data/prepare_dataset.py
+++ b/mlopscourse/data/prepare_dataset.py
@@ -4,9 +4,9 @@
 from sklearn.datasets import fetch_openml
 
 
-def prepare_dataset() -> (
-    Tuple[pd.DataFrame, pd.Series, pd.DataFrame, pd.Series, List[str], List[str]]
-):
+def prepare_dataset(
+    print_info: bool = True,
+) -> Tuple[pd.DataFrame, pd.Series, pd.DataFrame, pd.Series, List[str], List[str]]:
     bikes = fetch_openml("Bike_Sharing_Demand", version=2, as_frame=True, parser="pandas")
     # Make an explicit copy to avoid "SettingWithCopyWarning" from pandas
     X, y = bikes.data.copy(), bikes.target
@@ -20,7 +20,8 @@ def prepare_dataset() -> (
     X = X.drop(columns=["year"])
     X_train, y_train = X[mask_training], y[mask_training]
     X_test, y_test = X[~mask_training], y[~mask_training]
-    X_train.info()
+    if print_info:
+        X_train.info()
 
     numerical_features = [
         "temp",
diff --git a/mlopscourse/infer.py b/mlopscourse/infer.py
new file mode 100644
index 0000000..f3c9978
--- /dev/null
+++ b/mlopscourse/infer.py
@@ -0,0 +1,50 @@
+import argparse
+import os
+import pickle
+
+from mlopscourse.data.prepare_dataset import prepare_dataset
+
+
+def parse_args() -> argparse.Namespace:
+    parser = argparse.ArgumentParser(description="Training and evaluation parameters")
+
+    parser.add_argument(
+        "--model",
+        type=str,
+        default="rf",
+        choices=["rf", "cb"],
+        help="Type of model used for training",
+    )
+    parser.add_argument(
+        "--ckpt",
+        type=str,
+        default="model_rf.p",
+        help="The filename inside 'checkpoints/' to load the model from",
+    )
+
+    args = parser.parse_args()
+    return args
+
+
+def infer(args: argparse.Namespace):
+    (
+        _,
+        _,
+        X_test,
+        y_test,
+        _,
+        _,
+    ) = prepare_dataset(print_info=False)
+
+    with open(f"checkpoints/{args.ckpt}", "rb") as f:
+        model = pickle.load(f)
+    print(f"Evaluating the {args.model} model...")
+    y_preds = model.eval(X_test, y_test)
+
+    os.makedirs("predictions", exist_ok=True)
+    y_preds.to_csv(f"predictions/model_{args.model}_preds.csv")
+
+
+if __name__ == "__main__":
+    arguments = parse_args()
+    infer(arguments)
diff --git a/mlopscourse/models/base.py b/mlopscourse/models/base.py
index c165675..134780e 100644
--- a/mlopscourse/models/base.py
+++ b/mlopscourse/models/base.py
@@ -19,7 +19,7 @@ def train(
     ) -> None:
         raise NotImplementedError()
 
-    def eval(self, X_test: pd.DataFrame, y_test: pd.Series) -> None:
+    def eval(self, X_test: pd.DataFrame, y_test: pd.Series) -> pd.Series:
         raise NotImplementedError()
 
     def __call__(self, X_sample: pd.DataFrame) -> pd.Series:
diff --git a/mlopscourse/models/catboost.py b/mlopscourse/models/catboost.py
index 2743f7a..9ef249e 100644
--- a/mlopscourse/models/catboost.py
+++ b/mlopscourse/models/catboost.py
@@ -1,6 +1,7 @@
 import pickle
 from typing import List, Optional
 
+import numpy as np
 import pandas as pd
 from catboost import CatBoostRegressor, Pool
 from sklearn.metrics import r2_score
@@ -57,7 +58,7 @@ def train(
         else:
             self.model.fit(train_data)
 
-    def eval(self, X_test: pd.DataFrame, y_test: pd.Series) -> None:
+    def eval(self, X_test: pd.DataFrame, y_test: pd.Series) -> pd.Series:
         test_data = Pool(
             data=X_test,
             label=y_test,
@@ -66,11 +67,17 @@ def eval(self, X_test: pd.DataFrame, y_test: pd.Series) -> None:
         )
         preds = self.model.predict(test_data)
         print("CatBoost R2: {:.2f}".format(r2_score(y_test, preds)))
+        return pd.Series(preds, name="cb_preds")
 
-    # TODO
-    def __call__(self, X_sample: pd.DataFrame) -> pd.Series:
-        pass
+    def __call__(self, X_sample: pd.DataFrame) -> np.ndarray:
+        sample_data = Pool(
+            data=X_sample,
+            label=None,
+            cat_features=self.categorical_features,
+            feature_names=list(X_sample.columns),
+        )
+        return self.model.predict(sample_data)
 
     def save_checkpoint(self, path: str) -> None:
         with open(path + "model_cb.p", "wb") as f:
-            pickle.dump(self.model, f)
+            pickle.dump(self, f)
diff --git a/mlopscourse/models/random_forest.py b/mlopscourse/models/random_forest.py
index d13b30f..cb533c9 100644
--- a/mlopscourse/models/random_forest.py
+++ b/mlopscourse/models/random_forest.py
@@ -43,13 +43,13 @@ def train(
             assert y_test is not None, "For the evaluation, y_test must be provided!"
             self.eval(X_test, y_test)
 
-    def eval(self, X_test: pd.DataFrame, y_test: pd.Series) -> None:
+    def eval(self, X_test: pd.DataFrame, y_test: pd.Series) -> pd.Series:
         print(f"Test R2 score: {self.model.score(X_test, y_test):.2f}")
+        return pd.Series(self.model.predict(X_test), name="rf_preds")
 
-    # TODO
-    def __call__(self, X_sample: pd.DataFrame) -> pd.Series:
-        pass
+    def __call__(self, X_sample: pd.DataFrame) -> np.ndarray:
+        return self.model.predict(X_sample)
 
     def save_checkpoint(self, path: str) -> None:
         with open(path + "model_rf.p", "wb") as f:
-            pickle.dump(self.model, f)
+            pickle.dump(self, f)
diff --git a/mlopscourse/train.py b/mlopscourse/train.py
new file mode 100644
index 0000000..fadd5d2
--- /dev/null
+++ b/mlopscourse/train.py
@@ -0,0 +1,43 @@
+import argparse
+import os
+
+from data.prepare_dataset import prepare_dataset
+from models.models_zoo import prepare_model
+
+
+def parse_args() -> argparse.Namespace:
+    parser = argparse.ArgumentParser(description="Training and evaluation parameters")
+
+    parser.add_argument(
+        "--model",
+        type=str,
+        default="rf",
+        choices=["rf", "cb"],
+        help="Type of model used for training",
+    )
+
+    args = parser.parse_args()
+    return args
+
+
+def train(args: argparse.Namespace):
+    (
+        X_train,
+        y_train,
+        _,
+        _,
+        numerical_features,
+        categorical_features,
+    ) = prepare_dataset()
+    model = prepare_model(args, numerical_features, categorical_features)
+
+    print(f"Training the {args.model} model...")
+    model.train(X_train, y_train)
+
+    os.makedirs("checkpoints", exist_ok=True)
+    model.save_checkpoint("checkpoints/")
+
+
+if __name__ == "__main__":
+    arguments = parse_args()
+    train(arguments)
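
For reviewers trying the change locally, a minimal sketch of reusing a saved checkpoint outside `infer.py`. It assumes the patch is applied, `mlopscourse/train.py` has already written the default Random Forest checkpoint to `checkpoints/model_rf.p`, and the `mlopscourse` package is importable; since `save_checkpoint` now pickles the whole model wrapper rather than just the underlying estimator, the loaded object exposes both `eval` and `__call__`:

```
import pickle

from mlopscourse.data.prepare_dataset import prepare_dataset

# Rebuild the test split quietly, the same way infer.py does.
_, _, X_test, y_test, _, _ = prepare_dataset(print_info=False)

# Load the whole model wrapper pickled by save_checkpoint().
with open("checkpoints/model_rf.p", "rb") as f:
    model = pickle.load(f)

# eval() prints the R2 score and returns the predictions as a pd.Series,
# while __call__ returns raw predictions for an arbitrary sample.
test_preds = model.eval(X_test, y_test)
single_pred = model(X_test.head(1))
print(test_preds.head(), single_pred)
```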