train.py and infer.py were added
TopCoder2K committed Oct 2, 2023
1 parent f668682 commit df85969
Showing 9 changed files with 146 additions and 18 deletions.
3 changes: 3 additions & 0 deletions .gitignore
@@ -1,4 +1,7 @@
.vscode
**__pycache__

checkpoints
predictions

catboost_info
28 changes: 26 additions & 2 deletions README.md
@@ -2,16 +2,40 @@

## Setup

To setup the all necessary dependencies, run the following:
To setup only the necessary dependencies, run the following:

```
poetry install --without dev
```

If you want to use `pre-commit`, install all the dependencies:

```
poetry install
```

## Run experiments

To train and evaluate a chosen model, run:
To train and evaluate the chosen model, run:

```
poetry run python3 main.py --model [chosen_model]
```

Note that the `--model` argument is optional. By default, the scripts use Random Forest.
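
For instance, running the script without any arguments trains and evaluates a Random Forest:

```
poetry run python3 main.py
```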

If you only want to train the chosen model and save it afterwards, run:

```
poetry run python3 mlopscourse/train.py --model [chosen_model]
```

If you only want to infer a previously trained model, make sure you've placed the
checkpoint in `checkpoints/` and then run:

```
poetry run python3 mlopscourse/infer.py --model [chosen_model] --ckpt [checkpoint_name]
```

The `--ckpt` argument is also optional. The script uses `model_[model_name_from_args].p`
as the default filename. _But if you set `--model`, do not forget to set `--ckpt` also!_
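
For example, a CatBoost model saved under its default checkpoint name would be evaluated with:

```
poetry run python3 mlopscourse/infer.py --model cb --ckpt model_cb.p
```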
2 changes: 1 addition & 1 deletion main.py
@@ -11,7 +11,7 @@ def parse_args() -> argparse.Namespace:
parser.add_argument(
"--model",
type=str,
required=True,
default="rf",
choices=["rf", "cb"],
help="Type of model used for training",
)
9 changes: 5 additions & 4 deletions mlopscourse/data/prepare_dataset.py
@@ -4,9 +4,9 @@
from sklearn.datasets import fetch_openml


def prepare_dataset() -> (
Tuple[pd.DataFrame, pd.Series, pd.DataFrame, pd.Series, List[str], List[str]]
):
def prepare_dataset(
print_info: bool = True,
) -> Tuple[pd.DataFrame, pd.Series, pd.DataFrame, pd.Series, List[str], List[str]]:
bikes = fetch_openml("Bike_Sharing_Demand", version=2, as_frame=True, parser="pandas")
# Make an explicit copy to avoid "SettingWithCopyWarning" from pandas
X, y = bikes.data.copy(), bikes.target
@@ -20,7 +20,8 @@ def prepare_dataset() -> (
X = X.drop(columns=["year"])
X_train, y_train = X[mask_training], y[mask_training]
X_test, y_test = X[~mask_training], y[~mask_training]
X_train.info()
if print_info:
X_train.info()

numerical_features = [
"temp",
50 changes: 50 additions & 0 deletions mlopscourse/infer.py
@@ -0,0 +1,50 @@
import argparse
import os
import pickle

from mlopscourse.data.prepare_dataset import prepare_dataset


def parse_args() -> argparse.Namespace:
parser = argparse.ArgumentParser(description="Training and evaluation parameters")

parser.add_argument(
"--model",
type=str,
default="rf",
choices=["rf", "cb"],
help="Type of model used for training",
)
parser.add_argument(
"--ckpt",
type=str,
default="model_rf.p",
help="The filename inside 'checkpoint/' to load the model from",
)

args = parser.parse_args()
return args


def infer(args: argparse.Namespace):
(
_,
_,
X_test,
y_test,
_,
_,
) = prepare_dataset(print_info=False)

with open(f"checkpoints/{args.ckpt}", "rb") as f:
model = pickle.load(f)
print(f"Evaluating the {args.model} model...")
y_preds = model.eval(X_test, y_test)

os.makedirs("predictions", exist_ok=True)
y_preds.to_csv(f"predictions/model_{args.model}_preds.csv")


if __name__ == "__main__":
arguments = parse_args()
infer(arguments)
2 changes: 1 addition & 1 deletion mlopscourse/models/base.py
@@ -19,7 +19,7 @@ def train(
) -> None:
raise NotImplementedError()

def eval(self, X_test: pd.DataFrame, y_test: pd.Series) -> None:
def eval(self, X_test: pd.DataFrame, y_test: pd.Series) -> pd.Series:
raise NotImplementedError()

def __call__(self, X_sample: pd.DataFrame) -> pd.Series:
17 changes: 12 additions & 5 deletions mlopscourse/models/catboost.py
@@ -1,6 +1,7 @@
import pickle
from typing import List, Optional

import numpy as np
import pandas as pd
from catboost import CatBoostRegressor, Pool
from sklearn.metrics import r2_score
@@ -57,7 +58,7 @@ def train(
else:
self.model.fit(train_data)

def eval(self, X_test: pd.DataFrame, y_test: pd.Series) -> None:
def eval(self, X_test: pd.DataFrame, y_test: pd.Series) -> pd.Series:
test_data = Pool(
data=X_test,
label=y_test,
@@ -66,11 +67,17 @@ def eval(self, X_test: pd.DataFrame, y_test: pd.Series) -> None:
)
preds = self.model.predict(test_data)
print("CatBoost R2: {:.2f}".format(r2_score(y_test, preds)))
return pd.Series(preds, name="cb_preds")

# TODO
def __call__(self, X_sample: pd.DataFrame) -> pd.Series:
pass
def __call__(self, X_sample: pd.DataFrame) -> np.ndarray:
sample_data = Pool(
data=X_sample,
label=None,
cat_features=self.categorical_features,
feature_names=list(X_sample.columns),
)
return self.model.predict(sample_data)

def save_checkpoint(self, path: str) -> None:
with open(path + "model_cb.p", "wb") as f:
pickle.dump(self.model, f)
pickle.dump(self, f)
10 changes: 5 additions & 5 deletions mlopscourse/models/random_forest.py
@@ -43,13 +43,13 @@ def train(
assert y_test is not None, "For the evaluation, y_test must be provided!"
self.eval(X_test, y_test)

def eval(self, X_test: pd.DataFrame, y_test: pd.Series) -> None:
def eval(self, X_test: pd.DataFrame, y_test: pd.Series) -> pd.Series:
print(f"Test R2 score: {self.model.score(X_test, y_test):.2f}")
return pd.Series(self.model.predict(X_test), name="rf_preds")

# TODO
def __call__(self, X_sample: pd.DataFrame) -> pd.Series:
pass
def __call__(self, X_sample: pd.DataFrame) -> np.ndarray:
return self.model.predict(X_sample)

def save_checkpoint(self, path: str) -> None:
with open(path + "model_rf.p", "wb") as f:
pickle.dump(self.model, f)
pickle.dump(self, f)
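
With the checkpoints now pickling the whole wrapper object, the new `__call__` can be used directly on a loaded model. A minimal sketch, assuming a Random Forest checkpoint has already been produced by `train.py` under its default name:

```
import pickle

from mlopscourse.data.prepare_dataset import prepare_dataset

# save_checkpoint() pickles the wrapper itself, so load it back directly.
with open("checkpoints/model_rf.p", "rb") as f:
    model = pickle.load(f)

# __call__ now returns a numpy array of predictions for the given samples.
_, _, X_test, _, _, _ = prepare_dataset(print_info=False)
print(model(X_test.head(5)))
```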
43 changes: 43 additions & 0 deletions mlopscourse/train.py
@@ -0,0 +1,43 @@
import argparse
import os

from data.prepare_dataset import prepare_dataset
from models.models_zoo import prepare_model


def parse_args() -> argparse.Namespace:
parser = argparse.ArgumentParser(description="Training and evaluation parameters")

parser.add_argument(
"--model",
type=str,
default="rf",
choices=["rf", "cb"],
help="Type of model used for training",
)

args = parser.parse_args()
return args


def train(args: argparse.Namespace):
(
X_train,
y_train,
_,
_,
numerical_features,
categorical_features,
) = prepare_dataset()
model = prepare_model(args, numerical_features, categorical_features)

print(f"Training the {args.model} model...")
model.train(X_train, y_train)

os.makedirs("checkpoints", exist_ok=True)
model.save_checkpoint("checkpoints/")


if __name__ == "__main__":
arguments = parse_args()
train(arguments)
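
Taken together, an end-to-end run of the two new scripts with the default Random Forest model looks like this (output paths follow the defaults above):

```
poetry run python3 mlopscourse/train.py   # saves checkpoints/model_rf.p
poetry run python3 mlopscourse/infer.py   # writes predictions/model_rf_preds.csv
```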
