diff --git a/README.md b/README.md index 825d84a..001afc7 100644 --- a/README.md +++ b/README.md @@ -80,7 +80,8 @@ poetry run python3 commands.py train --config_name cb_config ``` _N.B. Do not forget to set `logging.mlflow.tracking_uri` before the launch. The logs are -saved in the default directory: `mlruns`._ +saved in the default directory: `mlruns`. If you are using the standard MLFlow server, +then run it before the training with `poetry run mlflow ui`._ ### Evaluation @@ -90,3 +91,32 @@ If you want to infer a previously trained model, make sure you've placed the che ``` poetry run python3 commands.py infer --config_name [config_name_without_extension] ``` + +### Deployment + +In order to deploy a trained model, run: + +``` +poetry run mlflow models serve -p 5001 -m checkpoints/mlflow_[model_type]_ckpt/ --env-manager=local +``` + +where `[model_type]` is `cb` or `rf`. + +After this, it is possible to send requests to the model. I've created a script for the +correct json generation with the first example from the training set: + +``` +poetry run python3 create_example_request.py create_example_request +``` + +Send a request to the deployed model using the generated json: + +``` +curl http://127.0.0.1:5001/invocations -H 'Content-Type: application/json' -d @example_request.json +``` + +The model should reply with something like this: + +``` +{"predictions": [20.8]} +``` diff --git a/create_example_request.py b/create_example_request.py new file mode 100644 index 0000000..00b2078 --- /dev/null +++ b/create_example_request.py @@ -0,0 +1,30 @@ +import json + +import fire +import pandas as pd + + +def create_example_request() -> None: + example_df = pd.DataFrame( + [ + { + "season": "spring", + "month": 1, + "hour": 0, + "holiday": False, + "weekday": 6, + "workingday": False, + "weather": "clear", + "temp": 9.84, + "feel_temp": 14.395, + "humidity": 0.81, + "windspeed": 0.0, + } + ] + ) # This is the first row of the training split + with open("example_request.json", "w") as f: + json.dump({"dataframe_split": example_df.to_dict(orient="split")}, f) + + +if __name__ == "__main__": + fire.Fire() diff --git a/example_request.json b/example_request.json new file mode 100644 index 0000000..16d4743 --- /dev/null +++ b/example_request.json @@ -0,0 +1,19 @@ +{ + "dataframe_split": { + "index": [0], + "columns": [ + "season", + "month", + "hour", + "holiday", + "weekday", + "workingday", + "weather", + "temp", + "feel_temp", + "humidity", + "windspeed" + ], + "data": [["spring", 1, 0, false, 6, false, "clear", 9.84, 14.395, 0.81, 0.0]] + } +} diff --git a/mlopscourse/models/catboost.py b/mlopscourse/models/catboost.py index 1a07d59..b86f274 100644 --- a/mlopscourse/models/catboost.py +++ b/mlopscourse/models/catboost.py @@ -71,24 +71,21 @@ def __call__(self, X_sample: pd.DataFrame) -> np.ndarray: return self.model.predict(sample_data) def log_fis_and_metrics(self, exp_id: str, col_names: List[str]) -> None: - with mlflow.start_run( - experiment_id=exp_id, run_name=f"training-{self.cfg.model.name}" - ): - # Log the model's hyperparameters and the code version - mlflow.log_params(self.cfg.model.hyperparams) - mlflow.log_param("commit_id", self.cfg.logging.commit_id) - # Log feature importances + # Log the model's hyperparameters and the code version + mlflow.log_params(self.cfg.model.hyperparams) + mlflow.log_param("commit_id", self.cfg.logging.commit_id) + # Log feature importances + mlflow.log_metrics( + { + f"fi_of_{col_name}": self.model.feature_importances_[i] + for i, col_name in enumerate(col_names) + } + ) + r2_scores = self.model.evals_result_["learn"]["R2"] + rmse_scores = self.model.evals_result_["learn"]["RMSE"] + assert len(r2_scores) == len(rmse_scores), "Something wrong with metrics!" + for i in range(len(r2_scores)): mlflow.log_metrics( - { - f"fi_of_{col_name}": self.model.feature_importances_[i] - for i, col_name in enumerate(col_names) - } + {"R2_metric": r2_scores[i], "RMSE_loss": rmse_scores[i]}, + step=i * self.cfg.model.hyperparams.metric_period, ) - r2_scores = self.model.evals_result_["learn"]["R2"] - rmse_scores = self.model.evals_result_["learn"]["RMSE"] - assert len(r2_scores) == len(rmse_scores), "Something wrong with metrics!" - for i in range(len(r2_scores)): - mlflow.log_metrics( - {"R2_metric": r2_scores[i], "RMSE_loss": rmse_scores[i]}, - step=i * self.cfg.model.hyperparams.metric_period, - ) diff --git a/mlopscourse/models/random_forest.py b/mlopscourse/models/random_forest.py index 4f25001..dcce9aa 100644 --- a/mlopscourse/models/random_forest.py +++ b/mlopscourse/models/random_forest.py @@ -59,33 +59,30 @@ def __call__(self, X_sample: pd.DataFrame) -> np.ndarray: def log_fis_and_metrics( self, exp_id: str, X_train: pd.DataFrame, y_train: pd.Series ) -> None: - with mlflow.start_run( - experiment_id=exp_id, run_name=f"training-{self.cfg.model.name}" - ): - # Log the model's hyperparameters and the code version - mlflow.log_params(self.cfg.model.hyperparams) - mlflow.log_param("commit_id", self.cfg.logging.commit_id) - # Log feature importances + # Log the model's hyperparameters and the code version + mlflow.log_params(self.cfg.model.hyperparams) + mlflow.log_param("commit_id", self.cfg.logging.commit_id) + # Log feature importances + mlflow.log_metrics( + { + f"fi_of_{col_name}": self.model.named_steps[ + "randomforestregressor" + ].feature_importances_[i] + for i, col_name in enumerate(X_train.columns) + } + ) + # Log R2 and RMSE metrics + for i in tqdm(range(0, self.cfg.model.hyperparams.n_estimators)): + model_i = make_pipeline( + self.preprocessor, RandomForestRegressor(**self.cfg.model.hyperparams) + ) + model_i.named_steps["randomforestregressor"].n_estimators = i + 1 + model_i.fit(X_train, y_train) + y_pred = model_i.predict(X_train) mlflow.log_metrics( { - f"fi_of_{col_name}": self.model.named_steps[ - "randomforestregressor" - ].feature_importances_[i] - for i, col_name in enumerate(X_train.columns) - } + "R2_metric": r2_score(y_train, y_pred), + "RMSE_metric": mean_squared_error(y_train, y_pred, squared=False), + }, + step=i, ) - # Log R2 and RMSE metrics - for i in tqdm(range(0, self.cfg.model.hyperparams.n_estimators)): - model_i = make_pipeline( - self.preprocessor, RandomForestRegressor(**self.cfg.model.hyperparams) - ) - model_i.named_steps["randomforestregressor"].n_estimators = i + 1 - model_i.fit(X_train, y_train) - y_pred = model_i.predict(X_train) - mlflow.log_metrics( - { - "R2_metric": r2_score(y_train, y_pred), - "RMSE_metric": mean_squared_error(y_train, y_pred, squared=False), - }, - step=i, - ) diff --git a/mlopscourse/train.py b/mlopscourse/train.py index 9c7294c..df20409 100644 --- a/mlopscourse/train.py +++ b/mlopscourse/train.py @@ -50,12 +50,26 @@ def train(self) -> None: mlflow_cfg = self.cfg.logging.mlflow mlflow.set_tracking_uri(mlflow_cfg.tracking_uri) exp_id = mlflow.set_experiment(mlflow_cfg.exp_name).experiment_id - # Unfortunately, logging is model dependent, at least because - # RandomForestRegressor doesn't provide the target metric progress. - if self.cfg.model.name == "cb": - model.log_fis_and_metrics(exp_id, X_train.columns) - else: - model.log_fis_and_metrics(exp_id, X_train, y_train) + with mlflow.start_run( + experiment_id=exp_id, run_name=f"training-{self.cfg.model.name}" + ): + signature = mlflow.models.infer_signature(X_train, y_train) + # Unfortunately, logging is model dependent, at least because + # RandomForestRegressor doesn't provide the target metric progress. + if self.cfg.model.name == "cb": + mlflow.catboost.save_model( + model.model, + f"checkpoints/mlflow_{self.cfg.model.name}_ckpt/", + signature=signature, + ) + model.log_fis_and_metrics(exp_id, X_train.columns) + else: + mlflow.sklearn.save_model( + model.model, + f"checkpoints/mlflow_{self.cfg.model.name}_ckpt/", + signature=signature, + ) + model.log_fis_and_metrics(exp_id, X_train, y_train) if __name__ == "__main__":