Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

389 refactor ML model predicting and testing #390

Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
180 changes: 117 additions & 63 deletions eis_toolkit/cli.py
Original file line number Diff line number Diff line change
Expand Up @@ -2313,97 +2313,162 @@ def gradient_boosting_regressor_train_cli(
typer.echo("Gradient boosting regressor training completed")


# EVALUATE ML MODEL
# TEST CLASSIFIER ML MODEL
@app.command()
def evaluate_trained_model_cli(
def classifier_test_cli(
input_rasters: INPUT_FILES_ARGUMENT,
target_labels: INPUT_FILE_OPTION,
model_file: INPUT_FILE_OPTION,
output_raster: OUTPUT_FILE_OPTION,
validation_metrics: Annotated[List[str], typer.Option()],
output_raster_probability: OUTPUT_FILE_OPTION,
output_raster_classified: OUTPUT_FILE_OPTION,
classification_threshold: float = 0.5,
test_metrics: Annotated[List[ClassifierMetrics], typer.Option(case_sensitive=False)] = [ClassifierMetrics.accuracy],
):
"""Predict and evaluate a trained machine learning model by predicting and scoring."""
from sklearn.base import is_classifier

"""Test trained machine learning classifier model by predicting and scoring."""
from eis_toolkit.evaluation.scoring import score_predictions
from eis_toolkit.prediction.machine_learning_general import load_model, prepare_data_for_ml, reshape_predictions
from eis_toolkit.prediction.machine_learning_predict import predict_classifier, predict_regressor
from eis_toolkit.prediction.machine_learning_predict import predict_classifier

X, y, reference_profile, nodata_mask = prepare_data_for_ml(input_rasters, target_labels)
typer.echo("Progress: 30%")

model = load_model(model_file)
if is_classifier(model):
predictions, probabilities = predict_classifier(X, model, True)
probabilities = probabilities[:, 1]
probabilities = probabilities.astype(np.float32)
probabilities_reshaped = reshape_predictions(
probabilities, reference_profile["height"], reference_profile["width"], nodata_mask
predictions, probabilities = predict_classifier(X, model, classification_threshold, True)
probabilities_reshaped = reshape_predictions(
probabilities, reference_profile["height"], reference_profile["width"], nodata_mask
)
predictions_reshaped = reshape_predictions(
predictions, reference_profile["height"], reference_profile["width"], nodata_mask
)

metrics_dict = score_predictions(y, predictions, get_enum_values(test_metrics))
typer.echo("Progress: 80%")

out_profile = reference_profile.copy()
out_profile.update({"count": 1, "dtype": np.float32})

with rasterio.open(output_raster_probability, "w", **out_profile) as dst:
dst.write(probabilities_reshaped, 1)
with rasterio.open(output_raster_classified, "w", **out_profile) as dst:
dst.write(predictions_reshaped, 1)

typer.echo("\n")
for key, value in metrics_dict.items():
typer.echo(f"{key}: {value}")
typer.echo("\n")

typer.echo("Progress: 100%")
typer.echo(
(
"Testing classifier model completed, writing rasters to "
f"{output_raster_probability} and {output_raster_classified}."
)
else:
predictions = predict_regressor(X, model)
)

metrics_dict = score_predictions(y, predictions, validation_metrics)

# TEST REGRESSOR ML MODEL
@app.command()
def regressor_test_cli(
input_rasters: INPUT_FILES_ARGUMENT,
target_labels: INPUT_FILE_OPTION,
model_file: INPUT_FILE_OPTION,
output_raster: OUTPUT_FILE_OPTION,
test_metrics: Annotated[List[RegressorMetrics], typer.Option(case_sensitive=False)] = [RegressorMetrics.mse],
):
"""Test trained machine learning regressor model by predicting and scoring."""
from eis_toolkit.evaluation.scoring import score_predictions
from eis_toolkit.prediction.machine_learning_general import load_model, prepare_data_for_ml, reshape_predictions
from eis_toolkit.prediction.machine_learning_predict import predict_regressor

X, y, reference_profile, nodata_mask = prepare_data_for_ml(input_rasters, target_labels)
typer.echo("Progress: 30%")

model = load_model(model_file)
predictions = predict_regressor(X, model)
predictions_reshaped = reshape_predictions(
predictions, reference_profile["height"], reference_profile["width"], nodata_mask
)

metrics_dict = score_predictions(y, predictions, get_enum_values(test_metrics))
typer.echo("Progress: 80%")

json_str = json.dumps(metrics_dict)

out_profile = reference_profile.copy()
out_profile.update({"count": 1, "dtype": np.float32})

if is_classifier(model):
directory = os.path.split(output_raster)[0]
name = os.path.splitext(os.path.basename(output_raster))[0]
labels_output = os.path.join(directory, name + "_labels" + ".tif")
probabilities_output = os.path.join(directory, name + "_probabilities" + ".tif")
for output_path, output_data in zip(
[labels_output, probabilities_output], [predictions_reshaped, probabilities_reshaped]
):
with rasterio.open(output_path, "w", **out_profile) as dst:
dst.write(output_data, 1)
else:
with rasterio.open(output_raster, "w", **out_profile) as dst:
dst.write(predictions_reshaped, 1)
with rasterio.open(output_raster, "w", **out_profile) as dst:
dst.write(predictions_reshaped, 1)

typer.echo("Progress: 100%")
typer.echo(f"Results: {json_str}")
typer.echo("\n")
for key, value in metrics_dict.items():
typer.echo(f"{key}: {value}")
typer.echo("\n")

typer.echo("Evaluating trained model completed")
typer.echo("Progress: 100%\n")

typer.echo(f"Testing regressor model completed, writing raster to {output_raster}.")


# PREDICT WITH TRAINED CLASSIFIER ML MODEL
@app.command()
def predict_with_trained_model_cli(
def classifier_predict_cli(
input_rasters: INPUT_FILES_ARGUMENT,
model_file: INPUT_FILE_OPTION,
output_raster: OUTPUT_FILE_OPTION,
output_raster_probability: OUTPUT_FILE_OPTION,
output_raster_classified: OUTPUT_FILE_OPTION,
classification_threshold: float = 0.5,
):
"""Predict with a trained machine learning model."""
from sklearn.base import is_classifier

"""Predict with a trained machine learning classifier model."""
from eis_toolkit.prediction.machine_learning_general import load_model, prepare_data_for_ml, reshape_predictions
from eis_toolkit.prediction.machine_learning_predict import predict_classifier, predict_regressor
from eis_toolkit.prediction.machine_learning_predict import predict_classifier

X, _, reference_profile, nodata_mask = prepare_data_for_ml(input_rasters)

typer.echo("Progress: 30%")

model = load_model(model_file)
if is_classifier(model):
predictions, probabilities = predict_classifier(X, model, True)
probabilities = probabilities[:, 1]
probabilities = probabilities.astype(np.float32)
probabilities_reshaped = reshape_predictions(
probabilities, reference_profile["height"], reference_profile["width"], nodata_mask
predictions, probabilities = predict_classifier(X, model, classification_threshold, True)
probabilities_reshaped = reshape_predictions(
probabilities, reference_profile["height"], reference_profile["width"], nodata_mask
)
predictions_reshaped = reshape_predictions(
predictions, reference_profile["height"], reference_profile["width"], nodata_mask
)
typer.echo("Progress: 80%")

out_profile = reference_profile.copy()
out_profile.update({"count": 1, "dtype": np.float32})

with rasterio.open(output_raster_probability, "w", **out_profile) as dst:
dst.write(probabilities_reshaped, 1)
with rasterio.open(output_raster_classified, "w", **out_profile) as dst:
dst.write(predictions_reshaped, 1)

typer.echo("Progress: 100%")
typer.echo(
(
"Predicting with classifier model completed, writing rasters to "
f"{output_raster_probability} and {output_raster_classified}."
)
else:
predictions = predict_regressor(X, model)
)


# PREDICT WITH TRAINED REGRESSOR ML MODEL
@app.command()
def regressor_predict_cli(
input_rasters: INPUT_FILES_ARGUMENT,
model_file: INPUT_FILE_OPTION,
output_raster: OUTPUT_FILE_OPTION,
):
"""Predict with a trained machine learning regressor model."""
from eis_toolkit.prediction.machine_learning_general import load_model, prepare_data_for_ml, reshape_predictions
from eis_toolkit.prediction.machine_learning_predict import predict_regressor

X, _, reference_profile, nodata_mask = prepare_data_for_ml(input_rasters)

typer.echo("Progress: 30%")

model = load_model(model_file)
predictions = predict_regressor(X, model)
predictions_reshaped = reshape_predictions(
predictions, reference_profile["height"], reference_profile["width"], nodata_mask
)
Expand All @@ -2413,22 +2478,11 @@ def predict_with_trained_model_cli(
out_profile = reference_profile.copy()
out_profile.update({"count": 1, "dtype": np.float32})

if is_classifier(model):
directory = os.path.split(output_raster)[0]
name = os.path.splitext(os.path.basename(output_raster))[0]
labels_output = os.path.join(directory, name + "_labels" + ".tif")
probabilities_output = os.path.join(directory, name + "_probabilities" + ".tif")
for output_path, output_data in zip(
[labels_output, probabilities_output], [predictions_reshaped, probabilities_reshaped]
):
with rasterio.open(output_path, "w", **out_profile) as dst:
dst.write(output_data, 1)
else:
with rasterio.open(output_raster, "w", **out_profile) as dst:
dst.write(predictions_reshaped, 1)
with rasterio.open(output_raster, "w", **out_profile) as dst:
dst.write(predictions_reshaped, 1)

typer.echo("Progress: 100%")
typer.echo("Predicting completed")
typer.echo(f"Predicting with regressor model completed, writing raster to {output_raster}.")


# FUZZY OVERLAYS
Expand Down
4 changes: 4 additions & 0 deletions eis_toolkit/exceptions.py
Original file line number Diff line number Diff line change
Expand Up @@ -34,6 +34,10 @@ class InvalidDataShapeException(Exception):
"""Exception error for datasets with invalid shapes."""


class InvalidModelTypeException(Exception):
"""Exception error for invalid model type."""


class InvalidParameterValueException(Exception):
"""Exception error class for invalid parameter values."""

Expand Down
45 changes: 36 additions & 9 deletions eis_toolkit/prediction/machine_learning_predict.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,40 +2,62 @@
import pandas as pd
from beartype import beartype
from beartype.typing import Tuple, Union
from sklearn.base import BaseEstimator
from sklearn.base import BaseEstimator, is_classifier
from tensorflow import keras

from eis_toolkit.exceptions import InvalidModelTypeException


@beartype
def predict_classifier(
data: Union[np.ndarray, pd.DataFrame], model: Union[BaseEstimator, keras.Model], include_probabilities: bool = True
data: Union[np.ndarray, pd.DataFrame],
model: Union[BaseEstimator, keras.Model],
classification_threshold: float = 0.5,
include_probabilities: bool = True,
) -> Union[np.ndarray, Tuple[np.ndarray, np.ndarray]]:
"""
Predict with a trained model.
Predict with a trained classifier model.

Args:
data: Data used to make predictions.
model: Trained classifier. Can be any classifier model trained with
EIS Toolkit (Sklearn and Keras models). Sklearn models that are not classifiers are rejected.
classification_threshold: Threshold for classifying based on probabilities. Only used for
binary classification. Defaults to 0.5.
include_probabilities: If the probability array should be returned too. Defaults to True.

Returns:
Predicted labels and optionally predicted probabilities by a classifier model.
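Predicted labels as a one-dimensional array and optionally predicted probabilities by a classifier model. For binary classification the probabilities are a one-dimensional array (positive class); for multiclass classification they are a two-dimensional array with one column per class.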

Raises:
InvalidModelTypeException: Input model is not a classifier model.
"""
if isinstance(model, keras.Model):
probabilities = model.predict(data)
labels = probabilities.argmax(axis=-1)
probabilities = model.predict(data).astype(np.float32)
if probabilities.shape[1] == 1: # Binary classification
probabilities = probabilities.squeeze()
labels = (probabilities >= classification_threshold).astype(np.float32)
else: # Multiclass classification
labels = probabilities.argmax(axis=-1).astype(np.float32)
if include_probabilities:
return labels, probabilities
else:
return labels
elif isinstance(model, BaseEstimator):
labels = model.predict(data)
if not is_classifier(model):
raise InvalidModelTypeException(f"Expected a classifier model: {type(model)}.")
probabilities = model.predict_proba(data).astype(np.float32)
if probabilities.shape[1] == 2: # Binary classification
probabilities = probabilities[:, 1]
labels = (probabilities >= classification_threshold).astype(np.float32)
else: # Multiclass classification
labels = probabilities.argmax(axis=-1).astype(np.float32)
if include_probabilities:
probabilities = model.predict_proba(data)
return labels, probabilities
else:
return labels
else:
raise InvalidModelTypeException(f"Model type not recognized: {type(model)}.")


@beartype
Expand All @@ -44,7 +66,7 @@ def predict_regressor(
model: Union[BaseEstimator, keras.Model],
) -> np.ndarray:
"""
Predict with a trained model.
Predict with a trained regressor model.

Args:
data: Data used to make predictions.
Expand All @@ -53,6 +75,11 @@ def predict_regressor(

Returns:
Regression model prediction array.

Raises:
InvalidModelTypeException: Input model is not a regressor model.
"""
if is_classifier(model):
raise InvalidModelTypeException(f"Expected a regressor model: {type(model)}.")
result = model.predict(data)
return result
4 changes: 2 additions & 2 deletions tests/prediction/machine_learning_general_test.py
Original file line number Diff line number Diff line change
Expand Up @@ -118,7 +118,7 @@ def test_evaluate_model_sklearn():
X_train, y_train, model=RF_MODEL, validation_method="none", metrics=CLF_METRICS, random_state=42
)

predictions = predict_classifier(X_test, model, include_probabilities=False)
predictions = predict_classifier(X_test, model, classification_threshold=0.5, include_probabilities=False)
accuracy = score_predictions(y_test, predictions, "accuracy")
np.testing.assert_equal(accuracy, 1.0)

Expand All @@ -131,7 +131,7 @@ def test_predict_classifier_sklearn():
X_train, y_train, model=RF_MODEL, validation_method="none", metrics=CLF_METRICS, random_state=42
)

predicted_labels, predicted_probabilities = predict_classifier(X_test, model, True)
predicted_labels, predicted_probabilities = predict_classifier(X_test, model, include_probabilities=True)
np.testing.assert_equal(len(predicted_labels), len(y_test))
np.testing.assert_equal(len(predicted_probabilities), len(y_test))

Expand Down
Loading