Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Add standardize and normalize #339

Open
wants to merge 7 commits into
base: master
Choose a base branch
from
3 changes: 3 additions & 0 deletions docs/transformations/normalize.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,3 @@
# Normalize

::: eis_toolkit.transformations.normalize
3 changes: 3 additions & 0 deletions docs/transformations/standardize.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,3 @@
# Standardize

::: eis_toolkit.transformations.standardize
105 changes: 105 additions & 0 deletions eis_toolkit/cli.py
Original file line number Diff line number Diff line change
Expand Up @@ -2356,6 +2356,111 @@ def winsorize_transform_cli(
typer.echo(f"Winsorize transform completed, writing raster to {output_raster}.")


# NORMALIZE RASTER
@app.command()
def normalize_raster_cli(
    input_raster: Annotated[Path, INPUT_FILE_OPTION],
    output_raster: Annotated[Path, OUTPUT_FILE_OPTION],
):
    """Normalize input raster data."""
    # Imported lazily so that loading the CLI module stays cheap.
    from eis_toolkit.transformations.normalize import normalize

    typer.echo("Progress: 10%")

    # Read every band together with the metadata needed to write the result.
    with rasterio.open(input_raster) as raster:
        data = raster.read()
        out_meta = raster.meta.copy()
    typer.echo("Progress: 25%")

    # NOTE(review): the raw band values are passed on as read, so any nodata cells
    # take part in the min/max computation. Confirm whether masking is needed.
    out_image = normalize(data=data, array_type="raster")

    # Normalized values are fractions in [0, 1]. Reusing the dtype of the source
    # raster (e.g. an integer type) would discard the fractional part on write,
    # so the output dtype has to follow the transformed array.
    out_meta["dtype"] = out_image.dtype.name
    typer.echo("Progress: 75%")

    with rasterio.open(output_raster, "w", **out_meta) as dest:
        dest.write(out_image)
    typer.echo("Progress: 100%")

    typer.echo(f"Normalizing completed, output raster written to {output_raster}.")


# NORMALIZE VECTOR
@app.command()
def normalize_vector_cli(
    input_vector: Annotated[Path, INPUT_FILE_OPTION],
    output_vector: Annotated[Path, OUTPUT_FILE_OPTION],
    columns: Optional[List[str]] = None,
):
    """Normalize input vector data."""
    # Imported lazily so that loading the CLI module stays cheap.
    from eis_toolkit.transformations.normalize import normalize

    typer.echo("Progress: 10%")

    # Split the attribute table from the geometries: only attributes are
    # normalized, the geometries are re-attached unchanged afterwards.
    gdf = gpd.read_file(input_vector)
    df = pd.DataFrame(gdf.drop(columns="geometry"))
    geometries = gdf["geometry"]
    typer.echo("Progress: 25%")

    normalized_df = normalize(data=df, columns=columns)
    # Fixed: this message used to read "Progess 75%", which did not match the
    # "Progress: N%" format of all other progress lines.
    typer.echo("Progress: 75%")

    out_gdf = gpd.GeoDataFrame(normalized_df, geometry=geometries)
    out_gdf.to_file(output_vector)
    typer.echo("Progress: 100%")

    typer.echo(f"Normalizing completed, output vector written to {output_vector}.")


# STANDARDIZE RASTER
@app.command()
def standardize_raster_cli(
    input_raster: Annotated[Path, INPUT_FILE_OPTION], output_raster: Annotated[Path, OUTPUT_FILE_OPTION]
):
    """Standardize input raster data."""
    # Imported lazily so that loading the CLI module stays cheap.
    from eis_toolkit.transformations.standardize import standardize

    typer.echo("Progress: 10%")

    # Read every band together with the metadata needed to write the result.
    with rasterio.open(input_raster) as raster:
        data = raster.read()
        out_meta = raster.meta.copy()
    typer.echo("Progress: 25%")

    # NOTE(review): the raw band values are passed on as read, so any nodata cells
    # take part in the mean/std computation. Confirm whether masking is needed.
    out_image = standardize(data=data, array_type="raster")

    # Standardized values are signed fractions. Reusing the dtype of the source
    # raster (e.g. an unsigned integer type) would corrupt them on write, so the
    # output dtype has to follow the transformed array.
    out_meta["dtype"] = out_image.dtype.name
    typer.echo("Progress: 75%")

    with rasterio.open(output_raster, "w", **out_meta) as dest:
        dest.write(out_image)
    typer.echo("Progress: 100%")

    typer.echo(f"Standardizing completed, output raster written to {output_raster}.")


# STANDARDIZE VECTOR
@app.command()
def standardize_vector_cli(
    input_vector: Annotated[Path, INPUT_FILE_OPTION],
    output_vector: Annotated[Path, OUTPUT_FILE_OPTION],
    columns: Optional[List[str]] = None,
):
    """Standardize input vector data."""
    # Imported lazily so that loading the CLI module stays cheap.
    from eis_toolkit.transformations.standardize import standardize

    typer.echo("Progress: 10%")

    # Split the attribute table from the geometries: only attributes are
    # standardized, the geometries are re-attached unchanged afterwards.
    gdf = gpd.read_file(input_vector)
    df = pd.DataFrame(gdf.drop(columns="geometry"))
    geometries = gdf["geometry"]
    typer.echo("Progress: 25%")

    standardized_df = standardize(data=df, columns=columns)
    # Fixed: this message used to read "Progess: 75%" (misspelled), unlike all
    # other progress lines.
    typer.echo("Progress: 75%")

    out_gdf = gpd.GeoDataFrame(standardized_df, geometry=geometries)
    out_gdf.to_file(output_vector)
    typer.echo("Progress: 100%")

    typer.echo(f"Standardizing completed, output vector written to {output_vector}.")


# --- VALIDATION ---
# TODO

Expand Down
63 changes: 63 additions & 0 deletions eis_toolkit/transformations/normalize.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,63 @@
import numpy as np
import pandas as pd
from beartype import beartype
from beartype.typing import Literal, Optional, Sequence, Union

from eis_toolkit.exceptions import InvalidColumnException, InvalidDataShapeException, InvalidParameterValueException


@beartype
def normalize(
    data: Union[np.ndarray, pd.DataFrame],
    array_type: Literal["tabular", "raster"] = "tabular",
    columns: Optional[Sequence[str]] = None,
) -> Union[np.ndarray, pd.DataFrame]:
    """Normalize input data.

    Scales values to range [0, 1] using min-max scaling, `(x - min) / (max - min)`.

    Normalization is applied to each variable independently. For DataFrames, each column is
    treated as a variable and for Numpy arrays, either each column or each 2D array is treated
    as a variable based on `array_type` value.

    All arithmetic is done in float64, so integer inputs cannot overflow. A variable whose values
    are all equal has a zero range and the division therefore yields NaN for that variable.

    Args:
        data: Input data to be normalized, either a numpy array or a pandas DataFrame.
        array_type: Specifies how the data is interpreted if input is numpy array.
            `tabular` is used for 2D data where each column is a variable (data preparation for ML modeling),
            and `raster` for 2D/3D data where 2D array is a variable.
        columns: Column selection for DataFrame input, ignored if input is numpy array. Defaults to None
            (all found numeric columns used). An empty selection is treated like None.

    Returns:
        Normalized data in the input format. Numeric values are returned as float64. For DataFrame
        input, numeric columns that were not selected keep their values and non-numeric columns are
        returned unchanged.

    Raises:
        InvalidColumnException: If a selected column is not found in the input DataFrame.
        InvalidParameterValueException: If array type selection is invalid.
        InvalidDataShapeException: If shape of Numpy array is invalid for selected array type.
    """
    if isinstance(data, pd.DataFrame):
        numeric_columns = data.select_dtypes(include=[np.number]).columns
        if columns is None or len(columns) == 0:
            columns = numeric_columns
        for col in columns:
            if col not in data.columns:
                raise InvalidColumnException(f"Column {col} was not found in the input DataFrame.")

        # Cast only the numeric columns: casting the whole frame would fail as soon as
        # the table contains a non-numeric column (e.g. text attributes of vector data).
        out_data = data.astype({col: np.float64 for col in numeric_columns})
        for col in columns:
            # Work in float64 so that small integer dtypes cannot overflow in `max - min`.
            values = data[col].astype(np.float64)
            col_min = values.min()
            out_data[col] = (values - col_min) / (values.max() - col_min)
        return out_data

    # Numpy input: convert once so every branch computes in, and returns, float64.
    values = data.astype(np.float64)
    if array_type == "tabular":
        if values.ndim != 2:
            raise InvalidDataShapeException("Tabular data must be a 2D numpy array.")
        axis = 0  # One minimum/maximum per column
    elif array_type == "raster":
        if values.ndim == 2:  # Treat like a single-band raster
            axis = None
        elif values.ndim == 3:
            axis = (1, 2)  # One minimum/maximum per band
        else:
            raise InvalidDataShapeException("Raster data must be a 2D or 3D numpy array.")
    else:
        raise InvalidParameterValueException("array_type must be either 'tabular' or 'raster'.")

    # keepdims lets the per-variable statistics broadcast back against `values`.
    minimum = values.min(axis=axis, keepdims=True)
    maximum = values.max(axis=axis, keepdims=True)
    return (values - minimum) / (maximum - minimum)
61 changes: 61 additions & 0 deletions eis_toolkit/transformations/standardize.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,61 @@
import numpy as np
import pandas as pd
from beartype import beartype
from beartype.typing import Literal, Optional, Sequence, Union

from eis_toolkit.exceptions import InvalidColumnException, InvalidDataShapeException, InvalidParameterValueException


@beartype
def standardize(
    data: Union[np.ndarray, pd.DataFrame],
    array_type: Literal["tabular", "raster"] = "tabular",
    columns: Optional[Sequence[str]] = None,
) -> Union[np.ndarray, pd.DataFrame]:
    """Standardize input data.

    Scales data to have mean of 0 and standard deviation of 1, `(x - mean) / std`. The population
    standard deviation (`ddof=0`) is used.

    Standardization is applied to each variable independently. For DataFrames, each column is
    treated as a variable and for Numpy arrays, either each column or each 2D array is treated
    as a variable based on `array_type` value.

    All arithmetic is done in float64. A variable whose values are all equal has a standard
    deviation of zero and the division therefore yields NaN for that variable.

    Args:
        data: Input data to be standardized, either a numpy array or a pandas DataFrame.
        array_type: Specifies how the data is interpreted if input is numpy array.
            `tabular` is used for 2D data where each column is a variable (data preparation for ML modeling),
            and `raster` for 2D/3D data where 2D array is a variable.
        columns: Column selection for DataFrame input, ignored if input is numpy array. Defaults to None
            (all found numeric columns used). An empty selection is treated like None.

    Returns:
        Standardized data in the input format. Numeric values are returned as float64. For DataFrame
        input, numeric columns that were not selected keep their values and non-numeric columns are
        returned unchanged.

    Raises:
        InvalidColumnException: If a selected column is not found in the input DataFrame.
        InvalidParameterValueException: If array type selection is invalid.
        InvalidDataShapeException: If shape of Numpy array is invalid for selected array type.
    """
    if isinstance(data, pd.DataFrame):
        numeric_columns = data.select_dtypes(include=[np.number]).columns
        if columns is None or len(columns) == 0:
            columns = numeric_columns
        for col in columns:
            if col not in data.columns:
                raise InvalidColumnException(f"Column {col} was not found in the input DataFrame.")

        # Cast only the numeric columns: casting the whole frame would fail as soon as
        # the table contains a non-numeric column (e.g. text attributes of vector data).
        out_data = data.astype({col: np.float64 for col in numeric_columns})
        for col in columns:
            values = data[col].astype(np.float64)
            out_data[col] = (values - values.mean()) / values.std(ddof=0)
        return out_data

    # Numpy input: convert once so every branch computes in, and returns, float64.
    values = data.astype(np.float64)
    if array_type == "tabular":
        if values.ndim != 2:
            raise InvalidDataShapeException("Tabular data must be a 2D numpy array.")
        axis = 0  # One mean/std per column
    elif array_type == "raster":
        if values.ndim == 2:  # Treat like a single-band raster
            axis = None
        elif values.ndim == 3:
            axis = (1, 2)  # One mean/std per band
        else:
            raise InvalidDataShapeException("Raster data must be a 2D or 3D numpy array.")
    else:
        raise InvalidParameterValueException("array_type must be either 'tabular' or 'raster'.")

    # keepdims lets the per-variable statistics broadcast back against `values`.
    mean = values.mean(axis=axis, keepdims=True)
    std = values.std(axis=axis, ddof=0, keepdims=True)
    return (values - mean) / std
103 changes: 103 additions & 0 deletions tests/transformations/normalize_test.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,103 @@
import numpy as np
import pandas as pd
import pytest

from eis_toolkit.exceptions import InvalidColumnException, InvalidDataShapeException
from eis_toolkit.transformations.normalize import normalize

# Shared fixtures: one small table viewed as a DataFrame, a tabular array,
# a single-band raster and a two-band raster built from identical bands.
_COLUMN_VALUES = {
    "A": [1, 2, 3, 4, 5],  # range 1..5
    "B": [1, 1, 1, 2, 2],  # range 1..2
    "C": [1, 5, 10, 7, 9],  # range 1..10
}
DF = pd.DataFrame(data=_COLUMN_VALUES, dtype=np.float64)
ARRAY_TABULAR = DF.to_numpy()
ARRAY_RASTER = ARRAY_TABULAR  # Same values, interpreted as one 5x3 band
ARRAY_RASTER_3D = np.stack((ARRAY_RASTER, ARRAY_RASTER), axis=0)


def test_normalize_dataframe():
    """Check that every DataFrame column is scaled to [0, 1] on its own."""
    result = normalize(DF)
    assert isinstance(result, pd.DataFrame)

    # Columns A and B produce exactly representable fractions.
    exact_expectations = {
        "A": [0.0, 0.25, 0.5, 0.75, 1.0],
        "B": [0.0, 0.0, 0.0, 1.0, 1.0],
    }
    for name, expected in exact_expectations.items():
        np.testing.assert_array_equal(result[name].to_numpy(), expected)

    # Column C divides by 9, so it is only compared approximately.
    expected_c = [0.0, 0.4444, 1.0, 0.6667, 0.8889]
    np.testing.assert_array_almost_equal(result["C"].to_numpy(), expected_c, decimal=3)


def test_noramlize_dataframe_column_selection():
    """Check that only the selected DataFrame columns are normalized."""
    selected = ["A", "B"]
    result = normalize(DF, columns=selected)
    assert isinstance(result, pd.DataFrame)

    expected_by_column = {
        "A": [0.0, 0.25, 0.5, 0.75, 1.0],
        "B": [0.0, 0.0, 0.0, 1.0, 1.0],
        "C": [1, 5, 10, 7, 9],  # Not selected, so the values must be untouched
    }
    for name, expected in expected_by_column.items():
        np.testing.assert_array_equal(result[name].to_numpy(), expected)


def test_normalize_array_tabular():
    """Check that a tabular numpy array is normalized column by column."""
    result = normalize(ARRAY_TABULAR, array_type="tabular")
    assert isinstance(result, np.ndarray)
    np.testing.assert_equal(result.ndim, 2)

    first, second, third = result[:, 0], result[:, 1], result[:, 2]
    np.testing.assert_array_equal(first, [0.0, 0.25, 0.5, 0.75, 1.0])
    np.testing.assert_array_equal(second, [0.0, 0.0, 0.0, 1.0, 1.0])
    # The third column divides by 9, so it is only compared approximately.
    np.testing.assert_array_almost_equal(third, [0.0, 0.4444, 1.0, 0.6667, 0.8889], decimal=3)


def test_normalize_array_raster():
    """Check that a 2D raster array is normalized with one global minimum and maximum."""
    result = normalize(ARRAY_RASTER, array_type="raster")
    assert isinstance(result, np.ndarray)
    np.testing.assert_equal(result.ndim, 2)

    # Global range is 1..10, so every value is (x - 1) / 9.
    expected = [
        [0.0, 0.0, 0.0],
        [0.1111, 0.0, 0.4444],
        [0.2222, 0.0, 1.0],
        [0.3333, 0.111, 0.6667],
        [0.4444, 0.111, 0.8889],
    ]
    np.testing.assert_array_almost_equal(result, expected, 3)


def test_normalize_array_raster_3D():
    """Check that each band of a 3D raster array is normalized independently."""
    result = normalize(ARRAY_RASTER_3D, array_type="raster")
    assert isinstance(result, np.ndarray)
    np.testing.assert_equal(result.ndim, 3)

    # Both input bands are identical, so both output bands share the same expectation.
    expected_band = [
        [0.0, 0.0, 0.0],
        [0.1111, 0.0, 0.4444],
        [0.2222, 0.0, 1.0],
        [0.3333, 0.111, 0.6667],
        [0.4444, 0.111, 0.8889],
    ]
    np.testing.assert_array_almost_equal(result, [expected_band, expected_band], 3)


def test_normalize_dataframe_invalid_column():
    """Check that selecting a column missing from the DataFrame raises InvalidColumnException."""
    missing_selection = ["D"]
    with pytest.raises(InvalidColumnException):
        normalize(DF, columns=missing_selection)


def test_normalize_array_tabular_invalid_shape():
    """Check that a 3D array is rejected when it is declared as tabular data."""
    three_dimensional = ARRAY_RASTER_3D
    with pytest.raises(InvalidDataShapeException):
        normalize(three_dimensional, array_type="tabular")


def test_normalize_array_raster_invalid_shape():
    """Check that a 1D array is rejected when it is declared as raster data."""
    one_dimensional = ARRAY_RASTER[0]  # A single row of the 2D fixture
    with pytest.raises(InvalidDataShapeException):
        normalize(one_dimensional, array_type="raster")
Loading
Loading