diff --git a/docs/transformations/normalize.md b/docs/transformations/normalize.md
new file mode 100644
index 00000000..6ae88915
--- /dev/null
+++ b/docs/transformations/normalize.md
@@ -0,0 +1,3 @@
+# Normalize
+
+::: eis_toolkit.transformations.normalize
\ No newline at end of file
diff --git a/docs/transformations/standardize.md b/docs/transformations/standardize.md
new file mode 100644
index 00000000..ffb97f23
--- /dev/null
+++ b/docs/transformations/standardize.md
@@ -0,0 +1,3 @@
+# Standardize
+
+::: eis_toolkit.transformations.standardize
\ No newline at end of file
diff --git a/eis_toolkit/cli.py b/eis_toolkit/cli.py
index ae78b62d..2b792966 100644
--- a/eis_toolkit/cli.py
+++ b/eis_toolkit/cli.py
@@ -2356,6 +2356,115 @@ def winsorize_transform_cli(
     typer.echo(f"Winsorize transform completed, writing raster to {output_raster}.")
 
 
+# NORMALIZE RASTER
+@app.command()
+def normalize_raster_cli(
+    input_raster: Annotated[Path, INPUT_FILE_OPTION],
+    output_raster: Annotated[Path, OUTPUT_FILE_OPTION],
+):
+    """Normalize input raster data."""
+    from eis_toolkit.transformations.normalize import normalize
+
+    typer.echo("Progress: 10%")
+
+    with rasterio.open(input_raster) as raster:
+        data = raster.read()
+        out_meta = raster.meta.copy()
+    typer.echo("Progress: 25%")
+
+    out_image = normalize(data=data, array_type="raster")
+    # Scaled values are floating point; write them with their own dtype instead of the input raster's dtype.
+    out_meta["dtype"] = out_image.dtype.name
+    typer.echo("Progress: 75%")
+
+    with rasterio.open(output_raster, "w", **out_meta) as dest:
+        dest.write(out_image)
+    typer.echo("Progress: 100%")
+
+    typer.echo(f"Normalizing completed, output raster written to {output_raster}.")
+
+
+# NORMALIZE VECTOR
+@app.command()
+def normalize_vector_cli(
+    input_vector: Annotated[Path, INPUT_FILE_OPTION],
+    output_vector: Annotated[Path, OUTPUT_FILE_OPTION],
+    columns: Optional[List[str]] = None,
+):
+    """Normalize input vector data."""
+    from eis_toolkit.transformations.normalize import normalize
+
+    typer.echo("Progress: 10%")
+
+    gdf = gpd.read_file(input_vector)
+    df = pd.DataFrame(gdf.drop(columns="geometry"))
+    geometries = gdf["geometry"]
+    typer.echo("Progress: 25%")
+
+    normalized_df = normalize(data=df, columns=columns)
+    typer.echo("Progress: 75%")
+
+    out_gdf = gpd.GeoDataFrame(normalized_df, geometry=geometries)
+    out_gdf.to_file(output_vector)
+    typer.echo("Progress: 100%")
+
+    typer.echo(f"Normalizing completed, output vector written to {output_vector}.")
+
+
+# STANDARDIZE RASTER
+@app.command()
+def standardize_raster_cli(
+    input_raster: Annotated[Path, INPUT_FILE_OPTION], output_raster: Annotated[Path, OUTPUT_FILE_OPTION]
+):
+    """Standardize input raster data."""
+    from eis_toolkit.transformations.standardize import standardize
+
+    typer.echo("Progress: 10%")
+
+    with rasterio.open(input_raster) as raster:
+        data = raster.read()
+        out_meta = raster.meta.copy()
+    typer.echo("Progress: 25%")
+
+    out_image = standardize(data=data, array_type="raster")
+    # Standardized values are floating point; write them with their own dtype instead of the input raster's dtype.
+    out_meta["dtype"] = out_image.dtype.name
+    typer.echo("Progress: 75%")
+
+    with rasterio.open(output_raster, "w", **out_meta) as dest:
+        dest.write(out_image)
+    typer.echo("Progress: 100%")
+
+    typer.echo(f"Standardizing completed, output raster written to {output_raster}.")
+
+
+# STANDARDIZE VECTOR
+@app.command()
+def standardize_vector_cli(
+    input_vector: Annotated[Path, INPUT_FILE_OPTION],
+    output_vector: Annotated[Path, OUTPUT_FILE_OPTION],
+    columns: Optional[List[str]] = None,
+):
+    """Standardize input vector data."""
+    from eis_toolkit.transformations.standardize import standardize
+
+    typer.echo("Progress: 10%")
+
+    gdf = gpd.read_file(input_vector)
+    df = pd.DataFrame(gdf.drop(columns="geometry"))
+    geometries = gdf["geometry"]
+    typer.echo("Progress: 25%")
+
+    standardized_df = standardize(data=df, columns=columns)
+    typer.echo("Progress: 75%")
+
+    out_gdf = gpd.GeoDataFrame(standardized_df, geometry=geometries)
+    out_gdf.to_file(output_vector)
+    typer.echo("Progress: 100%")
+
+    typer.echo(f"Standardizing completed, output vector written to {output_vector}.")
+
+
 # ---VALIDATION ---
 # TODO
 
diff
--git a/eis_toolkit/transformations/normalize.py b/eis_toolkit/transformations/normalize.py
new file mode 100644
index 00000000..0eae8db8
--- /dev/null
+++ b/eis_toolkit/transformations/normalize.py
@@ -0,0 +1,73 @@
+import numpy as np
+import pandas as pd
+from beartype import beartype
+from beartype.typing import Literal, Optional, Sequence, Union
+
+from eis_toolkit.exceptions import InvalidColumnException, InvalidDataShapeException, InvalidParameterValueException
+
+
+def _min_max_scale(values: np.ndarray, axis=None) -> np.ndarray:
+    """Scale values to range [0,1] using the minimum and maximum computed over the given axis."""
+    minimum = values.min(axis=axis, keepdims=True)
+    value_range = values.max(axis=axis, keepdims=True) - minimum
+    # A constant variable has zero range; divide by 1 so it maps to 0 instead of NaN.
+    return (values - minimum) / np.where(value_range == 0, 1.0, value_range)
+
+
+@beartype
+def normalize(
+    data: Union[np.ndarray, pd.DataFrame],
+    array_type: Literal["tabular", "raster"] = "tabular",
+    columns: Optional[Sequence[str]] = None,
+) -> Union[np.ndarray, pd.DataFrame]:
+    """Normalize input data.
+
+    Scales values to range [0,1].
+
+    Normalization is applied to each variable independently. For DataFrames, each column is
+    treated as a variable and for Numpy arrays, either each column or each 2D array is treated
+    as a variable based on `array_type` value. A variable with zero range (all values equal) is
+    scaled to 0.
+
+    Args:
+        data: Input data to be normalized, either a numpy array or a pandas DataFrame.
+        array_type: Specifies how the data is interpreted if input is numpy array.
+            `tabular` is used for 2D data where each column is a variable (data preparation for ML modeling),
+            and `raster` for 2D/3D data where 2D array is a variable.
+        columns: Column selection for DataFrame input, ignored if input is numpy array. Defaults to None
+            (all found numeric columns used). Columns that are not selected are returned unchanged.
+
+    Returns:
+        Normalized data in the input format.
+
+    Raises:
+        InvalidColumnException: If a selected column is not found in the input DataFrame.
+        InvalidParameterValueException: If array type selection is invalid.
+        InvalidDataShapeException: If shape of Numpy array is invalid for selected array type.
+    """
+    if isinstance(data, pd.DataFrame):
+        # Copy without casting the whole frame to float so that non-numeric columns do not break the call.
+        out_data = data.copy()
+        if columns is None or len(columns) == 0:
+            columns = data.select_dtypes(include=[np.number]).columns
+        for col in columns:
+            if col not in data.columns:
+                raise InvalidColumnException(f"Column {col} was not found in the input DataFrame.")
+            minimum = data[col].min()
+            value_range = data[col].max() - minimum
+            # Zero range (constant column): divide by 1 so the column maps to 0 instead of NaN.
+            out_data[col] = (data[col] - minimum) / (value_range if value_range != 0 else 1.0)
+        return out_data
+
+    values = data.astype(np.float64)
+    if array_type == "tabular":
+        if values.ndim != 2:
+            raise InvalidDataShapeException("Tabular data must be a 2D numpy array.")
+        return _min_max_scale(values, axis=0)
+    if array_type == "raster":
+        if values.ndim == 2:  # Treat like a single-band raster
+            return _min_max_scale(values)
+        if values.ndim == 3:  # Each 2D band along the first axis is a variable
+            return _min_max_scale(values, axis=(1, 2))
+        raise InvalidDataShapeException("Raster data must be a 2D or 3D numpy array.")
+    raise InvalidParameterValueException("array_type must be either 'tabular' or 'raster'.")
diff --git a/eis_toolkit/transformations/standardize.py b/eis_toolkit/transformations/standardize.py
new file mode 100644
index 00000000..35b5b230
--- /dev/null
+++ b/eis_toolkit/transformations/standardize.py
@@ -0,0 +1,72 @@
+import numpy as np
+import pandas as pd
+from beartype import beartype
+from beartype.typing import Literal, Optional, Sequence, Union
+
+from eis_toolkit.exceptions import InvalidColumnException, InvalidDataShapeException, InvalidParameterValueException
+
+
+def _standard_scale(values: np.ndarray, axis=None) -> np.ndarray:
+    """Scale values to zero mean and unit population standard deviation computed over the given axis."""
+    mean = values.mean(axis=axis, keepdims=True)
+    std = values.std(axis=axis, ddof=0, keepdims=True)
+    # A constant variable has zero deviation; divide by 1 so it maps to 0 instead of NaN.
+    return (values - mean) / np.where(std == 0, 1.0, std)
+
+
+@beartype
+def standardize(
+    data: Union[np.ndarray, pd.DataFrame],
+    array_type: Literal["tabular", "raster"] = "tabular",
+    columns: Optional[Sequence[str]] = None,
+) -> Union[np.ndarray, pd.DataFrame]:
+    """Standardize input data.
+
+    Scales data to have a mean of 0 and a standard deviation of 1.
+
+    Standardization is applied to each variable independently. For DataFrames, each column is
+    treated as a variable and for Numpy arrays, either each column or each 2D array is treated
+    as a variable based on `array_type` value. The population standard deviation (ddof=0) is used,
+    and a variable with zero standard deviation (all values equal) is scaled to 0.
+
+    Args:
+        data: Input data to be standardized, either a numpy array or a pandas DataFrame.
+        array_type: Specifies how the data is interpreted if input is numpy array.
+            `tabular` is used for 2D data where each column is a variable (data preparation for ML modeling),
+            and `raster` for 2D/3D data where 2D array is a variable.
+        columns: Column selection for DataFrame input, ignored if input is numpy array. Defaults to None
+            (all found numeric columns used). Columns that are not selected are returned unchanged.
+
+    Returns:
+        Standardized data in the input format.
+
+    Raises:
+        InvalidColumnException: If a selected column is not found in the input DataFrame.
+        InvalidParameterValueException: If array type selection is invalid.
+        InvalidDataShapeException: If shape of Numpy array is invalid for selected array type.
+    """
+    if isinstance(data, pd.DataFrame):
+        # Copy without casting the whole frame to float so that non-numeric columns do not break the call.
+        out_data = data.copy()
+        if columns is None or len(columns) == 0:
+            columns = data.select_dtypes(include=[np.number]).columns
+        for col in columns:
+            if col not in data.columns:
+                raise InvalidColumnException(f"Column {col} was not found in the input DataFrame.")
+            std = data[col].std(ddof=0)
+            # Zero deviation (constant column): divide by 1 so the column maps to 0 instead of NaN.
+            out_data[col] = (data[col] - data[col].mean()) / (std if std != 0 else 1.0)
+        return out_data
+
+    values = data.astype(np.float64)
+    if array_type == "tabular":
+        if values.ndim != 2:
+            raise InvalidDataShapeException("Tabular data must be a 2D numpy array.")
+        return _standard_scale(values, axis=0)
+    if array_type == "raster":
+        if values.ndim == 2:  # Treat like a single-band raster
+            return _standard_scale(values)
+        if values.ndim == 3:  # Each 2D band along the first axis is a variable
+            return _standard_scale(values, axis=(1, 2))
+        raise InvalidDataShapeException("Raster data must be a 2D or 3D numpy array.")
+    raise InvalidParameterValueException("array_type must be either 'tabular' or 'raster'.")
diff --git
a/tests/transformations/normalize_test.py b/tests/transformations/normalize_test.py
new file mode 100644
index 00000000..8b0162b6
--- /dev/null
+++ b/tests/transformations/normalize_test.py
@@ -0,0 +1,103 @@
+import numpy as np
+import pandas as pd
+import pytest
+
+from eis_toolkit.exceptions import InvalidColumnException, InvalidDataShapeException
+from eis_toolkit.transformations.normalize import normalize
+
+DF = pd.DataFrame(
+    {
+        "A": [1, 2, 3, 4, 5],  # Min 1, max 5
+        "B": [1, 1, 1, 2, 2],  # Min 1, max 2
+        "C": [1, 5, 10, 7, 9],  # Min 1, max 10
+    },
+    dtype=np.float64,
+)
+ARRAY_TABULAR = DF.to_numpy()
+ARRAY_RASTER = ARRAY_TABULAR
+ARRAY_RASTER_3D = np.stack([ARRAY_RASTER, ARRAY_RASTER])
+
+
+def test_normalize_dataframe():
+    """Test that normalization of DataFrame works as expected."""
+    normalized_df = normalize(DF)
+    assert isinstance(normalized_df, pd.DataFrame)
+    np.testing.assert_array_equal(normalized_df["A"].to_numpy(), [0.0, 0.25, 0.5, 0.75, 1.0])
+    np.testing.assert_array_equal(normalized_df["B"].to_numpy(), [0.0, 0.0, 0.0, 1.0, 1.0])
+    np.testing.assert_array_almost_equal(normalized_df["C"].to_numpy(), [0.0, 0.4444, 1.0, 0.6667, 0.8889], decimal=3)
+
+
+def test_normalize_dataframe_column_selection():
+    """Test that normalization of DataFrame with column selection works as expected."""
+    normalized_df = normalize(DF, columns=["A", "B"])
+    assert isinstance(normalized_df, pd.DataFrame)
+    np.testing.assert_array_equal(normalized_df["A"].to_numpy(), [0.0, 0.25, 0.5, 0.75, 1.0])
+    np.testing.assert_array_equal(normalized_df["B"].to_numpy(), [0.0, 0.0, 0.0, 1.0, 1.0])
+    np.testing.assert_array_equal(normalized_df["C"].to_numpy(), [1, 5, 10, 7, 9])
+
+
+def test_normalize_array_tabular():
+    """Test that normalization of numpy array with tabular format works as expected."""
+    normalized_array = normalize(ARRAY_TABULAR, array_type="tabular")
+    assert isinstance(normalized_array, np.ndarray)
+    np.testing.assert_equal(normalized_array.ndim, 2)
+    np.testing.assert_array_equal(normalized_array[:, 0], [0.0, 0.25, 0.5, 0.75, 1.0])
+    np.testing.assert_array_equal(normalized_array[:, 1], [0.0, 0.0, 0.0, 1.0, 1.0])
+    np.testing.assert_array_almost_equal(normalized_array[:, 2], [0.0, 0.4444, 1.0, 0.6667, 0.8889], decimal=3)
+
+
+def test_normalize_array_raster():
+    """Test that normalization of 2D numpy array with raster format works as expected."""
+    normalized_array = normalize(ARRAY_RASTER, array_type="raster")
+    assert isinstance(normalized_array, np.ndarray)
+    np.testing.assert_equal(normalized_array.ndim, 2)
+    np.testing.assert_array_almost_equal(
+        normalized_array,
+        [[0.0, 0.0, 0.0], [0.1111, 0.0, 0.4444], [0.2222, 0.0, 1.0], [0.3333, 0.111, 0.6667], [0.4444, 0.111, 0.8889]],
+        3,
+    )
+
+
+def test_normalize_array_raster_3D():
+    """Test that normalization of 3D numpy array with raster format works as expected."""
+    normalized_array = normalize(ARRAY_RASTER_3D, array_type="raster")
+    assert isinstance(normalized_array, np.ndarray)
+    np.testing.assert_equal(normalized_array.ndim, 3)
+    np.testing.assert_array_almost_equal(
+        normalized_array,
+        [
+            [
+                [0.0, 0.0, 0.0],
+                [0.1111, 0.0, 0.4444],
+                [0.2222, 0.0, 1.0],
+                [0.3333, 0.111, 0.6667],
+                [0.4444, 0.111, 0.8889],
+            ],
+            [
+                [0.0, 0.0, 0.0],
+                [0.1111, 0.0, 0.4444],
+                [0.2222, 0.0, 1.0],
+                [0.3333, 0.111, 0.6667],
+                [0.4444, 0.111, 0.8889],
+            ],
+        ],
+        3,
+    )
+
+
+def test_normalize_dataframe_invalid_column():
+    """Test that invalid column selection raises the correct exception."""
+    with pytest.raises(InvalidColumnException):
+        normalize(DF, columns=["D"])
+
+
+def test_normalize_array_tabular_invalid_shape():
+    """Test that invalid input data shape for tabular format raises the correct exception."""
+    with pytest.raises(InvalidDataShapeException):
+        normalize(ARRAY_RASTER_3D, array_type="tabular")
+
+
+def test_normalize_array_raster_invalid_shape():
+    """Test that invalid input data shape for raster format raises the correct exception."""
+    with pytest.raises(InvalidDataShapeException):
+        normalize(ARRAY_RASTER[0], array_type="raster")
diff --git a/tests/transformations/standardize_test.py b/tests/transformations/standardize_test.py
new file mode 100644
index 00000000..946c87d8
--- /dev/null
+++ b/tests/transformations/standardize_test.py
@@ -0,0 +1,119 @@
+import numpy as np
+import pandas as pd
+import pytest
+
+from eis_toolkit.exceptions import InvalidColumnException, InvalidDataShapeException
+from eis_toolkit.transformations.standardize import standardize
+
+DF = pd.DataFrame(
+    {
+        "A": [1, 2, 3, 4, 5],  # Mean 3, population std sqrt(2)
+        "B": [1, 1, 1, 2, 2],  # Mean 1.4, population std sqrt(0.24)
+        "C": [1, 5, 10, 7, 9],  # Mean 6.4, population std sqrt(10.24) = 3.2
+    },
+    dtype=np.float64,
+)
+ARRAY_TABULAR = DF.to_numpy()
+ARRAY_RASTER = ARRAY_TABULAR
+ARRAY_RASTER_3D = np.stack([ARRAY_RASTER, ARRAY_RASTER])
+
+
+def test_standardize_dataframe():
+    """Test that standardization of DataFrame works as expected."""
+    standardized_df = standardize(DF)
+    assert isinstance(standardized_df, pd.DataFrame)
+    np.testing.assert_array_almost_equal(
+        standardized_df["A"].to_numpy(), [-1.4142, -0.7071, 0.0, 0.7071, 1.4142], decimal=3
+    )
+    np.testing.assert_array_almost_equal(
+        standardized_df["B"].to_numpy(), [-0.816, -0.816, -0.816, 1.225, 1.225], decimal=3
+    )
+    np.testing.assert_array_almost_equal(
+        standardized_df["C"].to_numpy(), [-1.688, -0.438, 1.125, 0.187, 0.812], decimal=3
+    )
+
+
+def test_standardize_dataframe_column_selection():
+    """Test that standardization of DataFrame with column selection works as expected."""
+    standardized_df = standardize(DF, columns=["A", "B"])
+    assert isinstance(standardized_df, pd.DataFrame)
+    np.testing.assert_array_almost_equal(
+        standardized_df["A"].to_numpy(), [-1.4142, -0.7071, 0.0, 0.7071, 1.4142], decimal=3
+    )
+    np.testing.assert_array_almost_equal(
+        standardized_df["B"].to_numpy(), [-0.816, -0.816, -0.816, 1.225, 1.225], decimal=3
+    )
+    np.testing.assert_array_equal(standardized_df["C"].to_numpy(), [1, 5, 10, 7, 9])
+
+
+def test_standardize_array_tabular():
+    """Test that standardization of numpy array with tabular format works as expected."""
+    standardized_array = standardize(ARRAY_TABULAR, array_type="tabular")
+    assert isinstance(standardized_array, np.ndarray)
+    np.testing.assert_equal(standardized_array.ndim, 2)
+    np.testing.assert_array_almost_equal(standardized_array[:, 0], [-1.4142, -0.7071, 0.0, 0.7071, 1.4142], decimal=3)
+    np.testing.assert_array_almost_equal(standardized_array[:, 1], [-0.816, -0.816, -0.816, 1.225, 1.225], decimal=3)
+    np.testing.assert_array_almost_equal(standardized_array[:, 2], [-1.688, -0.438, 1.125, 0.187, 0.812], decimal=3)
+
+
+def test_standardize_array_raster():
+    """Test that standardization of 2D numpy array with raster format works as expected."""
+    standardized_array = standardize(ARRAY_RASTER, array_type="raster")
+    assert isinstance(standardized_array, np.ndarray)
+    np.testing.assert_equal(standardized_array.ndim, 2)
+    np.testing.assert_array_almost_equal(
+        standardized_array,
+        [
+            [-0.8914, -0.8914, -0.8914],
+            [-0.5485, -0.8914, 0.4800],
+            [-0.2057, -0.8914, 2.1943],
+            [0.1371, -0.5486, 1.1657],
+            [0.4800, -0.5486, 1.8515],
+        ],
+        3,
+    )
+
+
+def test_standardize_array_raster_3D():
+    """Test that standardization of 3D numpy array with raster format works as expected."""
+    standardized_array = standardize(ARRAY_RASTER_3D, array_type="raster")
+    assert isinstance(standardized_array, np.ndarray)
+    np.testing.assert_equal(standardized_array.ndim, 3)
+    np.testing.assert_array_almost_equal(
+        standardized_array,
+        [
+            [
+                [-0.8914, -0.8914, -0.8914],
+                [-0.5485, -0.8914, 0.4800],
+                [-0.2057, -0.8914, 2.1943],
+                [0.1371, -0.5486, 1.1657],
+                [0.4800, -0.5486, 1.8515],
+            ],
+            [
+                [-0.8914, -0.8914, -0.8914],
+                [-0.5485, -0.8914, 0.4800],
+                [-0.2057, -0.8914, 2.1943],
+                [0.1371, -0.5486, 1.1657],
+                [0.4800, -0.5486, 1.8515],
+            ],
+        ],
+        3,
+    )
+
+
+def test_standardize_dataframe_invalid_column():
+    """Test that invalid column selection raises the correct exception."""
+    with pytest.raises(InvalidColumnException):
+        standardize(DF, columns=["D"])
+
+
+def test_standardize_array_tabular_invalid_shape():
+    """Test that invalid input data shape for tabular format raises the correct exception."""
+    with pytest.raises(InvalidDataShapeException):
+        standardize(ARRAY_RASTER_3D, array_type="tabular")
+
+
+def test_standardize_array_raster_invalid_shape():
+    """Test that invalid input data shape for raster format raises the correct exception."""
+    with pytest.raises(InvalidDataShapeException):
+        standardize(ARRAY_RASTER[0], array_type="raster")