Merge pull request #192 from GispoCoding/191-add-smotetomek-for-class…

…-imbalance-handling 191 add SMOTETomek for class balancing
GispoCoding · Oct 10, 2023 · 7e9da81 · 7e9da81
2 parents e36407a + cf143bf
commit 7e9da81
Show file tree

Hide file tree

Showing 8 changed files with 2,500 additions and 1,558 deletions.
diff --git a/docs/data_tools/class_balancing.md b/docs/data_tools/class_balancing.md
@@ -0,0 +1,3 @@
+# Class balancing
+
+::: eis_toolkit.data_tools.class_balancing
diff --git a/eis_toolkit/data_tools/__init__.py b/eis_toolkit/data_tools/__init__.py
diff --git a/eis_toolkit/data_tools/class_balancing.py b/eis_toolkit/data_tools/class_balancing.py
@@ -0,0 +1,44 @@
+from typing import Optional, Union
+
+import numpy as np
+import pandas as pd
+from beartype import beartype
+from imblearn.combine import SMOTETomek
+
+from eis_toolkit import exceptions
+
+
+@beartype
+def balance_SMOTETomek(
+    X: Union[pd.DataFrame, np.ndarray],
+    y: Union[pd.Series, np.ndarray],
+    sampling_strategy: Union[float, str, dict] = "auto",
+    random_state: Optional[int] = None,
+) -> tuple[Union[pd.DataFrame, np.ndarray], Union[pd.Series, np.ndarray]]:
+    """Balances the classes of input dataset using SMOTETomek resampling method.
+
+    Args:
+        X: The feature matrix (input data as a DataFrame).
+        y: The target labels corresponding to the feature matrix.
+        sampling_strategy: Parameter controlling how to perform the resampling.
+            If float, specifies the ratio of samples in minority class to samples of majority class,
+            if str, specifies classes to be resampled ("minority", "not minority", "not majority", "all", "auto"),
+            if dict, the keys should be targeted classes and values the desired number of samples for the class.
+            Defaults to "auto", which will resample all classes except the majority class.
+        random_state: Parameter controlling randomization of the algorithm. Can be given a seed (number).
+            Defaults to None, which randomizes the seed.
+
+    Returns:
+        Resampled feature matrix and target labels.
+
+    Raises:
+        NonMatchingParameterLengthsException: If X and y have different length.
+    """
+
+    if len(X) != len(y):
+        raise exceptions.NonMatchingParameterLengthsException(
+            "Feature matrix X and target labels y must have the same length."
+        )
+
+    X_res, y_res = SMOTETomek(sampling_strategy=sampling_strategy, random_state=random_state).fit_resample(X, y)
+    return X_res, y_res
diff --git a/environment.yml b/environment.yml
@@ -20,5 +20,6 @@ dependencies:
   - seaborn >=0.12.2
   - pykrige >=1.7.0
   - rtree >= 1.0.1
+  - imbalanced-learn >= 0.11.0
   # Dependencies for testing
   - pytest >=7.2.1
diff --git a/poetry.lock b/poetry.lock
diff --git a/pyproject.toml b/pyproject.toml
@@ -2,7 +2,7 @@
 name = "eis_toolkit"
 version = "0.1.0"
 description = "EIS Toolkit is a comprehensive collection of tools suitable for mineral prospectivity mapping. This toolkit has been developed as part of the Exploration Information System project which has been funded by European Union."
-authors = ["pavetsu14 <[email protected]>"]
+authors = []
 maintainers = ["Gispo Ltd. <[email protected]>"]
 license = "EUPL-1.2"
 readme = "README.md"
@@ -33,6 +33,7 @@ beartype = "^0.13.1"
 seaborn = "^0.12.2"
 pykrige = "^1.7.0"
 rtree = "^1.0.1"
+imbalanced-learn = "^0.11.0"
 
 [tool.poetry.dev-dependencies]
 jupyterlab = "^3.4.5"

diff --git a/requirements.txt b/requirements.txt
@@ -24,9 +24,10 @@ google-pasta==0.2.0 ; python_version >= "3.8" and python_version < "3.11"
 grpcio==1.48.1 ; python_version >= "3.8" and python_version < "3.11"
 h5py==3.7.0 ; python_version >= "3.8" and python_version < "3.11"
 idna==3.3 ; python_version >= "3.8" and python_version < "3.11"
+imbalanced-learn==0.11.0 ; python_version >= "3.8" and python_version < "3.11"
 importlib-metadata==4.12.0 ; python_version >= "3.8" and python_version < "3.11"
 jinja2==3.1.2 ; python_version >= "3.8" and python_version < "3.11"
-joblib==1.1.0 ; python_version >= "3.8" and python_version < "3.11"
+joblib==1.3.2 ; python_version >= "3.8" and python_version < "3.11"
 keras-preprocessing==1.1.2 ; python_version >= "3.8" and python_version < "3.11"
 keras==2.9.0 ; python_version >= "3.8" and python_version < "3.11"
 kiwisolver==1.4.4 ; python_version >= "3.8" and python_version < "3.11"

diff --git a/tests/data_tools/class_balancing_test.py b/tests/data_tools/class_balancing_test.py
@@ -0,0 +1,42 @@
+import numpy as np
+import pytest
+from sklearn.datasets import make_classification
+
+from eis_toolkit import exceptions
+from eis_toolkit.data_tools.class_balancing import balance_SMOTETomek
+
+# CREATE TEST DATA
+X, y = make_classification(
+    n_classes=2,
+    class_sep=2,
+    weights=[0.1, 0.9],
+    n_informative=3,
+    n_redundant=1,
+    flip_y=0,
+    n_features=20,
+    n_clusters_per_class=1,
+    n_samples=1000,
+    random_state=10,
+)
+
+
+def test_SMOTETomek():
+    """Test that balance_SMOTETomek function works as expected."""
+    assert not np.array_equal(np.count_nonzero(y == 0), np.count_nonzero(y == 1))  # Class imbalance before balancing
+
+    X_res, y_res = balance_SMOTETomek(X, y)
+
+    np.testing.assert_equal(len(X_res), len(y_res))
+    np.testing.assert_equal(np.count_nonzero(y_res == 0), np.count_nonzero(y_res == 1))  # Class balance after balancing
+
+
+def test_invalid_label_length():
+    """Test that different length for feature matrix and labels raises the correct exception."""
+    with pytest.raises(exceptions.NonMatchingParameterLengthsException):
+        balance_SMOTETomek(X, np.append(y, "C"))
+
+
+def test_invalid_sampling_strategy():
+    """Test that invalid value for sampling strategy raises the correct exception (generated by imblearn)."""
+    with pytest.raises(ValueError):
+        balance_SMOTETomek(X, y, sampling_strategy="invalid_strategy")
Original file line number	Diff line number	Diff line change
		@@ -0,0 +1,3 @@
		# Class balancing

		::: eis_toolkit.data_tools.class_balancing