Skip to content

Commit

Permalink
Merge: Fix rare degenerate comp_df bug (#399)
Browse files Browse the repository at this point in the history
- Adds a utility `add_noise_to_perturb_degenerate_rows` to add noise to
degenerate rows of a numerical dataframe
- Adds a test for the utility
- Disallows `CustomDiscreteParameter` accepting `data` that has
degenerate rows
- Adds input tests for `CustomDiscreteParameter`
- Adds the fix to the `comp_df` of `SubstanceParameter`
- End-to-end test for degenerate substance encoding
  • Loading branch information
Scienfitz authored Nov 11, 2024
2 parents b7297fa + 2666f0e commit 63fdaff
Show file tree
Hide file tree
Showing 8 changed files with 147 additions and 14 deletions.
6 changes: 6 additions & 0 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -8,13 +8,19 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
### Added
- `allow_missing` and `allow_extra` keyword arguments to `Objective.transform`
- Example for a traditional mixture
- `add_noise_to_perturb_degenerate_rows` utility

### Changed
- `SubstanceParameter` encodings are now computed exclusively with the
`scikit-fingerprints` package, granting access to all fingerprints available therein
- Example for slot-based mixtures has been revised and grouped together with the new
traditional mixture example
- Memory caching is now non-verbose
- `CustomDiscreteParameter` no longer allows duplicated rows in `data`

### Fixed
- Rare bug arising from degenerate `SubstanceParameter.comp_df` rows that caused
  the wrong number of recommendations to be returned

### Deprecations
- Passing a dataframe via the `data` argument to `Objective.transform` is no longer
Expand Down
8 changes: 8 additions & 0 deletions baybe/parameters/custom.py
Original file line number Diff line number Diff line change
Expand Up @@ -58,6 +58,7 @@ def _validate_custom_data( # noqa: DOC101, DOC103
ValueError: If the dataframe contains ``NaN``.
ValueError: If the dataframe index contains duplicates.
ValueError: If the dataframe contains columns with only one unique value.
ValueError: If the dataframe contains duplicated rows.
"""
if value.select_dtypes("number").shape[1] != value.shape[1]:
raise ValueError(
Expand Down Expand Up @@ -89,6 +90,13 @@ def _validate_custom_data( # noqa: DOC101, DOC103
f"The custom dataframe for parameter {self.name} has columns "
"that contain only a single value and hence carry no information."
)
if value.duplicated().any():
raise ValueError(
f"The custom dataframe for parameter {self.name} has duplicated rows. "
f"This is not supported because it can lead to ambiguous computational "
f"representations of candidate points. Please ensure all labels have a "
f"unique numerical representation."
)

@override
@property
Expand Down
9 changes: 8 additions & 1 deletion baybe/parameters/substance.py
Original file line number Diff line number Diff line change
Expand Up @@ -13,7 +13,11 @@
from baybe.parameters.enum import SubstanceEncoding
from baybe.parameters.validation import validate_decorrelation
from baybe.utils.basic import group_duplicate_values
from baybe.utils.dataframe import df_drop_single_value_columns, df_uncorrelated_features
from baybe.utils.dataframe import (
add_noise_to_perturb_degenerate_rows,
df_drop_single_value_columns,
df_uncorrelated_features,
)

try: # For python < 3.11, use the exceptiongroup backport
ExceptionGroup
Expand Down Expand Up @@ -150,6 +154,9 @@ def comp_df(self) -> pd.DataFrame:
else:
comp_df = df_uncorrelated_features(comp_df, threshold=self.decorrelate)

# Add noise to degenerate rows if present
add_noise_to_perturb_degenerate_rows(comp_df)

return comp_df


Expand Down
49 changes: 49 additions & 0 deletions baybe/utils/dataframe.py
Original file line number Diff line number Diff line change
Expand Up @@ -360,6 +360,55 @@ def df_uncorrelated_features(
return data


def add_noise_to_perturb_degenerate_rows(
    df: pd.DataFrame, noise_ratio: float = 0.001
) -> pd.DataFrame:
    """Add noise to degenerate rows to make them numerically distinguishable.

    Note that the dataframe is changed in-place and also returned. The dataframe is
    left untouched if no rows are degenerate.

    Args:
        df: The dataframe to be modified.
        noise_ratio: The magnitude of generated uniform noise relative to the
            min-max range of values for each column.

    Returns:
        The modified dataframe.

    Raises:
        TypeError: If the provided dataframe has non-numerical content.
    """
    # Find degenerate rows (``keep=False`` marks *all* members of each duplicate
    # group), exit early if there are none
    degen_rows = df.duplicated(keep=False)
    if not degen_rows.any():
        return df

    # Assert that the input is purely numerical (int, unsigned int, float, bool)
    if any(dtype.kind not in "iufb" for dtype in df.dtypes):
        raise TypeError(
            f"'{add_noise_to_perturb_degenerate_rows.__name__}' only supports purely "
            f"numerical dataframes."
        )

    # Find the min-max range for each column. Constant columns will be assigned a
    # range of 1 as fallback as otherwise they would be left untouched
    column_ranges = df.max() - df.min()
    column_ranges = column_ranges.replace(0, 1)

    # Generate uniform noise for the degenerate rows only.
    # NOTE(review): for integer/boolean columns the small added noise is truncated
    # on assignment and may not break the degeneracy — confirm callers pass floats.
    noise = np.random.uniform(
        -noise_ratio, noise_ratio, size=(degen_rows.sum(), df.shape[1])
    )
    noise_df = pd.DataFrame(noise, columns=df.columns, index=df.index[degen_rows])

    # Scale noise by column ranges and add it to the original dataframe
    noise_df = noise_df * column_ranges
    df.loc[degen_rows] += noise_df

    return df


def fuzzy_row_match(
left_df: pd.DataFrame,
right_df: pd.DataFrame,
Expand Down
7 changes: 1 addition & 6 deletions tests/test_custom_parameter.py
Original file line number Diff line number Diff line change
@@ -1,9 +1,4 @@
"""Test for initial simple input, recommendation and adding fake results.
Fake target measurements are simulated for each round. Noise is added every second
round. From the three recommendations only one is actually added to test the matching
and metadata. Target objective is minimize to test computational transformation.
"""
"""Tests for the custom parameter."""

import pytest

Expand Down
21 changes: 21 additions & 0 deletions tests/test_substance_parameter.py
Original file line number Diff line number Diff line change
Expand Up @@ -19,3 +19,24 @@
def test_run_iterations(campaign, batch_size, n_iterations):
    """Test running some iterations with fake results and a substance parameter.

    NOTE(review): ``campaign``, ``batch_size`` and ``n_iterations`` are supplied
    by fixtures/parametrizations from the surrounding test setup — confirm there.
    """
    run_iterations(campaign, n_iterations, batch_size)


@pytest.mark.skipif(
    not CHEM_INSTALLED, reason="Optional chem dependency not installed."
)
def test_degenerate_comp_df():
    """Test a degenerate comp_df is detected and fixed numerically."""
    # Local import so the module can be collected without the chem extras installed
    from baybe.parameters import SubstanceParameter

    # These molecules are known to cause a degenerate (duplicated-row)
    # representation with rdkit fingerprints
    dict_base = {
        "Potassium acetate": r"O=C([O-])C.[K+]",
        "Potassium pivalate": r"O=C([O-])C(C)(C)C.[K+]",
        "Cesium acetate": r"O=C([O-])C.[Cs+]",
        "Cesium pivalate": r"O=C([O-])C(C)(C)C.[Cs+]",
    }
    p = SubstanceParameter(name="p", data=dict_base, encoding="RDKITFINGERPRINT")

    assert (
        not p.comp_df.duplicated().any()
    ), "A degenerate comp_df was not correctly treated."
43 changes: 43 additions & 0 deletions tests/utils/test_dataframe.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,43 @@
"""Tests for dataframe utilities."""

import numpy as np
import pandas as pd
import pytest

from baybe.utils.dataframe import add_noise_to_perturb_degenerate_rows


def test_degenerate_rows():
    """Test noise-based deduplication of degenerate rows."""
    # Start from a random numerical dataframe
    frame = pd.DataFrame(np.random.randint(0, 100, size=(5, 3))).astype(float)

    # Inject degeneracy: two pairs of identical rows plus a constant last
    # column (the constant-column edge case)
    frame.loc[1] = frame.loc[0]
    frame.loc[3] = frame.loc[2]
    frame.iloc[:, -1] = 50.0

    # Perturb the degenerate rows in-place
    add_noise_to_perturb_degenerate_rows(frame)

    # Every row must now be numerically distinct
    assert (
        not frame.duplicated().any()
    ), "Degenerate rows were not fixed by the utility."


def test_degenerate_rows_invalid_input():
    """Test that the utility correctly handles invalid input."""
    # Random numerical dataframe with two pairs of duplicated rows
    frame = pd.DataFrame(np.random.randint(0, 100, size=(5, 3))).astype(float)
    frame.loc[1] = frame.loc[0]
    frame.loc[3] = frame.loc[2]

    # Sprinkle in non-numerical entries (cast to object first so pandas does
    # not emit column dtype warnings)
    frame = frame.astype(object)
    frame["invalid"] = "A"
    frame.iloc[1, 0] = "B"

    # The utility must reject non-numerical content
    with pytest.raises(TypeError):
        add_noise_to_perturb_degenerate_rows(frame)
18 changes: 11 additions & 7 deletions tests/validation/test_parameter_validation.py
Original file line number Diff line number Diff line change
Expand Up @@ -167,14 +167,18 @@ def test_invalid_encoding_substance_parameter():
@pytest.mark.parametrize(
"data",
[
param(pd.DataFrame([[1, 2], [3, np.nan]], index=["A", "B"]), id="nan"),
param(pd.DataFrame([[1, 2], [3, np.inf]], index=["A", "B"]), id="infinity"),
param(pd.DataFrame([[1, 2], [3, 4]], index=["A", "A"]), id="duplicates"),
param(pd.DataFrame([[1, 2]], index=["A"]), id="only_one_value"),
param(pd.DataFrame([[1, 2], [1, 2]], index=["A", "B"]), id="zero_variance"),
param(pd.DataFrame([[1, 2], [3, np.nan]], index=["A", "B"]), id="contains_nan"),
param(pd.DataFrame([[1, 2], [3, np.inf]], index=["A", "B"]), id="contains_inf"),
param(pd.DataFrame([[1, 2], [3, 4]], index=["A", "A"]), id="duplicate_idxs"),
param(pd.DataFrame([[1, 2]], index=["A"]), id="wrong_label_number"),
param(pd.DataFrame([[1, 2], [1, 2]], index=["A", "B"]), id="zero_var_col"),
param(pd.DataFrame([[1, 2], [3, "a"]], index=["A", "B"]), id="wrong_type"),
param(pd.DataFrame([[1, 2], [3, 4]], index=["A", 1]), id="not_a_string"),
param(pd.DataFrame([[1, 2], [3, 4]], index=["A", ""]), id="empty_string"),
param(pd.DataFrame([[1, 2], [3, 4]], index=["A", 1]), id="non_string_idx"),
param(pd.DataFrame([[1, 2], [3, 4]], index=["A", ""]), id="empty_string_idx"),
param(
pd.DataFrame([[1, 2], [1, 2], [3, 4]], index=["A", "B", "C"]),
id="duplicate_rows",
),
],
)
def test_invalid_data_custom_parameter(data):
Expand Down

0 comments on commit 63fdaff

Please sign in to comment.