Skip to content

Commit

Permalink
Merge: Fix rare degenerate comp_df bug (#399)
Browse files Browse the repository at this point in the history
- Adds a utility `add_noise_to_perturb_degenerate_rows` to add noise to
degenerate rows of a numerical dataframe
- Adds a test for the utility
- Disallows `CustomDiscreteParameter` accepting `data` that has
degenerate rows
- Adds input tests for `CustomDiscreteParameter`
- Adds the fix to the `comp_df` of `SubstanceParameter`
- End-to-end test for degenerate substance encoding
  • Loading branch information
Scienfitz authored Nov 11, 2024
2 parents b7297fa + 2666f0e commit 63fdaff
Show file tree
Hide file tree
Showing 8 changed files with 147 additions and 14 deletions.
6 changes: 6 additions & 0 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -8,13 +8,19 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
### Added
- `allow_missing` and `allow_extra` keyword arguments to `Objective.transform`
- Example for a traditional mixture
- `add_noise_to_perturb_degenerate_rows` utility

### Changed
- `SubstanceParameter` encodings are now computed exclusively with the
`scikit-fingerprints` package, granting access to all fingerprints available therein
- Example for slot-based mixtures has been revised and grouped together with the new
traditional mixture example
- Memory caching is now non-verbose
- `CustomDiscreteParameter` no longer allows duplicated rows in `data`

### Fixed
- Rare bug arising from degenerate `SubstanceParameter.comp_df` rows that caused
  the wrong number of recommendations to be returned

### Deprecations
- Passing a dataframe via the `data` argument to `Objective.transform` is no longer
Expand Down
8 changes: 8 additions & 0 deletions baybe/parameters/custom.py
Original file line number Diff line number Diff line change
Expand Up @@ -58,6 +58,7 @@ def _validate_custom_data( # noqa: DOC101, DOC103
ValueError: If the dataframe contains ``NaN``.
ValueError: If the dataframe index contains duplicates.
ValueError: If the dataframe contains columns with only one unique value.
ValueError: If the dataframe contains duplicated rows.
"""
if value.select_dtypes("number").shape[1] != value.shape[1]:
raise ValueError(
Expand Down Expand Up @@ -89,6 +90,13 @@ def _validate_custom_data( # noqa: DOC101, DOC103
f"The custom dataframe for parameter {self.name} has columns "
"that contain only a single value and hence carry no information."
)
if value.duplicated().any():
raise ValueError(
f"The custom dataframe for parameter {self.name} has duplicated rows. "
f"This is not supported because it can lead to ambiguous computational "
f"representations of candidate points. Please ensure all labels have a "
f"unique numerical representation."
)

@override
@property
Expand Down
9 changes: 8 additions & 1 deletion baybe/parameters/substance.py
Original file line number Diff line number Diff line change
Expand Up @@ -13,7 +13,11 @@
from baybe.parameters.enum import SubstanceEncoding
from baybe.parameters.validation import validate_decorrelation
from baybe.utils.basic import group_duplicate_values
from baybe.utils.dataframe import df_drop_single_value_columns, df_uncorrelated_features
from baybe.utils.dataframe import (
add_noise_to_perturb_degenerate_rows,
df_drop_single_value_columns,
df_uncorrelated_features,
)

try: # For python < 3.11, use the exceptiongroup backport
ExceptionGroup
Expand Down Expand Up @@ -150,6 +154,9 @@ def comp_df(self) -> pd.DataFrame:
else:
comp_df = df_uncorrelated_features(comp_df, threshold=self.decorrelate)

# Add noise to degenerate rows if present
add_noise_to_perturb_degenerate_rows(comp_df)

return comp_df


Expand Down
49 changes: 49 additions & 0 deletions baybe/utils/dataframe.py
Original file line number Diff line number Diff line change
Expand Up @@ -360,6 +360,55 @@ def df_uncorrelated_features(
return data


def add_noise_to_perturb_degenerate_rows(
    df: pd.DataFrame, noise_ratio: float = 0.001
) -> pd.DataFrame:
    """Add noise to degenerate rows to make them numerically distinguishable.

    Note that the dataframe is changed in-place and also returned. The dataframe is
    left untouched if no rows are degenerate.

    Args:
        df: The dataframe to be modified.
        noise_ratio: The magnitude of generated uniform noise relative to the
            min-max range of values for each column.

    Returns:
        The modified dataframe.

    Raises:
        TypeError: If the provided dataframe has non-numerical content.
    """
    # Find degenerate rows (``keep=False`` marks *all* members of each duplicate
    # group), exit early if there are none
    degen_rows = df.duplicated(keep=False)
    if not degen_rows.any():
        return df

    # Assert that the input is purely numerical (int, unsigned int, float, bool)
    if any(dtype.kind not in "iufb" for dtype in df.dtypes):
        raise TypeError(
            f"'{add_noise_to_perturb_degenerate_rows.__name__}' only supports purely "
            f"numerical dataframes."
        )

    # Find the min-max range for each column. Constant columns will be assigned a
    # range of 1 as fallback as otherwise they would be left untouched
    column_ranges = df.max() - df.min()
    column_ranges = column_ranges.replace(0, 1)

    # Generate uniform noise for the degenerate rows only.
    # NOTE(review): for integer/boolean columns the small added noise is truncated
    # on assignment and may not break the degeneracy — confirm callers pass floats.
    noise = np.random.uniform(
        -noise_ratio, noise_ratio, size=(degen_rows.sum(), df.shape[1])
    )
    noise_df = pd.DataFrame(noise, columns=df.columns, index=df.index[degen_rows])

    # Scale noise by column ranges and add it to the original dataframe
    noise_df = noise_df * column_ranges
    df.loc[degen_rows] += noise_df

    return df


def fuzzy_row_match(
left_df: pd.DataFrame,
right_df: pd.DataFrame,
Expand Down
7 changes: 1 addition & 6 deletions tests/test_custom_parameter.py
Original file line number Diff line number Diff line change
@@ -1,9 +1,4 @@
"""Test for initial simple input, recommendation and adding fake results.
Fake target measurements are simulated for each round. Noise is added every second
round. From the three recommendations only one is actually added to test the matching
and metadata. Target objective is minimize to test computational transformation.
"""
"""Tests for the custom parameter."""

import pytest

Expand Down
21 changes: 21 additions & 0 deletions tests/test_substance_parameter.py
Original file line number Diff line number Diff line change
Expand Up @@ -19,3 +19,24 @@
def test_run_iterations(campaign, batch_size, n_iterations):
    """Test running some iterations with fake results and a substance parameter.

    NOTE(review): ``campaign``, ``batch_size`` and ``n_iterations`` are supplied
    by fixtures/parametrizations from the surrounding test setup — confirm there.
    """
    run_iterations(campaign, n_iterations, batch_size)


@pytest.mark.skipif(
    not CHEM_INSTALLED, reason="Optional chem dependency not installed."
)
def test_degenerate_comp_df():
    """Test a degenerate comp_df is detected and fixed numerically."""
    # Local import so the module can be collected without the chem extras installed
    from baybe.parameters import SubstanceParameter

    # These molecules are known to cause a degenerate (duplicated-row)
    # representation with rdkit fingerprints
    dict_base = {
        "Potassium acetate": r"O=C([O-])C.[K+]",
        "Potassium pivalate": r"O=C([O-])C(C)(C)C.[K+]",
        "Cesium acetate": r"O=C([O-])C.[Cs+]",
        "Cesium pivalate": r"O=C([O-])C(C)(C)C.[Cs+]",
    }
    p = SubstanceParameter(name="p", data=dict_base, encoding="RDKITFINGERPRINT")

    assert (
        not p.comp_df.duplicated().any()
    ), "A degenerate comp_df was not correctly treated."
43 changes: 43 additions & 0 deletions tests/utils/test_dataframe.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,43 @@
"""Tests for dataframe utilities."""

import numpy as np
import pandas as pd
import pytest

from baybe.utils.dataframe import add_noise_to_perturb_degenerate_rows


def test_degenerate_rows():
    """Test noise-based deduplication of degenerate rows."""
    # Start from a random numerical dataframe
    frame = pd.DataFrame(np.random.randint(0, 100, size=(5, 3))).astype(float)

    # Inject degeneracy: two pairs of identical rows plus a constant last
    # column (the constant-column edge case)
    frame.loc[1] = frame.loc[0]
    frame.loc[3] = frame.loc[2]
    frame.iloc[:, -1] = 50.0

    # Perturb the degenerate rows in-place
    add_noise_to_perturb_degenerate_rows(frame)

    # Every row must now be numerically distinct
    assert (
        not frame.duplicated().any()
    ), "Degenerate rows were not fixed by the utility."


def test_degenerate_rows_invalid_input():
    """Test that the utility correctly handles invalid input."""
    # Random numerical dataframe with two pairs of duplicated rows
    frame = pd.DataFrame(np.random.randint(0, 100, size=(5, 3))).astype(float)
    frame.loc[1] = frame.loc[0]
    frame.loc[3] = frame.loc[2]

    # Sprinkle in non-numerical entries (cast to object first so pandas does
    # not emit column dtype warnings)
    frame = frame.astype(object)
    frame["invalid"] = "A"
    frame.iloc[1, 0] = "B"

    # The utility must reject non-numerical content
    with pytest.raises(TypeError):
        add_noise_to_perturb_degenerate_rows(frame)
18 changes: 11 additions & 7 deletions tests/validation/test_parameter_validation.py
Original file line number Diff line number Diff line change
Expand Up @@ -167,14 +167,18 @@ def test_invalid_encoding_substance_parameter():
@pytest.mark.parametrize(
"data",
[
param(pd.DataFrame([[1, 2], [3, np.nan]], index=["A", "B"]), id="nan"),
param(pd.DataFrame([[1, 2], [3, np.inf]], index=["A", "B"]), id="infinity"),
param(pd.DataFrame([[1, 2], [3, 4]], index=["A", "A"]), id="duplicates"),
param(pd.DataFrame([[1, 2]], index=["A"]), id="only_one_value"),
param(pd.DataFrame([[1, 2], [1, 2]], index=["A", "B"]), id="zero_variance"),
param(pd.DataFrame([[1, 2], [3, np.nan]], index=["A", "B"]), id="contains_nan"),
param(pd.DataFrame([[1, 2], [3, np.inf]], index=["A", "B"]), id="contains_inf"),
param(pd.DataFrame([[1, 2], [3, 4]], index=["A", "A"]), id="duplicate_idxs"),
param(pd.DataFrame([[1, 2]], index=["A"]), id="wrong_label_number"),
param(pd.DataFrame([[1, 2], [1, 2]], index=["A", "B"]), id="zero_var_col"),
param(pd.DataFrame([[1, 2], [3, "a"]], index=["A", "B"]), id="wrong_type"),
param(pd.DataFrame([[1, 2], [3, 4]], index=["A", 1]), id="not_a_string"),
param(pd.DataFrame([[1, 2], [3, 4]], index=["A", ""]), id="empty_string"),
param(pd.DataFrame([[1, 2], [3, 4]], index=["A", 1]), id="non_string_idx"),
param(pd.DataFrame([[1, 2], [3, 4]], index=["A", ""]), id="empty_string_idx"),
param(
pd.DataFrame([[1, 2], [1, 2], [3, 4]], index=["A", "B", "C"]),
id="duplicate_rows",
),
],
)
def test_invalid_data_custom_parameter(data):
Expand Down

0 comments on commit 63fdaff

Please sign in to comment.