Skip to content

Commit

Permalink
optimize boolean inference (#1713)
Browse files (browse the repository at this point in the history)
* remove generation in inference

* update transform

* remove set generation

* update release notes

* speed up mapping

* reverse change

* speed improvements

* speed up inference
  • Loading branch information
simha104 authored Jul 25, 2023
1 parent 752c715 commit 732cf90
Show file tree
Hide file tree
Showing 5 changed files with 43 additions and 34 deletions.
3 changes: 2 additions & 1 deletion docs/source/release_notes.rst
Original file line number Diff line number Diff line change
Expand Up @@ -6,14 +6,15 @@ Release Notes
Future Release
==============
* Enhancements
* Optimized ``Boolean`` inference by removing generation of mappings and sets of boolean values (:pr:`1713`)
* Speed up Boolean and Integer inference by caching results of corresponding nullable type inference (:pr:`1733`)
* Fixes
* Changes
* Documentation Changes
* Testing Changes

Thanks to the following people for contributing to this release:
:user:`sbadithe`
:user:`sbadithe`, :user:`simha104`

v0.25.1 Jul 18, 2023
====================
Expand Down
27 changes: 18 additions & 9 deletions woodwork/config.py
Original file line number Diff line number Diff line change
Expand Up @@ -54,15 +54,24 @@
"correlation_metrics": ["mutual_info", "pearson", "spearman", "max", "all"],
"medcouple_threshold": 0.3, # Must be between 0.0 and 1.0
"medcouple_sample_size": 10000,
"boolean_inference_strings": [
["yes", "no"],
["y", "n"],
["true", "false"],
["t", "f"],
],
# when adding to boolean_inference_ints, add `0, 1` to the list directly
# rather than making it nested lists
"boolean_inference_ints": [],
"boolean_inference_strings": {
frozenset(["yes", "no"]),
frozenset(["y", "n"]),
frozenset(["true", "false"]),
frozenset(["t", "f"]),
},
"boolean_transform_mappings": {
"yes": True,
"no": False,
"y": True,
"n": False,
"true": True,
"false": False,
"t": True,
"f": False,
},
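# when overriding boolean_inference_ints, pass the values as a flat set, e.g. `{0, 1}`
# (note: the empty default below is written `{}`, which is an empty dict literal, not a set)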
"boolean_inference_ints": {},
}


Expand Down
19 changes: 12 additions & 7 deletions woodwork/logical_types.py
Original file line number Diff line number Diff line change
Expand Up @@ -970,15 +970,20 @@ def _coerce_boolean(series, null_invalid_values=False):

def _transform_boolean(series, null_invalid_values):
boolean_inference_list = config.get_option("boolean_inference_strings").copy()
boolean_inference_list.extend([["1", "0"], ["1.0", "0.0"]])
valid = {}
for booleans in boolean_inference_list:
valid[booleans[0]] = True
valid[booleans[1]] = False
boolean_inference_list.update({frozenset(["1", "0"]), frozenset(["1.0", "0.0"])})
boolean_transform_mappings = config.get_option("boolean_transform_mappings").copy()
boolean_transform_mappings.update(
{
"1": True,
"0": False,
"1.0": True,
"0.0": False,
},
)
if null_invalid_values:
series = series.apply(lambda x: valid.get(x, np.nan))
series = series.apply(lambda x: boolean_transform_mappings.get(x, np.nan))
else:
series = series.apply(lambda x: valid.get(x, x))
series = series.apply(lambda x: boolean_transform_mappings.get(x, x))
return series


Expand Down
2 changes: 1 addition & 1 deletion woodwork/tests/logical_types/test_logical_types.py
Original file line number Diff line number Diff line change
Expand Up @@ -1132,7 +1132,7 @@ def tester_df_ints():
assert df2.values.tolist() == [[bool(i % 2)] for i in range(100)]

if ints_to_config:
with config.with_options(boolean_inference_ints=[0, 1]):
with config.with_options(boolean_inference_ints={0, 1}):
tester_df_ints()
else:
tester_df_ints()
Expand Down
26 changes: 10 additions & 16 deletions woodwork/type_sys/inference_functions.py
Original file line number Diff line number Diff line change
Expand Up @@ -118,35 +118,29 @@ def boolean_func(series, is_boolean_nullable=None):


def boolean_nullable_func(series):
if pdtypes.is_bool_dtype(series.dtype) and not pdtypes.is_categorical_dtype(
series.dtype,
dtype = series.dtype
if pdtypes.is_bool_dtype(dtype) and not pdtypes.is_categorical_dtype(
dtype,
):
return True
elif pdtypes.is_object_dtype(series.dtype):
elif pdtypes.is_object_dtype(dtype):
series_no_null = series.dropna()
try:
series_no_null_unq = set(series_no_null)
if series_no_null_unq in [
{False, True},
{True},
{False},
]:
if series_no_null_unq.issubset({False, True}):
return True
series_lower = set(str(s).lower() for s in set(series_no_null))
if series_lower in [
set(boolean_list)
for boolean_list in config.get_option("boolean_inference_strings")
]:
series_lower = set(str(s).lower() for s in series_no_null_unq)
if series_lower in config.get_option("boolean_inference_strings"):
return True
except (
TypeError
): # object-dtype series can hold unhashable values, for which building the set above raises TypeError
return False
elif pdtypes.is_integer_dtype(series.dtype) and len(
elif pdtypes.is_integer_dtype(dtype) and len(
config.get_option("boolean_inference_ints"),
):
series_unique = set(series)
if series_unique == set(config.get_option("boolean_inference_ints")):
series_unique = set(series.unique())
if series_unique == config.get_option("boolean_inference_ints"):
return True
return False

Expand Down

0 comments on commit 732cf90

Please sign in to comment.