optimize boolean inference (#1713)

* remove generation in inference
* update transform
* remove set generation
* update release notes
* speed up mapping
* reverse change
* speed improvements
* speed up inference
alteryx · Jul 25, 2023 · 732cf90 · 732cf90
1 parent 752c715
commit 732cf90
Show file tree

Hide file tree

Showing 5 changed files with 43 additions and 34 deletions.
diff --git a/docs/source/release_notes.rst b/docs/source/release_notes.rst
@@ -6,14 +6,15 @@ Release Notes
 Future Release
 ==============
     * Enhancements
+      * Optimized ``Boolean`` inference by removing generation of mappings and sets of boolean values (:pr:`1713`)
       * Speed up Boolean and Integer inference by caching results of corresponding nullable type inference (:pr:`1733`)
     * Fixes
     * Changes
     * Documentation Changes
     * Testing Changes
 
   Thanks to the following people for contributing to this release:
-  :user:`sbadithe`
+  :user:`sbadithe`, :user:`simha104`
 
 v0.25.1 Jul 18, 2023
 ====================

diff --git a/woodwork/config.py b/woodwork/config.py
@@ -54,15 +54,24 @@
     "correlation_metrics": ["mutual_info", "pearson", "spearman", "max", "all"],
     "medcouple_threshold": 0.3,  # Must be between 0.0 and 1.0
     "medcouple_sample_size": 10000,
-    "boolean_inference_strings": [
-        ["yes", "no"],
-        ["y", "n"],
-        ["true", "false"],
-        ["t", "f"],
-    ],
-    # when adding to boolean_inference_ints, add `0, 1` to the list directly
-    # rather than making it nested lists
-    "boolean_inference_ints": [],
+    "boolean_inference_strings": {
+        frozenset(["yes", "no"]),
+        frozenset(["y", "n"]),
+        frozenset(["true", "false"]),
+        frozenset(["t", "f"]),
+    },
+    "boolean_transform_mappings": {
+        "yes": True,
+        "no": False,
+        "y": True,
+        "n": False,
+        "true": True,
+        "false": False,
+        "t": True,
+        "f": False,
+    },
+    # when adding to boolean_inference_ints, add `0, 1` to the set directly
+    "boolean_inference_ints": {},
 }
 
 

diff --git a/woodwork/logical_types.py b/woodwork/logical_types.py
@@ -970,15 +970,20 @@ def _coerce_boolean(series, null_invalid_values=False):
 
 def _transform_boolean(series, null_invalid_values):
     boolean_inference_list = config.get_option("boolean_inference_strings").copy()
-    boolean_inference_list.extend([["1", "0"], ["1.0", "0.0"]])
-    valid = {}
-    for booleans in boolean_inference_list:
-        valid[booleans[0]] = True
-        valid[booleans[1]] = False
+    boolean_inference_list.update({frozenset(["1", "0"]), frozenset(["1.0", "0.0"])})
+    boolean_transform_mappings = config.get_option("boolean_transform_mappings").copy()
+    boolean_transform_mappings.update(
+        {
+            "1": True,
+            "0": False,
+            "1.0": True,
+            "0.0": False,
+        },
+    )
     if null_invalid_values:
-        series = series.apply(lambda x: valid.get(x, np.nan))
+        series = series.apply(lambda x: boolean_transform_mappings.get(x, np.nan))
     else:
-        series = series.apply(lambda x: valid.get(x, x))
+        series = series.apply(lambda x: boolean_transform_mappings.get(x, x))
     return series
 
 

diff --git a/woodwork/tests/logical_types/test_logical_types.py b/woodwork/tests/logical_types/test_logical_types.py
@@ -1132,7 +1132,7 @@ def tester_df_ints():
         assert df2.values.tolist() == [[bool(i % 2)] for i in range(100)]
 
     if ints_to_config:
-        with config.with_options(boolean_inference_ints=[0, 1]):
+        with config.with_options(boolean_inference_ints={0, 1}):
             tester_df_ints()
     else:
         tester_df_ints()

diff --git a/woodwork/type_sys/inference_functions.py b/woodwork/type_sys/inference_functions.py
@@ -118,35 +118,29 @@ def boolean_func(series, is_boolean_nullable=None):
 
 
 def boolean_nullable_func(series):
-    if pdtypes.is_bool_dtype(series.dtype) and not pdtypes.is_categorical_dtype(
-        series.dtype,
+    dtype = series.dtype
+    if pdtypes.is_bool_dtype(dtype) and not pdtypes.is_categorical_dtype(
+        dtype,
     ):
         return True
-    elif pdtypes.is_object_dtype(series.dtype):
+    elif pdtypes.is_object_dtype(dtype):
         series_no_null = series.dropna()
         try:
             series_no_null_unq = set(series_no_null)
-            if series_no_null_unq in [
-                {False, True},
-                {True},
-                {False},
-            ]:
+            if series_no_null_unq.issubset({False, True}):
                 return True
-            series_lower = set(str(s).lower() for s in set(series_no_null))
-            if series_lower in [
-                set(boolean_list)
-                for boolean_list in config.get_option("boolean_inference_strings")
-            ]:
+            series_lower = set(str(s).lower() for s in series_no_null_unq)
+            if series_lower in config.get_option("boolean_inference_strings"):
                 return True
         except (
             TypeError
         ):  # Necessary to check for non-hashable values because of object dtype consideration
             return False
-    elif pdtypes.is_integer_dtype(series.dtype) and len(
+    elif pdtypes.is_integer_dtype(dtype) and len(
         config.get_option("boolean_inference_ints"),
     ):
-        series_unique = set(series)
-        if series_unique == set(config.get_option("boolean_inference_ints")):
+        series_unique = set(series.unique())
+        if series_unique == config.get_option("boolean_inference_ints"):
             return True
     return False