From 732cf90f051cef0d65fa3a3ecfefecf9d07743b2 Mon Sep 17 00:00:00 2001 From: Pranav Simha Date: Tue, 25 Jul 2023 10:00:43 -0700 Subject: [PATCH] optimize boolean inference (#1713) * remove generation in inference * update transform * remove set generation * update release notes * speed up mapping * reverse change * speed improvements * speed up inference --- docs/source/release_notes.rst | 3 ++- woodwork/config.py | 27 ++++++++++++------- woodwork/logical_types.py | 19 ++++++++----- .../tests/logical_types/test_logical_types.py | 2 +- woodwork/type_sys/inference_functions.py | 26 +++++++----------- 5 files changed, 43 insertions(+), 34 deletions(-) diff --git a/docs/source/release_notes.rst b/docs/source/release_notes.rst index 13015f20a..460466898 100644 --- a/docs/source/release_notes.rst +++ b/docs/source/release_notes.rst @@ -6,6 +6,7 @@ Release Notes Future Release ============== * Enhancements + * Optimized ``Boolean`` inference by removing generation of mappings and sets of boolean values (:pr:`1713`) * Speed up Boolean and Integer inference by caching results of corresponding nullable type inference (:pr:`1733`) * Fixes * Changes @@ -13,7 +14,7 @@ Future Release * Testing Changes Thanks to the following people for contributing to this release: - :user:`sbadithe` + :user:`sbadithe`, :user:`simha104` v0.25.1 Jul 18, 2023 ==================== diff --git a/woodwork/config.py b/woodwork/config.py index a43aac24b..e7ad8dccc 100644 --- a/woodwork/config.py +++ b/woodwork/config.py @@ -54,15 +54,24 @@ "correlation_metrics": ["mutual_info", "pearson", "spearman", "max", "all"], "medcouple_threshold": 0.3, # Must be between 0.0 and 1.0 "medcouple_sample_size": 10000, - "boolean_inference_strings": [ - ["yes", "no"], - ["y", "n"], - ["true", "false"], - ["t", "f"], - ], - # when adding to boolean_inference_ints, add `0, 1` to the list directly - # rather than making it nested lists - "boolean_inference_ints": [], + "boolean_inference_strings": { + frozenset(["yes", "no"]), + frozenset(["y", "n"]), + frozenset(["true", "false"]), + frozenset(["t", "f"]), + }, + "boolean_transform_mappings": { + "yes": True, + "no": False, + "y": True, + "n": False, + "true": True, + "false": False, + "t": True, + "f": False, + }, + # when adding to boolean_inference_ints, add `0, 1` to the set directly + "boolean_inference_ints": {}, } diff --git a/woodwork/logical_types.py b/woodwork/logical_types.py index 5b0997468..b833644d7 100644 --- a/woodwork/logical_types.py +++ b/woodwork/logical_types.py @@ -970,15 +970,20 @@ def _coerce_boolean(series, null_invalid_values=False): def _transform_boolean(series, null_invalid_values): boolean_inference_list = config.get_option("boolean_inference_strings").copy() - boolean_inference_list.extend([["1", "0"], ["1.0", "0.0"]]) - valid = {} - for booleans in boolean_inference_list: - valid[booleans[0]] = True - valid[booleans[1]] = False + boolean_inference_list.update({frozenset(["1", "0"]), frozenset(["1.0", "0.0"])}) + boolean_transform_mappings = config.get_option("boolean_transform_mappings").copy() + boolean_transform_mappings.update( + { + "1": True, + "0": False, + "1.0": True, + "0.0": False, + }, + ) if null_invalid_values: - series = series.apply(lambda x: valid.get(x, np.nan)) + series = series.apply(lambda x: boolean_transform_mappings.get(x, np.nan)) else: - series = series.apply(lambda x: valid.get(x, x)) + series = series.apply(lambda x: boolean_transform_mappings.get(x, x)) return series diff --git a/woodwork/tests/logical_types/test_logical_types.py b/woodwork/tests/logical_types/test_logical_types.py index 6140d0ec9..ef6fb58bc 100644 --- a/woodwork/tests/logical_types/test_logical_types.py +++ b/woodwork/tests/logical_types/test_logical_types.py @@ -1132,7 +1132,7 @@ def tester_df_ints(): assert df2.values.tolist() == [[bool(i % 2)] for i in range(100)] if ints_to_config: - with config.with_options(boolean_inference_ints=[0, 1]): + with config.with_options(boolean_inference_ints={0, 1}): tester_df_ints() else: tester_df_ints() diff --git a/woodwork/type_sys/inference_functions.py b/woodwork/type_sys/inference_functions.py index 34c06fb27..c4e482b9a 100644 --- a/woodwork/type_sys/inference_functions.py +++ b/woodwork/type_sys/inference_functions.py @@ -118,35 +118,29 @@ def boolean_func(series, is_boolean_nullable=None): def boolean_nullable_func(series): - if pdtypes.is_bool_dtype(series.dtype) and not pdtypes.is_categorical_dtype( - series.dtype, + dtype = series.dtype + if pdtypes.is_bool_dtype(dtype) and not pdtypes.is_categorical_dtype( + dtype, ): return True - elif pdtypes.is_object_dtype(series.dtype): + elif pdtypes.is_object_dtype(dtype): series_no_null = series.dropna() try: series_no_null_unq = set(series_no_null) - if series_no_null_unq in [ - {False, True}, - {True}, - {False}, - ]: + if series_no_null_unq.issubset({False, True}): return True - series_lower = set(str(s).lower() for s in set(series_no_null)) - if series_lower in [ - set(boolean_list) - for boolean_list in config.get_option("boolean_inference_strings") - ]: + series_lower = set(str(s).lower() for s in series_no_null_unq) + if series_lower in config.get_option("boolean_inference_strings"): return True except ( TypeError ): # Necessary to check for non-hashable values because of object dtype consideration return False - elif pdtypes.is_integer_dtype(series.dtype) and len( + elif pdtypes.is_integer_dtype(dtype) and len( config.get_option("boolean_inference_ints"), ): - series_unique = set(series) - if series_unique == set(config.get_option("boolean_inference_ints")): + series_unique = set(series.unique()) + if series_unique == config.get_option("boolean_inference_ints"): return True return False