From 4ec2bd6f2f4f7f7b82aff50bf2685da972efad79 Mon Sep 17 00:00:00 2001 From: Liz Smith Date: Tue, 13 Jun 2023 15:50:09 -0500 Subject: [PATCH 01/15] New preset implementation and test (#867) * memory optimization preset ttrying again ttrying again 3 ttrying again 4 accidentally pushed my updated makefile * Wrote catch for invalid presets, wrote test for catch for invalid presets, debugged new optimization preset * Forgot to run pre-commit, fixed those issues * black doing weird things * made preset validation more maintainable by moving it to the constructor and getting rid of preset list --- dataprofiler/profilers/profiler_options.py | 29 +++++++++++++++++-- .../profiler_options/test_profiler_presets.py | 23 +++++++++++++++ 2 files changed, 50 insertions(+), 2 deletions(-) diff --git a/dataprofiler/profilers/profiler_options.py b/dataprofiler/profilers/profiler_options.py index 76b3654e3..711c5f13d 100644 --- a/dataprofiler/profilers/profiler_options.py +++ b/dataprofiler/profilers/profiler_options.py @@ -6,6 +6,7 @@ import copy import re import warnings +from typing import Any from ..labelers.base_data_labeler import BaseDataLabeler @@ -1557,7 +1558,8 @@ def __init__(self, presets: str = None) -> None: :ivar unstructured_options: option set for unstructured dataset profiling. :vartype unstructured_options: UnstructuredOptions :ivar presets: A pre-configured mapping of a string name to group of options: - "complete", "data_types", and "numeric_stats_disabled". Default: None + "complete", "data_types", "numeric_stats_disabled", + and "memory_optimization". Default: None :vartype presets: Optional[str] """ self.structured_options = StructuredOptions() @@ -1570,6 +1572,10 @@ def __init__(self, presets: str = None) -> None: self._data_types_presets() elif self.presets == "numeric_stats_disabled": self._numeric_stats_disabled_presets() + elif self.presets == "memory_optimization": + self._memory_optimization_presets() + else: + raise ValueError("The preset entered is not a valid preset.") def _complete_presets(self) -> None: self.set({"*.is_enabled": True}) @@ -1583,6 +1589,25 @@ def _numeric_stats_disabled_presets(self) -> None: self.set({"*.float.is_numeric_stats_enabled": False}) self.set({"structured_options.text.is_numeric_stats_enabled": False}) + def _memory_optimization_presets(self) -> None: + self.set({"structured_options.row_statistics.is_enabled": False}) + self.set({"structured_options.multiprocess.is_enabled": False}) + self.set({"structured_options.data_labeler.is_enabled": False}) + self.set({"structured_options.datetime.is_enabled": False}) + self.set({"structured_options.order.is_enabled": False}) + self.set({"structured_options.chi2_homogeneity.is_enabled": False}) + self.set({"structured_options.null_replication_metrics.is_enabled": False}) + self.set({"unstructured_options.data_labeler.is_enabled": False}) + self.set( + { + ( + "structured_options.category." + "max_sample_size_to_check_stop_condition" + ): 5000 + } + ) + self.set({"structured_options.category.stop_condition_unique_value_ratio": 0.5}) + def _validate_helper(self, variable_path: str = "ProfilerOptions") -> list[str]: """ Validate the options do not conflict and cause errors. @@ -1620,7 +1645,7 @@ def _validate_helper(self, variable_path: str = "ProfilerOptions") -> list[str]: return errors - def set(self, options: dict[str, bool]) -> None: + def set(self, options: dict[str, Any]) -> None: """ Overwrite BaseOption.set. diff --git a/dataprofiler/tests/profilers/profiler_options/test_profiler_presets.py b/dataprofiler/tests/profilers/profiler_options/test_profiler_presets.py index 8f8bc0d6b..b1daa0bd3 100644 --- a/dataprofiler/tests/profilers/profiler_options/test_profiler_presets.py +++ b/dataprofiler/tests/profilers/profiler_options/test_profiler_presets.py @@ -33,3 +33,26 @@ def test_profiler_preset_numeric_stats_disabled(self, *mocks): self.assertFalse(options.structured_options.null_replication_metrics.is_enabled) self.assertTrue(options.structured_options.category.is_enabled) self.assertTrue(options.structured_options.order.is_enabled) + + def test_profiler_preset_memory_optimization(self, *mocks): + options = ProfilerOptions(presets="memory_optimization") + self.assertFalse(options.structured_options.row_statistics.is_enabled) + self.assertFalse(options.structured_options.multiprocess.is_enabled) + self.assertFalse(options.structured_options.data_labeler.is_enabled) + self.assertFalse(options.structured_options.datetime.is_enabled) + self.assertFalse(options.structured_options.order.is_enabled) + self.assertFalse(options.structured_options.chi2_homogeneity.is_enabled) + self.assertFalse(options.structured_options.null_replication_metrics.is_enabled) + self.assertFalse(options.unstructured_options.data_labeler.is_enabled) + self.assertEqual( + options.structured_options.category.max_sample_size_to_check_stop_condition, + 5000, + ) + self.assertEqual( + options.structured_options.category.stop_condition_unique_value_ratio, 0.5 + ) + + def test_profiler_preset_failure(self, *mocks): + expected_error = "The preset entered is not a valid preset." + with self.assertRaisesRegex(ValueError, expected_error): + ProfilerOptions(presets="failing_preset") From 0023ed5823f288d7df4ddb257ee40d190cc5f44c Mon Sep 17 00:00:00 2001 From: Richard Bann <87214439+drahc1R@users.noreply.github.com> Date: Wed, 14 Jun 2023 13:09:32 -0400 Subject: [PATCH 02/15] RowStatisticsOptions: Add option (#865) * RowStatisticsOptions: Add null row count Added null_row_count as an option in RowStatisticsOptions. It toggles the functionality for row_has_null_ratio and row_is_null_ratio in _update_row_statistics. * Unit test for RowStatisticOptions: * Black formatting * RowStatisticsOptions: Add null row count Added null_row_count as an option in RowStatisticsOptions. It toggles the functionality for row_has_null_ratio and row_is_null_ratio in _update_row_statistics. * Unit test for RowStatisticOptions: * Black formatting * added a unit test for RowStatisticsOptions * Deleted test cases that were written in the wrong file * updated testing for null_count toggle in _update_row_statistics * removed the RowStatisticsOptions from test_profiler_options imports * add line * Created toggle option for null_count * RowStatisticsOptions: Add implementation * Revert "RowStatisticsOptions: Add implementation" This reverts commit 2da6a938d78690203d6ced4d1d16edb73138494e. * RowStatsticsOptions: Create option * fixed pre-commit error * Update dataprofiler/profilers/profiler_options.py Co-authored-by: Taylor Turner * Update dataprofiler/profilers/profiler_options.py Co-authored-by: Taylor Turner * fixed documentation --------- Co-authored-by: Taylor Turner --- dataprofiler/profilers/profiler_options.py | 19 +++++++++++++++++-- .../test_row_statistics_options.py | 2 +- 2 files changed, 18 insertions(+), 3 deletions(-) diff --git a/dataprofiler/profilers/profiler_options.py b/dataprofiler/profilers/profiler_options.py index 711c5f13d..467649aa3 100644 --- a/dataprofiler/profilers/profiler_options.py +++ b/dataprofiler/profilers/profiler_options.py @@ -1071,15 +1071,25 @@ def _validate_helper(self, variable_path: str = "UniqueCountOptions") -> list[st class RowStatisticsOptions(BooleanOption): """For configuring options for row statistics.""" - def __init__(self, is_enabled: bool = True, unique_count: bool = True) -> None: + def __init__( + self, + is_enabled: bool = True, + unique_count: bool = True, + null_count: bool = True, + ) -> None: """ Initialize options for row statistics. :ivar is_enabled: boolean option to enable/disable. :vartype is_enabled: bool + :ivar unique_count: boolean option to enable/disable unique_count + :vartype unique_count: bool + ivar null_count: boolean option to enable/disable null_count + :vartype null_count: bool """ BooleanOption.__init__(self, is_enabled=is_enabled) self.unique_count = UniqueCountOptions(is_enabled=unique_count) + self.null_count = BooleanOption(is_enabled=null_count) def _validate_helper( self, variable_path: str = "RowStatisticsOptions" @@ -1095,9 +1105,14 @@ def _validate_helper( errors = super()._validate_helper(variable_path=variable_path) if not isinstance(self.unique_count, UniqueCountOptions): errors.append( - f"{variable_path}.full_hashing must be an UniqueCountOptions." + f"{variable_path}.unique_count must be an UniqueCountOptions." ) + + if not isinstance(self.null_count, BooleanOption): + errors.append(f"{variable_path}.null_count must be an BooleanOption.") + errors += self.unique_count._validate_helper(variable_path + ".unique_counts") + errors += self.null_count._validate_helper(variable_path + ".null_count") return super()._validate_helper(variable_path) diff --git a/dataprofiler/tests/profilers/profiler_options/test_row_statistics_options.py b/dataprofiler/tests/profilers/profiler_options/test_row_statistics_options.py index b32ef4ee8..687888b1e 100644 --- a/dataprofiler/tests/profilers/profiler_options/test_row_statistics_options.py +++ b/dataprofiler/tests/profilers/profiler_options/test_row_statistics_options.py @@ -7,7 +7,7 @@ class TestRowStatisticsOptions(TestBooleanOption): option_class = RowStatisticsOptions - keys = ["unique_count"] + keys = ["unique_count", "null_count"] def get_options(self, **params): options = RowStatisticsOptions() From 7d9d995d6d6c35dec2ddbbfb4722aff6cf2d8144 Mon Sep 17 00:00:00 2001 From: Liz Smith Date: Thu, 15 Jun 2023 14:51:20 -0500 Subject: [PATCH 03/15] Preset test updated w new names and different toggles (#880) * memory optimization preset ttrying again ttrying again 3 ttrying again 4 accidentally pushed my updated makefile * trying * trying * black doing weird things * trying * made preset validation more maintainable by moving it to the constructor and getting rid of preset list * Update to open-source in prep for wrapper changes for mem op preset * updated preset toggles and preset name (mem op -> large data) * updated tests to match * continued name and test and toggle updates * fix comments --- dataprofiler/profilers/profiler_options.py | 17 +++++------------ .../profiler_options/test_profiler_presets.py | 15 +++++---------- 2 files changed, 10 insertions(+), 22 deletions(-) diff --git a/dataprofiler/profilers/profiler_options.py b/dataprofiler/profilers/profiler_options.py index 467649aa3..2b1f7b06a 100644 --- a/dataprofiler/profilers/profiler_options.py +++ b/dataprofiler/profilers/profiler_options.py @@ -1574,7 +1574,7 @@ def __init__(self, presets: str = None) -> None: :vartype unstructured_options: UnstructuredOptions :ivar presets: A pre-configured mapping of a string name to group of options: "complete", "data_types", "numeric_stats_disabled", - and "memory_optimization". Default: None + and "lower_memory_sketching". Default: None :vartype presets: Optional[str] """ self.structured_options = StructuredOptions() @@ -1587,8 +1587,8 @@ def __init__(self, presets: str = None) -> None: self._data_types_presets() elif self.presets == "numeric_stats_disabled": self._numeric_stats_disabled_presets() - elif self.presets == "memory_optimization": - self._memory_optimization_presets() + elif self.presets == "lower_memory_sketching": + self._lower_memory_sketching_presets() else: raise ValueError("The preset entered is not a valid preset.") @@ -1604,15 +1604,8 @@ def _numeric_stats_disabled_presets(self) -> None: self.set({"*.float.is_numeric_stats_enabled": False}) self.set({"structured_options.text.is_numeric_stats_enabled": False}) - def _memory_optimization_presets(self) -> None: - self.set({"structured_options.row_statistics.is_enabled": False}) - self.set({"structured_options.multiprocess.is_enabled": False}) - self.set({"structured_options.data_labeler.is_enabled": False}) - self.set({"structured_options.datetime.is_enabled": False}) - self.set({"structured_options.order.is_enabled": False}) - self.set({"structured_options.chi2_homogeneity.is_enabled": False}) - self.set({"structured_options.null_replication_metrics.is_enabled": False}) - self.set({"unstructured_options.data_labeler.is_enabled": False}) + def _lower_memory_sketching_presets(self) -> None: + self.set({"row_statistics.unique_count.hashing_method": "hll"}) self.set( { ( diff --git a/dataprofiler/tests/profilers/profiler_options/test_profiler_presets.py b/dataprofiler/tests/profilers/profiler_options/test_profiler_presets.py index b1daa0bd3..545d9259c 100644 --- a/dataprofiler/tests/profilers/profiler_options/test_profiler_presets.py +++ b/dataprofiler/tests/profilers/profiler_options/test_profiler_presets.py @@ -34,16 +34,11 @@ def test_profiler_preset_numeric_stats_disabled(self, *mocks): self.assertTrue(options.structured_options.category.is_enabled) self.assertTrue(options.structured_options.order.is_enabled) - def test_profiler_preset_memory_optimization(self, *mocks): - options = ProfilerOptions(presets="memory_optimization") - self.assertFalse(options.structured_options.row_statistics.is_enabled) - self.assertFalse(options.structured_options.multiprocess.is_enabled) - self.assertFalse(options.structured_options.data_labeler.is_enabled) - self.assertFalse(options.structured_options.datetime.is_enabled) - self.assertFalse(options.structured_options.order.is_enabled) - self.assertFalse(options.structured_options.chi2_homogeneity.is_enabled) - self.assertFalse(options.structured_options.null_replication_metrics.is_enabled) - self.assertFalse(options.unstructured_options.data_labeler.is_enabled) + def test_profiler_preset_lower_memory_sketching(self, *mocks): + options = ProfilerOptions(presets="lower_memory_sketching") + self.assertEqual( + options.structured_options.row_statistics.unique_count.hashing_method, "hll" + ) self.assertEqual( options.structured_options.category.max_sample_size_to_check_stop_condition, 5000, From 661a8d060b1ccfa5fdd0f9931eda317998516c18 Mon Sep 17 00:00:00 2001 From: Richard Bann <87214439+drahc1R@users.noreply.github.com> Date: Thu, 15 Jun 2023 16:54:05 -0400 Subject: [PATCH 04/15] RowStatisticsOptions: Implementing option (#871) * Implementing option * Implementing option * took out redundant if statement. added test case for when null_count is disabled. * attempt to check for conflicts between profile merges * added test to check if two profilers have null_count enabled before merging them together * fixed typo and added a trycatch to prevent failing test * No mocks needed. Fixed assertRaisesRegex error * Changed variables names and added a new test to check for check the null_count when null_count is disabled. * Changed name of test, moved tests to TestStructuredProfilerRowStatistics. Fixed position of if statement to prevent unnecessary code from running. * added null_count test cases * fixed indentation mistake * fixed typo * removed a useless commented a line * Updated test name --- dataprofiler/profilers/profile_builder.py | 100 +++++++++++------- .../tests/profilers/test_profile_builder.py | 69 +++++++++++- 2 files changed, 125 insertions(+), 44 deletions(-) diff --git a/dataprofiler/profilers/profile_builder.py b/dataprofiler/profilers/profile_builder.py index d64eaa175..f3757256b 100644 --- a/dataprofiler/profilers/profile_builder.py +++ b/dataprofiler/profilers/profile_builder.py @@ -1582,6 +1582,15 @@ def _add_error_checks( # type: ignore[override] "Attempting to merge two profiles with unique row " "count option enabled on one profile but not the other." ) + # Check null_count options + if ( + self.options.row_statistics.null_count.is_enabled + != other.options.row_statistics.null_count.is_enabled + ): + raise ValueError( + "Attempting to merge two profiles with null row " + "count option enabled on one profile but not the other." + ) # Check hashing_method options if ( self.options.row_statistics.unique_count.hashing_method @@ -1967,7 +1976,10 @@ def _get_unique_row_ratio(self) -> float | None: def _get_row_is_null_ratio(self) -> float | None: """Return whether row is null ratio.""" - if not self.options.row_statistics.is_enabled: + if ( + not self.options.row_statistics.is_enabled + or not self.options.row_statistics.null_count.is_enabled + ): return None if self._min_col_samples_used: @@ -1976,7 +1988,10 @@ def _get_row_is_null_ratio(self) -> float | None: def _get_row_has_null_ratio(self) -> float | None: """Return whether row has null ratio.""" - if not self.options.row_statistics.is_enabled: + if ( + not self.options.row_statistics.is_enabled + or not self.options.row_statistics.null_count.is_enabled + ): return None if self._min_col_samples_used: @@ -2051,48 +2066,51 @@ def _update_row_statistics( self.hashed_row_object.add(record) # Calculate Null Column Count - null_rows = set() - null_in_row_count = set() - first_col_flag = True - for column in self._profile: - null_type_dict = column.null_types_index - null_row_indices = set() - if null_type_dict: - null_row_indices = set.union(*null_type_dict.values()) - - # If sample ids provided, only consider nulls in rows that - # were fully sampled - if sample_ids is not None: - # This is the amount (integer) indices were shifted by in the - # event of overlap - shift = column._index_shift - if shift is None: - # Shift is None if index is str or if no overlap detected - null_row_indices = null_row_indices.intersection( - data.index[sample_ids[: self._min_sampled_from_batch]] - ) + if self.options.row_statistics.null_count.is_enabled: + null_rows = set() + null_in_row_count = set() + first_col_flag = True + for column in self._profile: + null_type_dict = column.null_types_index + null_row_indices = set() + if null_type_dict: + null_row_indices = set.union(*null_type_dict.values()) + + # If sample ids provided, only consider nulls in rows that + # were fully sampled + if sample_ids is not None: + # This is the amount (integer) indices were shifted by in the + # event of overlap + shift = column._index_shift + if shift is None: + # Shift is None if index is str or if no overlap detected + null_row_indices = null_row_indices.intersection( + data.index[sample_ids[: self._min_sampled_from_batch]] + ) + else: + # Only shift if index shift detected (must be ints) + null_row_indices = null_row_indices.intersection( + data.index[sample_ids[: self._min_sampled_from_batch]] + + shift + ) + + # Find the common null indices between the columns + if first_col_flag: + null_rows = null_row_indices + null_in_row_count = null_row_indices + first_col_flag = False else: - # Only shift if index shift detected (must be ints) - null_row_indices = null_row_indices.intersection( - data.index[sample_ids[: self._min_sampled_from_batch]] + shift - ) + null_rows = null_rows.intersection(null_row_indices) + null_in_row_count = null_in_row_count.union(null_row_indices) - # Find the common null indices between the columns - if first_col_flag: - null_rows = null_row_indices - null_in_row_count = null_row_indices - first_col_flag = False + # If sample_ids provided, + # increment since that means only new data read + if sample_ids is not None: + self.row_has_null_count += len(null_in_row_count) + self.row_is_null_count += len(null_rows) else: - null_rows = null_rows.intersection(null_row_indices) - null_in_row_count = null_in_row_count.union(null_row_indices) - - # If sample_ids provided, increment since that means only new data read - if sample_ids is not None: - self.row_has_null_count += len(null_in_row_count) - self.row_is_null_count += len(null_rows) - else: - self.row_has_null_count = len(null_in_row_count) - self.row_is_null_count = len(null_rows) + self.row_has_null_count = len(null_in_row_count) + self.row_is_null_count = len(null_rows) def _get_correlation( self, clean_samples: dict, batch_properties: dict diff --git a/dataprofiler/tests/profilers/test_profile_builder.py b/dataprofiler/tests/profilers/test_profile_builder.py index 3a5505192..d7dbd2a10 100644 --- a/dataprofiler/tests/profilers/test_profile_builder.py +++ b/dataprofiler/tests/profilers/test_profile_builder.py @@ -233,6 +233,8 @@ def test_add_profilers(self, *mocks): self.assertEqual( "", merged_profile.file_type ) + self.assertTrue(merged_profile.options.row_statistics.null_count.is_enabled) + self.assertTrue(merged_profile.options.row_statistics.unique_count.is_enabled) self.assertEqual(2, merged_profile.row_has_null_count) self.assertEqual(2, merged_profile.row_is_null_count) self.assertEqual(7, merged_profile.total_samples) @@ -3555,6 +3557,57 @@ def setUpClass(cls): cls.data, len(cls.data), options=profiler_options_full ) + def test_adding_profiles_of_mismatched_null_count_options(self): + profiler_options_null_count = ProfilerOptions() + profiler_options_null_count.set( + { + "*.is_enabled": False, + "row_statistics.*.is_enabled": True, + "row_statistics.null_count.is_enabled": True, + } + ) + profiler_options_null_disabled = ProfilerOptions() + profiler_options_null_disabled.set( + { + "*.is_enabled": False, + "row_statistics.*.is_enabled": True, + "row_statistics.null_count.is_enabled": False, + } + ) + data = pd.DataFrame([1, None, 3, 4, 5, None, 1]) + with test_utils.mock_timeit(): + profiler_w_null_count = dp.StructuredProfiler( + data[:2], options=profiler_options_null_count + ) + profiler_w_disabled_null_count = dp.StructuredProfiler( + data[2:], options=profiler_options_null_disabled + ) + + with self.assertRaisesRegex( + ValueError, + "Attempting to merge two profiles with null row " + "count option enabled on one profile but not the other.", + ): + profiler_w_null_count + profiler_w_disabled_null_count + + def test_profile_null_count_not_enabled(self): + profiler_options_null_disabled = ProfilerOptions() + profiler_options_null_disabled.set( + { + "*.is_enabled": False, + "row_statistics.*.is_enabled": True, + "row_statistics.null_count.is_enabled": False, + } + ) + data = pd.DataFrame([1, None, 3, 4, 5, None, 1]) + with test_utils.mock_timeit(): + profiler_w_disabled_null_count = dp.StructuredProfiler( + data[2:], options=profiler_options_null_disabled + ) + + self.assertEqual(0, profiler_w_disabled_null_count.row_has_null_count) + self.assertEqual(0, profiler_w_disabled_null_count.row_is_null_count) + def test_correct_rows_ingested(self): test_dict = { "1": ["nan", "null", None, None, ""], @@ -3618,9 +3671,7 @@ def test_correct_null_row_counts(self): def test_row_is_null_ratio_row_stats_disabled(self): profiler_options_1 = ProfilerOptions() profiler_options_1.set( - { - "*.is_enabled": False, - } + {"*.is_enabled": False, "row_statistics.null_count.is_enabled": False} ) profiler = StructuredProfiler(pd.DataFrame([]), options=profiler_options_1) self.assertIsNone(profiler._get_row_is_null_ratio()) @@ -4085,6 +4136,18 @@ def test_unique_row_ratio_empty_profiler(self): profiler = StructuredProfiler(pd.DataFrame([]), options=profiler_options) self.assertEqual(0, profiler._get_unique_row_ratio()) + def test_null_count_empty_profiler(self): + profiler_options = ProfilerOptions() + profiler_options.set( + { + "*.is_enabled": False, + "row_statistics.null_count.is_enabled": False, + } + ) + profiler = StructuredProfiler(pd.DataFrame([]), options=profiler_options) + self.assertIsNone(profiler._get_row_is_null_ratio()) + self.assertIsNone(profiler._get_row_has_null_ratio()) + def test_correct_duplicate_row_count_full_row_hashing(self): self.assertEqual(15, len(self.trained_schema_full.hashed_row_object)) self.assertEqual(20, self.trained_schema_full.total_samples) From 4d2b6add147f8f07e97d5a3aaa7610aad8b948b3 Mon Sep 17 00:00:00 2001 From: clee1152 Date: Tue, 27 Jun 2023 09:50:33 -0400 Subject: [PATCH 05/15] Implemented options for `num_quantiles` feature in `NumericStatsMixin` --- dataprofiler/profilers/numerical_column_stats.py | 6 ++++-- dataprofiler/profilers/profiler_options.py | 14 ++++++++++++++ 2 files changed, 18 insertions(+), 2 deletions(-) diff --git a/dataprofiler/profilers/numerical_column_stats.py b/dataprofiler/profilers/numerical_column_stats.py index ae058464e..abb29c844 100644 --- a/dataprofiler/profilers/numerical_column_stats.py +++ b/dataprofiler/profilers/numerical_column_stats.py @@ -82,9 +82,11 @@ def __init__(self, options: NumericalOptions = None) -> None: self._mode_is_enabled: bool = True self.num_zeros: int = 0 self.num_negatives: int = 0 + self.num_quantiles: int = 1000 # By default, we use 1000 quantiles if options: self.bias_correction = options.bias_correction.is_enabled self._top_k_modes = options.mode.top_k_modes + self.num_quantiles = options.num_quantiles self._median_is_enabled = options.median.is_enabled self._median_abs_dev_is_enabled = options.median_abs_deviation.is_enabled self._mode_is_enabled = options.mode.is_enabled @@ -111,9 +113,8 @@ def __init__(self, options: NumericalOptions = None) -> None: "suggested_bin_count": self.min_histogram_bin, "histogram": {"bin_counts": None, "bin_edges": None}, } - num_quantiles: int = 1000 # TODO: add to options self.quantiles: list[float] | dict = { - bin_num: None for bin_num in range(num_quantiles - 1) + bin_num: None for bin_num in range(self.num_quantiles - 1) } self.__calculations = { "min": NumericStatsMixin._get_min, @@ -383,6 +384,7 @@ def profile(self) -> dict: skewness=self.np_type_to_type(self.skewness), kurtosis=self.np_type_to_type(self.kurtosis), histogram=self._get_best_histogram_for_profile(), + num_quantiles=self.num_quantiles, quantiles=self.quantiles, median_abs_deviation=self.np_type_to_type(self.median_abs_deviation), num_zeros=self.np_type_to_type(self.num_zeros), diff --git a/dataprofiler/profilers/profiler_options.py b/dataprofiler/profilers/profiler_options.py index 2b1f7b06a..cecbf2484 100644 --- a/dataprofiler/profilers/profiler_options.py +++ b/dataprofiler/profilers/profiler_options.py @@ -352,6 +352,8 @@ def __init__(self) -> None: :vartype num_zeros: BooleanOption :ivar num_negatives: boolean option to enable/disable num_negatives :vartype num_negatives: BooleanOption + :ivar num_quantiles: boolean option to enable/disable num_quantiles + :vartype num_quantiles: BooleanOption :ivar is_numeric_stats_enabled: boolean to enable/disable all numeric stats :vartype is_numeric_stats_enabled: bool @@ -367,6 +369,7 @@ def __init__(self) -> None: self.median_abs_deviation = BooleanOption(is_enabled=True) self.num_zeros = BooleanOption(is_enabled=True) self.num_negatives = BooleanOption(is_enabled=True) + self.num_quantiles = BooleanOption(is_enabled=True) self.histogram_and_quantiles = HistogramOption() # By default, we correct for bias self.bias_correction = BooleanOption(is_enabled=True) @@ -396,6 +399,7 @@ def is_numeric_stats_enabled(self) -> bool: or self.histogram_and_quantiles.is_enabled or self.num_zeros.is_enabled or self.num_negatives.is_enabled + or self.num_quantiles.is_enabled ): return True return False @@ -425,6 +429,7 @@ def is_numeric_stats_enabled(self, value: bool) -> None: self.num_zeros.is_enabled = value self.num_negatives.is_enabled = value self.histogram_and_quantiles.is_enabled = value + self.num_quantiles.is_enabled = value @property def properties(self) -> dict[str, BooleanOption]: @@ -464,6 +469,7 @@ def _validate_helper(self, variable_path: str = "NumericalOptions") -> list[str] "bias_correction", "num_zeros", "num_negatives", + "num_quantiles", ]: if not isinstance(self.properties[item], BooleanOption): errors.append(f"{variable_path}.{item} must be a BooleanOption.") @@ -569,6 +575,8 @@ def __init__(self) -> None: :vartype num_zeros: BooleanOption :ivar num_negatives: boolean option to enable/disable num_negatives :vartype num_negatives: BooleanOption + :ivar num_quantiles: boolean option to enable/disable num_quantiles + :vartype num_quantiles: BooleanOption :ivar is_numeric_stats_enabled: boolean to enable/disable all numeric stats :vartype is_numeric_stats_enabled: bool @@ -666,6 +674,8 @@ def __init__(self) -> None: :vartype num_zeros: BooleanOption :ivar num_negatives: boolean option to enable/disable num_negatives :vartype num_negatives: BooleanOption + :ivar num_quantiles: boolean option to enable/disable num_quantiles + :vartype num_quantiles: BooleanOption :ivar is_numeric_stats_enabled: boolean to enable/disable all numeric stats :vartype is_numeric_stats_enabled: bool @@ -725,6 +735,8 @@ def __init__(self) -> None: :vartype num_zeros: BooleanOption :ivar num_negatives: boolean option to enable/disable num_negatives :vartype num_negatives: BooleanOption + :ivar num_quantiles: boolean option to enable/disable num_quantiles + :vartype num_quantiles: BooleanOption :ivar is_numeric_stats_enabled: boolean to enable/disable all numeric stats :vartype is_numeric_stats_enabled: bool @@ -788,6 +800,7 @@ def is_numeric_stats_enabled(self) -> bool: or self.median.is_enabled or self.median_abs_deviation.is_enabled or self.histogram_and_quantiles.is_enabled + or self.num_quantiles.is_enabled ): return True return False @@ -814,6 +827,7 @@ def is_numeric_stats_enabled(self, value: bool) -> None: self.kurtosis.is_enabled = value self.median_abs_deviation.is_enabled = value self.histogram_and_quantiles.is_enabled = value + self.num_quantiles.is_enabled = value class DateTimeOptions(BaseInspectorOptions): From 47903d227b3789985139ffcdfbdba8a6ccdb31b4 Mon Sep 17 00:00:00 2001 From: clee1152 Date: Tue, 27 Jun 2023 09:56:44 -0400 Subject: [PATCH 06/15] Implemented options for `num_quantiles.py`. --- .../profilers/numerical_column_stats.py | 4 +- dataprofiler/profilers/profiler_options.py | 50 +++++++++++++++++-- 2 files changed, 47 insertions(+), 7 deletions(-) diff --git a/dataprofiler/profilers/numerical_column_stats.py b/dataprofiler/profilers/numerical_column_stats.py index abb29c844..627dc1fb0 100644 --- a/dataprofiler/profilers/numerical_column_stats.py +++ b/dataprofiler/profilers/numerical_column_stats.py @@ -82,11 +82,11 @@ def __init__(self, options: NumericalOptions = None) -> None: self._mode_is_enabled: bool = True self.num_zeros: int = 0 self.num_negatives: int = 0 - self.num_quantiles: int = 1000 # By default, we use 1000 quantiles + self.num_quantiles: int = 1000 # By default, we use 1000 quantiles if options: self.bias_correction = options.bias_correction.is_enabled self._top_k_modes = options.mode.top_k_modes - self.num_quantiles = options.num_quantiles + self.num_quantiles = options.num_quantiles.num_quantiles self._median_is_enabled = options.median.is_enabled self._median_abs_dev_is_enabled = options.median_abs_deviation.is_enabled self._mode_is_enabled = options.mode.is_enabled diff --git a/dataprofiler/profilers/profiler_options.py b/dataprofiler/profilers/profiler_options.py index cecbf2484..09d9f6736 100644 --- a/dataprofiler/profilers/profiler_options.py +++ b/dataprofiler/profilers/profiler_options.py @@ -269,6 +269,42 @@ def _validate_helper(self, variable_path: str = "ModeOption") -> list[str]: return errors +class NumQuantilesOption(BooleanOption): + """For setting number of quantile options.""" + + def __init__(self, is_enabled: bool = True, num_quantiles: int = 1000) -> None: + """ + Initialize options for number of quantiles. + + :ivar is_enabled: boolean option to enable/disable the option. + :vartype is_enabled: bool + :ivar num_quantiles: the number of quantiles to bin the data. + :vartype num_quantiles: int + """ + self.num_quantiles = num_quantiles + super().__init__(is_enabled=is_enabled) + + def _validate_helper(self, variable_path: str = "NumQuantilesOption") -> list[str]: + """ + Validate the options do not conflict and cause errors. + + :param variable_path: current path to variable set. + :type variable_path: str + :return: list of errors (if raise_error is false) + :rtype: list(str) + """ + errors = super()._validate_helper(variable_path=variable_path) + + if self.num_quantiles is not None and ( + not isinstance(self.num_quantiles, int) or self.num_quantiles < 1 + ): + errors.append( + "{}.num_quantiles must be either None" + " or a positive integer".format(variable_path) + ) + return errors + + class BaseInspectorOptions(BooleanOption): """For setting Base options.""" @@ -353,7 +389,8 @@ def __init__(self) -> None: :ivar num_negatives: boolean option to enable/disable num_negatives :vartype num_negatives: BooleanOption :ivar num_quantiles: boolean option to enable/disable num_quantiles - :vartype num_quantiles: BooleanOption + and set the number of quantiles + :vartype num_quantiles: NumQuantilesOption :ivar is_numeric_stats_enabled: boolean to enable/disable all numeric stats :vartype is_numeric_stats_enabled: bool @@ -369,7 +406,7 @@ def __init__(self) -> None: self.median_abs_deviation = BooleanOption(is_enabled=True) self.num_zeros = BooleanOption(is_enabled=True) self.num_negatives = BooleanOption(is_enabled=True) - self.num_quantiles = BooleanOption(is_enabled=True) + self.num_quantiles = NumQuantilesOption(is_enabled=True) self.histogram_and_quantiles = HistogramOption() # By default, we correct for bias self.bias_correction = BooleanOption(is_enabled=True) @@ -576,7 +613,8 @@ def __init__(self) -> None: :ivar num_negatives: boolean option to enable/disable num_negatives :vartype num_negatives: BooleanOption :ivar num_quantiles: boolean option to enable/disable num_quantiles - :vartype num_quantiles: BooleanOption + and set the number of quantiles + :vartype num_quantiles: NumQuantilesOption :ivar is_numeric_stats_enabled: boolean to enable/disable all numeric stats :vartype is_numeric_stats_enabled: bool @@ -675,7 +713,8 @@ def __init__(self) -> None: :ivar num_negatives: boolean option to enable/disable num_negatives :vartype num_negatives: BooleanOption :ivar num_quantiles: boolean option to enable/disable num_quantiles - :vartype num_quantiles: BooleanOption + and set the number of quantiles + :vartype num_quantiles: NumQuantilesOption :ivar is_numeric_stats_enabled: boolean to enable/disable all numeric stats :vartype is_numeric_stats_enabled: bool @@ -736,7 +775,8 @@ def __init__(self) -> None: :ivar num_negatives: boolean option to enable/disable num_negatives :vartype num_negatives: BooleanOption :ivar num_quantiles: boolean option to enable/disable num_quantiles - :vartype num_quantiles: BooleanOption + and set the number of quantiles + :vartype num_quantiles: NumQuantilesOption :ivar is_numeric_stats_enabled: boolean to enable/disable all numeric stats :vartype is_numeric_stats_enabled: bool From 4045c58440ef51ff3917ce15faeef7a20a146a67 Mon Sep 17 00:00:00 2001 From: clee1152 Date: Tue, 27 Jun 2023 09:57:39 -0400 Subject: [PATCH 07/15] Added tests for `num_quantiles`. --- .../test_numerical_options.py | 20 +++++++++++++++++++ 1 file changed, 20 insertions(+) diff --git a/dataprofiler/tests/profilers/profiler_options/test_numerical_options.py b/dataprofiler/tests/profilers/profiler_options/test_numerical_options.py index 63ed6f04f..56664dc53 100644 --- a/dataprofiler/tests/profilers/profiler_options/test_numerical_options.py +++ b/dataprofiler/tests/profilers/profiler_options/test_numerical_options.py @@ -20,6 +20,7 @@ class TestNumericalOptions(TestBaseInspectorOptions): "median_abs_deviation", "num_zeros", "num_negatives", + "num_quantiles", "histogram_and_quantiles", ] numeric_keys = [ @@ -35,6 +36,7 @@ class TestNumericalOptions(TestBaseInspectorOptions): "histogram_and_quantiles", "num_zeros", "num_negatives", + "num_quantiles", ] def test_init(self): @@ -123,6 +125,24 @@ def test_validate_helper(self): self.assertEqual([mode_error], options._validate_helper()) options.set({"mode.top_k_modes": 5}) + # Zero num_quantiles + options.set( + {"num_quantiles.is_enabled": True, "num_quantiles.num_quantiles": 0} + ) + num_quantiles_error = ( + "{}.num_quantiles.num_quantiles must be either None" + " or a positive integer".format(optpth) + ) + self.assertEqual([num_quantiles_error], options._validate_helper()) + # Negative num_quantiles + options.set({"num_quantiles.num_quantiles": -5}) + num_quantiles_error = ( + "{}.num_quantiles.num_quantiles must be either None" + " or a positive integer".format(optpth) + ) + self.assertEqual([num_quantiles_error], options._validate_helper()) + options.set({"num_quantiles.num_quantiles": 1000}) + # Disable Sum and Enable Variance options.set( { From ea603d2c01bc59d240bb8c0b5c629c35f3f89a5c Mon Sep 17 00:00:00 2001 From: clee1152 Date: Tue, 27 Jun 2023 16:48:59 -0400 Subject: [PATCH 08/15] Fixed tests for num_quantiles option in column profiles. --- dataprofiler/profilers/numerical_column_stats.py | 1 - dataprofiler/tests/profilers/test_float_column_profile.py | 1 + dataprofiler/tests/profilers/test_int_column_profile.py | 3 +++ .../tests/profilers/test_numeric_stats_mixin_profile.py | 2 ++ 4 files changed, 6 insertions(+), 1 deletion(-) diff --git a/dataprofiler/profilers/numerical_column_stats.py b/dataprofiler/profilers/numerical_column_stats.py index 627dc1fb0..a87447cf4 100644 --- a/dataprofiler/profilers/numerical_column_stats.py +++ b/dataprofiler/profilers/numerical_column_stats.py @@ -384,7 +384,6 @@ def profile(self) -> dict: skewness=self.np_type_to_type(self.skewness), kurtosis=self.np_type_to_type(self.kurtosis), histogram=self._get_best_histogram_for_profile(), - num_quantiles=self.num_quantiles, quantiles=self.quantiles, median_abs_deviation=self.np_type_to_type(self.median_abs_deviation), num_zeros=self.np_type_to_type(self.num_zeros), diff --git a/dataprofiler/tests/profilers/test_float_column_profile.py b/dataprofiler/tests/profilers/test_float_column_profile.py index 9c7db90ea..f2dd7b23e 100644 --- a/dataprofiler/tests/profilers/test_float_column_profile.py +++ b/dataprofiler/tests/profilers/test_float_column_profile.py @@ -32,6 +32,7 @@ def test_base_case(self): self.assertTrue(profiler.kurtosis is np.nan) self.assertTrue(profiler.stddev is np.nan) self.assertIsNone(profiler.histogram_selection) + self.assertEqual(profiler.num_quantiles, 1000) self.assertEqual(len(profiler.quantiles), 999) self.assertIsNone(profiler.data_type_ratio) diff --git a/dataprofiler/tests/profilers/test_int_column_profile.py b/dataprofiler/tests/profilers/test_int_column_profile.py index 50ca859cf..5a2a77540 100644 --- a/dataprofiler/tests/profilers/test_int_column_profile.py +++ b/dataprofiler/tests/profilers/test_int_column_profile.py @@ -1146,6 +1146,7 @@ def test_json_encode(self): "_mode_is_enabled": True, "num_zeros": 0, "num_negatives": 0, + "num_quantiles": 1000, "histogram_methods": expected_historam_methods, "_stored_histogram": { "total_loss": 0, @@ -1187,6 +1188,7 @@ def test_json_encode_after_update(self, time): int_options = IntOptions() int_options.histogram_and_quantiles.bin_count_or_method = 5 + int_options.num_quantiles.num_quantiles = 4 profiler = IntColumn("0", int_options) mocked_quantiles = [0.25, 0.50, 0.75] @@ -1221,6 +1223,7 @@ def test_json_encode_after_update(self, time): "_mode_is_enabled": True, "num_zeros": 1, "num_negatives": 0, + "num_quantiles": 4, "histogram_selection": None, "user_set_histogram_bin": 5, "bias_correction": True, diff --git a/dataprofiler/tests/profilers/test_numeric_stats_mixin_profile.py b/dataprofiler/tests/profilers/test_numeric_stats_mixin_profile.py index 6e4299db6..70214b20b 100644 --- a/dataprofiler/tests/profilers/test_numeric_stats_mixin_profile.py +++ b/dataprofiler/tests/profilers/test_numeric_stats_mixin_profile.py @@ -720,6 +720,7 @@ def test_profile(self): num_profiler.histogram_selection = "auto" num_profiler.histogram_methods["auto"]["histogram"] = mock_profile["histogram"] num_profiler.quantiles = mock_profile["quantiles"] + num_profiler.num_quantiles = 4 num_profiler.times = mock_profile["times"] time_array = [float(i) for i in range(100, 0, -1)] @@ -1195,6 +1196,7 @@ def test_json_encode(self): "_mode_is_enabled": True, "num_zeros": 0, "num_negatives": 0, + "num_quantiles": 1000, "histogram_methods": expected_historam_methods, "_stored_histogram": { "total_loss": 0, From 1c44351439cf8afe0868adbc3032a3427aae4a21 Mon Sep 17 00:00:00 2001 From: clee1152 Date: Tue, 27 Jun 2023 21:18:04 -0400 Subject: [PATCH 09/15] Added scipy requirements to requirements.txt --- requirements.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/requirements.txt b/requirements.txt index b4d420d4f..7c63ff126 100644 --- a/requirements.txt +++ b/requirements.txt @@ -10,7 +10,7 @@ fastavro>=1.0.0.post1 python-snappy>=0.5.4 charset-normalizer>=1.3.6 psutil>=4.0.0 -scipy>=1.4.1 +scipy>=1.4.1,<1.11.0 requests>=2.28.1 networkx>=2.5.1 typing-extensions>=3.10.0.2 From de8104c443ea5d1fcfcbcd619be4b5f8f5a045a4 Mon Sep 17 00:00:00 2001 From: clee1152 Date: Wed, 28 Jun 2023 09:50:25 -0400 Subject: [PATCH 10/15] Added tests for num_quantiles option. --- dataprofiler/profilers/profiler_options.py | 3 +- .../test_num_quantiles_option.py | 142 ++++++++++++++++++ .../test_numerical_options.py | 18 --- 3 files changed, 143 insertions(+), 20 deletions(-) create mode 100644 dataprofiler/tests/profilers/profiler_options/test_num_quantiles_option.py diff --git a/dataprofiler/profilers/profiler_options.py b/dataprofiler/profilers/profiler_options.py index 09d9f6736..3ac96f9f4 100644 --- a/dataprofiler/profilers/profiler_options.py +++ b/dataprofiler/profilers/profiler_options.py @@ -299,8 +299,7 @@ def _validate_helper(self, variable_path: str = "NumQuantilesOption") -> list[st not isinstance(self.num_quantiles, int) or self.num_quantiles < 1 ): errors.append( - "{}.num_quantiles must be either None" - " or a positive integer".format(variable_path) + "{}.num_quantiles must be " "a positive integer.".format(variable_path) ) return errors diff --git a/dataprofiler/tests/profilers/profiler_options/test_num_quantiles_option.py b/dataprofiler/tests/profilers/profiler_options/test_num_quantiles_option.py new file mode 100644 index 000000000..6ad5ec327 --- /dev/null +++ b/dataprofiler/tests/profilers/profiler_options/test_num_quantiles_option.py @@ -0,0 +1,142 @@ +from dataprofiler.profilers.profiler_options import NumQuantilesOption +from dataprofiler.tests.profilers.profiler_options.test_boolean_option import ( + TestBooleanOption, +) + + +class TestNumQuantilesOption(TestBooleanOption): + + option_class = NumQuantilesOption + keys = [] + + def test_init(self): + option = self.get_options() + + self.assertDictEqual( + {"is_enabled": True, "num_quantiles": 1000}, option.properties + ) + option = self.get_options(is_enabled=False) + self.assertDictEqual( + {"is_enabled": False, "num_quantiles": 1000}, option.properties + ) + option = self.get_options(is_enabled=False, num_quantiles=50) + self.assertDictEqual( + {"is_enabled": False, "num_quantiles": 50}, option.properties + ) + + def test_set_helper(self): + super().test_set_helper + option = self.get_options() + + # validate, variable path being passed + expected_error = ( + "type object 'test.num_quantiles' has no " "attribute 'is_enabled'" + ) + with self.assertRaisesRegex(AttributeError, expected_error): + option._set_helper({"num_quantiles.is_enabled": True}, "test") + + def test_set(self): + super().test_set() + option = self.get_options() + option.set({"num_quantiles": 50}) + self.assertDictEqual( + {"is_enabled": True, "num_quantiles": 50}, option.properties + ) + + def test_validate_helper(self): + super().test_validate_helper() + + optpth = self.get_options_path() + + # Default configuration + option = self.get_options(num_quantiles=1000) + self.assertEqual([], option._validate_helper()) + + # Valid configurations + option = self.get_options(num_quantiles=50) + self.assertEqual([], option._validate_helper()) + option = self.get_options(num_quantiles=2000) + self.assertEqual([], option._validate_helper()) + option = self.get_options(num_quantiles=1) + self.assertEqual([], option._validate_helper()) + + # Option num_quantiles + option = self.get_options(num_quantiles="Hello World") + expected_error = [f"{optpth}.num_quantiles must be a positive integer."] + self.assertSetEqual(set(expected_error), set(option._validate_helper())) + + # Option num_quantiles cannot be a float, must be an int + option = self.get_options(num_quantiles=1.1) + expected_error = [f"{optpth}.num_quantiles must be a positive integer."] + self.assertSetEqual(set(expected_error), set(option._validate_helper())) + + # Option num_quantiles must be a positive integer + option = self.get_options(num_quantiles=0) + expected_error = [f"{optpth}.num_quantiles must be a positive integer."] + self.assertSetEqual(set(expected_error), set(option._validate_helper())) + + # Option num_quantiles must be a positive integer + option = self.get_options(num_quantiles=-5) + expected_error = [f"{optpth}.num_quantiles must be a positive integer."] + self.assertSetEqual(set(expected_error), set(option._validate_helper())) + + def test_validate(self): + super().test_validate() + + optpth = self.get_options_path() + + # Default configuration + option = self.get_options(num_quantiles=1000) + self.assertEqual([], option._validate_helper()) + + # Valid configurations + option = self.get_options(num_quantiles=50) + self.assertEqual([], option._validate_helper()) + option = self.get_options(num_quantiles=2000) + self.assertEqual([], option._validate_helper()) + option = self.get_options(num_quantiles=1) + self.assertEqual([], option._validate_helper()) + + # Option num_quantiles cannot be a string, must be an int + option = self.get_options(num_quantiles="Hello World") + expected_error = ( + "NumQuantilesOption.num_quantiles must be a " "positive integer" + ) + with self.assertRaisesRegex(ValueError, expected_error): + option.validate() + + # Option num_quantiles cannot be a float, must be an int + option = self.get_options(num_quantiles=1.1) + expected_error = ( + "NumQuantilesOption.num_quantiles must be a " "positive integer" + ) + with self.assertRaisesRegex(ValueError, expected_error): + option.validate() + + # Option num_quantiles must be a positive integer + option = self.get_options(num_quantiles=0) + expected_error = ( + "NumQuantilesOption.num_quantiles must be a " "positive integer" + ) + with self.assertRaisesRegex(ValueError, expected_error): + option.validate() + + # Option num_quantiles must be a positive integer + option = self.get_options(num_quantiles=-5) + expected_error = ( + "NumQuantilesOption.num_quantiles must be a " "positive integer" + ) + with self.assertRaisesRegex(ValueError, expected_error): + option.validate() + + def test_eq(self): + super().test_eq() + + options = self.get_options() + options2 = self.get_options() + options.num_quantiles = 30 + self.assertNotEqual(options, options2) + options2.num_quantiles = 50 + self.assertNotEqual(options, options2) + options2.num_quantiles = 30 + self.assertEqual(options, options2) diff --git a/dataprofiler/tests/profilers/profiler_options/test_numerical_options.py b/dataprofiler/tests/profilers/profiler_options/test_numerical_options.py index 56664dc53..560c82db5 100644 --- a/dataprofiler/tests/profilers/profiler_options/test_numerical_options.py +++ b/dataprofiler/tests/profilers/profiler_options/test_numerical_options.py @@ -125,24 +125,6 @@ def test_validate_helper(self): self.assertEqual([mode_error], options._validate_helper()) options.set({"mode.top_k_modes": 5}) - # Zero num_quantiles - options.set( - {"num_quantiles.is_enabled": True, "num_quantiles.num_quantiles": 0} - ) - num_quantiles_error = ( - "{}.num_quantiles.num_quantiles must be either None" - " or a positive integer".format(optpth) - ) - self.assertEqual([num_quantiles_error], options._validate_helper()) - # Negative num_quantiles - options.set({"num_quantiles.num_quantiles": -5}) - num_quantiles_error = ( - "{}.num_quantiles.num_quantiles must be either None" - " or a positive integer".format(optpth) - ) - self.assertEqual([num_quantiles_error], options._validate_helper()) - options.set({"num_quantiles.num_quantiles": 1000}) - # Disable Sum and Enable Variance options.set( { From 26554ae58a67a272314f433134635796af154747 Mon Sep 17 00:00:00 2001 From: clee1152 Date: Wed, 28 Jun 2023 13:25:41 -0400 Subject: [PATCH 11/15] Update dataprofiler/profilers/profiler_options.py Co-authored-by: Taylor Turner --- dataprofiler/profilers/profiler_options.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/dataprofiler/profilers/profiler_options.py b/dataprofiler/profilers/profiler_options.py index 3ac96f9f4..8e100f547 100644 --- a/dataprofiler/profilers/profiler_options.py +++ b/dataprofiler/profilers/profiler_options.py @@ -299,7 +299,7 @@ def _validate_helper(self, variable_path: str = "NumQuantilesOption") -> list[st not isinstance(self.num_quantiles, int) or self.num_quantiles < 1 ): errors.append( - "{}.num_quantiles must be " "a positive integer.".format(variable_path) + "{}.num_quantiles must be a positive integer.".format(variable_path) ) return errors From 382f11c1ce2b0c8b75fb1eda295420a9d2a043c4 Mon Sep 17 00:00:00 2001 From: clee1152 Date: Wed, 28 Jun 2023 13:26:33 -0400 Subject: [PATCH 12/15] Update dataprofiler/tests/profilers/profiler_options/test_num_quantiles_option.py Co-authored-by: Taylor Turner --- .../profilers/profiler_options/test_num_quantiles_option.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/dataprofiler/tests/profilers/profiler_options/test_num_quantiles_option.py b/dataprofiler/tests/profilers/profiler_options/test_num_quantiles_option.py index 6ad5ec327..e2537f411 100644 --- a/dataprofiler/tests/profilers/profiler_options/test_num_quantiles_option.py +++ b/dataprofiler/tests/profilers/profiler_options/test_num_quantiles_option.py @@ -75,7 +75,7 @@ def test_validate_helper(self): expected_error = [f"{optpth}.num_quantiles must be a positive integer."] self.assertSetEqual(set(expected_error), set(option._validate_helper())) - # Option num_quantiles must be a positive integer + # Option num_quantiles cannot be a negative integer option = self.get_options(num_quantiles=-5) expected_error = [f"{optpth}.num_quantiles must be a positive integer."] self.assertSetEqual(set(expected_error), set(option._validate_helper())) From 3a19c241427d17b6774934da6f2463a7212b0456 Mon Sep 17 00:00:00 2001 From: clee1152 Date: Wed, 28 Jun 2023 13:26:40 -0400 Subject: [PATCH 13/15] Minor edits to tests --- .../test_num_quantiles_option.py | 22 ++++++------------- 1 file changed, 7 insertions(+), 15 deletions(-) diff --git a/dataprofiler/tests/profilers/profiler_options/test_num_quantiles_option.py b/dataprofiler/tests/profilers/profiler_options/test_num_quantiles_option.py index e2537f411..a59d81293 100644 --- a/dataprofiler/tests/profilers/profiler_options/test_num_quantiles_option.py +++ b/dataprofiler/tests/profilers/profiler_options/test_num_quantiles_option.py @@ -30,7 +30,7 @@ def test_set_helper(self): # validate, variable path being passed expected_error = ( - "type object 'test.num_quantiles' has no " "attribute 'is_enabled'" + "type object 'test.num_quantiles' has no attribute 'is_enabled'" ) with self.assertRaisesRegex(AttributeError, expected_error): option._set_helper({"num_quantiles.is_enabled": True}, "test") @@ -70,7 +70,7 @@ def test_validate_helper(self): expected_error = [f"{optpth}.num_quantiles must be a positive integer."] self.assertSetEqual(set(expected_error), set(option._validate_helper())) - # Option num_quantiles must be a positive integer + # Option num_quantiles may not be zero, must be greater than one(1) option = self.get_options(num_quantiles=0) expected_error = [f"{optpth}.num_quantiles must be a positive integer."] self.assertSetEqual(set(expected_error), set(option._validate_helper())) @@ -99,33 +99,25 @@ def test_validate(self): # Option num_quantiles cannot be a string, must be an int option = self.get_options(num_quantiles="Hello World") - expected_error = ( - "NumQuantilesOption.num_quantiles must be a " "positive integer" - ) + expected_error = "NumQuantilesOption.num_quantiles must be a positive integer" with self.assertRaisesRegex(ValueError, expected_error): option.validate() # Option num_quantiles cannot be a float, must be an int option = self.get_options(num_quantiles=1.1) - expected_error = ( - "NumQuantilesOption.num_quantiles must be a " "positive integer" - ) + expected_error = "NumQuantilesOption.num_quantiles must be a positive integer" with self.assertRaisesRegex(ValueError, expected_error): option.validate() # Option num_quantiles must be a positive integer option = self.get_options(num_quantiles=0) - expected_error = ( - "NumQuantilesOption.num_quantiles must be a " "positive integer" - ) + expected_error = "NumQuantilesOption.num_quantiles must be a positive integer" with self.assertRaisesRegex(ValueError, expected_error): option.validate() - # Option num_quantiles must be a positive integer + # Option num_quantiles cannot be a negative integer option = self.get_options(num_quantiles=-5) - expected_error = ( - "NumQuantilesOption.num_quantiles must be a " "positive integer" - ) + expected_error = "NumQuantilesOption.num_quantiles must be a positive integer" with self.assertRaisesRegex(ValueError, expected_error): option.validate() From bc5f195773bc37bdf413ae4c077c44ce3a9de383 Mon Sep 17 00:00:00 2001 From: Christopher Lee Date: Wed, 28 Jun 2023 14:09:36 -0400 Subject: [PATCH 14/15] Empty commit From 48cbdf4a498a5b54dd7b0a3529227dec3e408e3c Mon Sep 17 00:00:00 2001 From: Christopher Lee Date: Wed, 28 Jun 2023 14:13:38 -0400 Subject: [PATCH 15/15] Precommits --- dataprofiler/profilers/profiler_options.py | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/dataprofiler/profilers/profiler_options.py b/dataprofiler/profilers/profiler_options.py index 8e100f547..7737ac435 100644 --- a/dataprofiler/profilers/profiler_options.py +++ b/dataprofiler/profilers/profiler_options.py @@ -298,9 +298,8 @@ def _validate_helper(self, variable_path: str = "NumQuantilesOption") -> list[st if self.num_quantiles is not None and ( not isinstance(self.num_quantiles, int) or self.num_quantiles < 1 ): - errors.append( - "{}.num_quantiles must be a positive integer.".format(variable_path) - ) + errors.append(f"{variable_path}.num_quantiles must be a positive integer.") + return errors