Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

HistogramOptions now named to HistogramAndQuantilesOptions, implement num_quantiles option to HistogramAndQuantilesOptions #956

Merged
merged 24 commits into from
Jul 20, 2023
Merged
Show file tree
Hide file tree
Changes from 18 commits
Commits
Show all changes
24 commits
Select commit Hold shift + click to select a range
eb57b12
Implemented options for `num_quantiles` feature in `NumericStatsMixin`
clee1152 Jun 27, 2023
a739473
Implemented options for `num_quantiles.py`.
clee1152 Jun 27, 2023
d20da0c
Added tests for `num_quantiles`.
clee1152 Jun 27, 2023
3676bc7
Fixed tests for num_quantiles option in column profiles.
clee1152 Jun 27, 2023
00debad
Added tests for num_quantiles option.
clee1152 Jun 28, 2023
edaa50b
Update dataprofiler/tests/profilers/profiler_options/test_num_quantil…
clee1152 Jun 28, 2023
7cf48b6
Minor edits to tests
clee1152 Jun 28, 2023
3c69c2a
Empty commit
clee1152 Jun 28, 2023
10f11d3
Precommits
clee1152 Jul 10, 2023
bfb0ebd
Minor change to Makefile.
clee1152 Jul 10, 2023
ca8e060
Got rid of print statements.
clee1152 Jul 10, 2023
6d027ca
"empty commit"
clee1152 Jul 13, 2023
b47a632
Added deprecation for `HistogramOption`.
clee1152 Jul 19, 2023
47b778d
Cleaner deprecation: removed `HistogramOption` class.
clee1152 Jul 19, 2023
e830658
json_decoder, test_h_a_q changes.
clee1152 Jul 19, 2023
ef5a883
Fixed string bug for testing in test_histogram_and_quantiles
clee1152 Jul 20, 2023
9dac09e
Explicitfied deprecation warning, validated profile conversion from h…
clee1152 Jul 20, 2023
3932679
Minor deprecation warning wording change.
clee1152 Jul 20, 2023
ac1606e
Update dataprofiler/tests/profilers/profiler_options/test_histogram_a…
clee1152 Jul 20, 2023
6c568d8
Changed deprecation warning.
clee1152 Jul 20, 2023
2bbed1e
Changed quantiles option to initialize to None.
clee1152 Jul 20, 2023
7c40959
Minor change
clee1152 Jul 20, 2023
38c3301
Minor change again D:
clee1152 Jul 20, 2023
518e848
Minor change again again D: sadness
clee1152 Jul 20, 2023
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
5 changes: 3 additions & 2 deletions dataprofiler/profilers/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -28,7 +28,7 @@
DataLabelerOptions,
DateTimeOptions,
FloatOptions,
HistogramOption,
HistogramAndQuantilesOption,
HyperLogLogOptions,
IntOptions,
ModeOption,
Expand Down Expand Up @@ -66,7 +66,8 @@

json_decoder._options = {
BooleanOption.__name__: BooleanOption,
HistogramOption.__name__: HistogramOption,
JGSweets marked this conversation as resolved.
Show resolved Hide resolved
"HistogramOption": HistogramAndQuantilesOption,
HistogramAndQuantilesOption.__name__: HistogramAndQuantilesOption,
ModeOption.__name__: ModeOption,
BaseInspectorOptions.__name__: BaseInspectorOptions,
NumericalOptions.__name__: NumericalOptions,
Expand Down
9 changes: 9 additions & 0 deletions dataprofiler/profilers/json_decoder.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,7 @@
"""Contains methods to decode components of a Profiler."""
from __future__ import annotations

import warnings
from typing import TYPE_CHECKING

if TYPE_CHECKING:
Expand Down Expand Up @@ -72,6 +73,14 @@ def get_option_class(class_name: str) -> type[BaseOption]:
options_class: type[BaseOption] | None = _options.get(class_name)
if options_class is None:
raise ValueError(f"Invalid option class {class_name} " f"failed to load.")

if class_name == "HistogramOption":
warnings.warn(
f"{class_name} will be deprecated in the future. HistogramOption has been "
clee1152 marked this conversation as resolved.
Show resolved Hide resolved
"mapped to HistogramAndQuantilesOption while JSON decoding. Please begin "
"utilizing the new HistogramAndQuantilesOption class.",
DeprecationWarning,
)
return options_class


Expand Down
14 changes: 12 additions & 2 deletions dataprofiler/profilers/numerical_column_stats.py
Original file line number Diff line number Diff line change
Expand Up @@ -82,14 +82,15 @@ def __init__(self, options: NumericalOptions = None) -> None:
self._mode_is_enabled: bool = True
self.num_zeros: int | np.int64 = np.int64(0)
self.num_negatives: int | np.int64 = np.int64(0)
self._num_quantiles: int = 1000 # TODO: add to options
self._num_quantiles: int = 1000 # By default, we use 1000 quantiles

if options:
self.bias_correction = options.bias_correction.is_enabled
self._top_k_modes = options.mode.top_k_modes
self._median_is_enabled = options.median.is_enabled
self._median_abs_dev_is_enabled = options.median_abs_deviation.is_enabled
self._mode_is_enabled = options.mode.is_enabled
self._num_quantiles = options.histogram_and_quantiles.num_quantiles
bin_count_or_method = options.histogram_and_quantiles.bin_count_or_method
if isinstance(bin_count_or_method, str):
self.histogram_bin_method_names = [bin_count_or_method]
Expand All @@ -113,7 +114,9 @@ def __init__(self, options: NumericalOptions = None) -> None:
"suggested_bin_count": self.min_histogram_bin,
"histogram": {"bin_counts": None, "bin_edges": None},
}
self.quantiles: list[float] | None = None
clee1152 marked this conversation as resolved.
Show resolved Hide resolved
self.quantiles: list[float] | dict = {
bin_num: None for bin_num in range(self._num_quantiles - 1)
}
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

☠️

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I'm shook-eth

self.__calculations = {
"min": NumericStatsMixin._get_min,
"max": NumericStatsMixin._get_max,
Expand Down Expand Up @@ -459,6 +462,13 @@ def convert_histogram_key_types_to_np(histogram_info: dict):
self.histogram_methods[key]
)

# Convert quantile keys to correct types
if isinstance(self.quantiles, dict):
new_quantiles = dict()
for key in self.quantiles.keys():
new_quantiles[int(key)] = self.quantiles[key]
self.quantiles = new_quantiles

if self.min is not None:
self.min = np.float64(self.min)
if self.max is not None:
Expand Down
21 changes: 18 additions & 3 deletions dataprofiler/profilers/profiler_options.py
Original file line number Diff line number Diff line change
Expand Up @@ -210,13 +210,14 @@ def _validate_helper(self, variable_path: str = "BooleanOption") -> list[str]:
return errors


class HistogramOption(BooleanOption["HistogramOption"]):
class HistogramAndQuantilesOption(BooleanOption["HistogramAndQuantilesOption"]):
"""For setting histogram options."""

def __init__(
self,
is_enabled: bool = True,
bin_count_or_method: str | int | list[str] = "auto",
num_quantiles: int = 1000,
) -> None:
"""
Initialize Options for histograms.
Expand All @@ -226,11 +227,17 @@ def __init__(
:ivar bin_count_or_method: bin count or the method with which to
calculate histograms
:vartype bin_count_or_method: Union[str, int, list(str)]
:ivar num_quantiles: boolean option to enable/disable num_quantiles
and set the number of quantiles
:vartype num_quantiles: int
"""
self.bin_count_or_method = bin_count_or_method
self.num_quantiles = num_quantiles
super().__init__(is_enabled=is_enabled)

def _validate_helper(self, variable_path: str = "HistogramOption") -> list[str]:
def _validate_helper(
self, variable_path: str = "HistogramAndQuantilesOption"
) -> list[str]:
"""
Validate the options do not conflict and cause errors.

Expand Down Expand Up @@ -260,6 +267,12 @@ def _validate_helper(self, variable_path: str = "HistogramOption") -> list[str]:
"than 1, a string, or list of strings from the "
"following: {}.".format(variable_path, valid_methods)
)

if self.num_quantiles is not None and (
not isinstance(self.num_quantiles, int) or self.num_quantiles < 1
):
errors.append(f"{variable_path}.num_quantiles must be a positive integer.")

return errors


Expand Down Expand Up @@ -396,7 +409,9 @@ def __init__(self) -> None:
self.median_abs_deviation: BooleanOption = BooleanOption(is_enabled=True)
self.num_zeros: BooleanOption = BooleanOption(is_enabled=True)
self.num_negatives: BooleanOption = BooleanOption(is_enabled=True)
self.histogram_and_quantiles: HistogramOption = HistogramOption()
self.histogram_and_quantiles: HistogramAndQuantilesOption = (
clee1152 marked this conversation as resolved.
Show resolved Hide resolved
HistogramAndQuantilesOption()
)
# By default, we correct for bias
self.bias_correction: BooleanOption = BooleanOption(is_enabled=True)
BaseInspectorOptions.__init__(self)
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -94,7 +94,7 @@ def test_json_encode(self):
"data": {"is_enabled": True},
},
"histogram_and_quantiles": {
"class": "HistogramOption",
"class": "HistogramAndQuantilesOption",
"data": mock.ANY,
},
"bias_correction": {
Expand Down
Original file line number Diff line number Diff line change
@@ -1,20 +1,23 @@
import json

from dataprofiler.profilers.json_decoder import load_option
from dataprofiler.profilers.json_encoder import ProfileEncoder
from dataprofiler.profilers.profiler_options import HistogramOption
from dataprofiler.profilers.profiler_options import HistogramAndQuantilesOption

from .. import utils as test_utils
from .test_boolean_option import TestBooleanOption


class TestHistogramOption(TestBooleanOption):
class TestHistogramAndQuantilesOption(TestBooleanOption):

option_class = HistogramOption
option_class = HistogramAndQuantilesOption
keys = []

def test_init(self):
option = self.get_options()
self.assertTrue(option.is_enabled)
self.assertEqual(option.bin_count_or_method, "auto")
self.assertEqual(option.num_quantiles, 1000)

def test_set_helper(self):
option = self.get_options()
Expand All @@ -26,6 +29,13 @@ def test_set_helper(self):
with self.assertRaisesRegex(AttributeError, expected_error):
option._set_helper({"bin_count_or_method.is_enabled": True}, "test")

# validate, variable path being passed
expected_error = (
"type object 'test.num_quantiles' has no attribute 'is_enabled'"
)
with self.assertRaisesRegex(AttributeError, expected_error):
option._set_helper({"num_quantiles.is_enabled": True}, "test")

def test_set(self):
option = self.get_options()

Expand Down Expand Up @@ -71,13 +81,58 @@ def _assert_set_helper(prop, value):
with self.assertRaisesRegex(AttributeError, expected_error):
option.set({"bin_count_or_method.is_enabled": True})

# Treat num_quantiles as a BooleanOption
expected_error = "type object 'num_quantiles' has no attribute " "'is_enabled'"
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

fix this spacing

with self.assertRaisesRegex(AttributeError, expected_error):
option.set({"num_quantiles.is_enabled": True})

# Test set option for num_quantiles
option.set({"num_quantiles": 50})
self.assertEqual(option.num_quantiles, 50)

def test_validate_helper(self):
super().test_validate_helper()

optpth = self.get_options_path()

# Default configuration
option = self.get_options(num_quantiles=1000)
self.assertEqual([], option._validate_helper())

# Valid configurations
option = self.get_options(num_quantiles=50)
self.assertEqual([], option._validate_helper())
option = self.get_options(num_quantiles=2000)
self.assertEqual([], option._validate_helper())
option = self.get_options(num_quantiles=1)
self.assertEqual([], option._validate_helper())

# Option num_quantiles
option = self.get_options(num_quantiles="Hello World")
expected_error = [f"{optpth}.num_quantiles must be a positive integer."]
self.assertSetEqual(set(expected_error), set(option._validate_helper()))

# Option num_quantiles cannot be a float, must be an int
option = self.get_options(num_quantiles=1.1)
expected_error = [f"{optpth}.num_quantiles must be a positive integer."]
self.assertSetEqual(set(expected_error), set(option._validate_helper()))

# Option num_quantiles may not be zero, must be greater than one(1)
option = self.get_options(num_quantiles=0)
expected_error = [f"{optpth}.num_quantiles must be a positive integer."]
self.assertSetEqual(set(expected_error), set(option._validate_helper()))

# Option num_quantiles cannot be a negative integer
option = self.get_options(num_quantiles=-5)
expected_error = [f"{optpth}.num_quantiles must be a positive integer."]
self.assertSetEqual(set(expected_error), set(option._validate_helper()))

def test_validate(self):

super().test_validate()

optpth = self.get_options_path()

params_to_check = [
# non errors
dict(prop="is_enabled", value_list=[False, True], errors=[]),
Expand Down Expand Up @@ -115,7 +170,7 @@ def test_validate(self):
"1",
],
errors=[
"HistogramOption.bin_count_or_method must be an integer "
"HistogramAndQuantilesOption.bin_count_or_method must be an integer "
"more than 1, a string, or list of strings from the "
"following: ['auto', 'fd', 'doane', 'scott', 'rice', "
"'sturges', 'sqrt']."
Expand Down Expand Up @@ -155,13 +210,45 @@ def test_validate(self):
# this time testing raising an error
option.bin_count_or_method = "fake method"
expected_error = (
r"HistogramOption.bin_count_or_method must be an integer more than "
r"HistogramAndQuantilesOption.bin_count_or_method must be an integer more than "
r"1, a string, or list of strings from the following: "
r"\['auto', 'fd', 'doane', 'scott', 'rice', 'sturges', 'sqrt']."
)
with self.assertRaisesRegex(ValueError, expected_error):
option.validate()

# Valid configurations
option = self.get_options(num_quantiles=50)
self.assertEqual([], option._validate_helper())
option = self.get_options(num_quantiles=2000)
self.assertEqual([], option._validate_helper())
option = self.get_options(num_quantiles=1)
self.assertEqual([], option._validate_helper())

# Option num_quantiles cannot be a string, must be an int
option = self.get_options(num_quantiles="Hello World")
expected_error = f"{optpth}.num_quantiles must be a positive integer"
with self.assertRaisesRegex(ValueError, expected_error):
option.validate()

# Option num_quantiles cannot be a float, must be an int
option = self.get_options(num_quantiles=1.1)
expected_error = f"{optpth}.num_quantiles must be a positive integer"
with self.assertRaisesRegex(ValueError, expected_error):
option.validate()

# Option num_quantiles must be a positive integer
option = self.get_options(num_quantiles=0)
expected_error = f"{optpth}.num_quantiles must be a positive integer"
with self.assertRaisesRegex(ValueError, expected_error):
option.validate()

# Option num_quantiles cannot be a negative integer
option = self.get_options(num_quantiles=-5)
expected_error = f"{optpth}.num_quantiles must be a positive integer"
with self.assertRaisesRegex(ValueError, expected_error):
option.validate()

def test_eq(self):
super().test_eq()

Expand All @@ -173,18 +260,52 @@ def test_eq(self):
self.assertNotEqual(options, options2)
options2.bin_count_or_method = "sturges"
self.assertEqual(options, options2)
options.num_quantiles = 30
self.assertNotEqual(options, options2)
options2.num_quantiles = 50
self.assertNotEqual(options, options2)
options2.num_quantiles = 30
self.assertEqual(options, options2)

def test_json_encode(self):
option = HistogramOption(is_enabled=False, bin_count_or_method="doane")
option = HistogramAndQuantilesOption(
is_enabled=False, bin_count_or_method="doane"
)

serialized = json.dumps(option, cls=ProfileEncoder)

expected = {
"class": "HistogramOption",
"class": "HistogramAndQuantilesOption",
"data": {
"bin_count_or_method": "doane",
"num_quantiles": 1000,
"is_enabled": False,
},
}

self.assertDictEqual(expected, json.loads(serialized))

def test_json_decode_warn(self):
old_histogram = {
"class": "HistogramOption",
"data": {
"bin_count_or_method": "doane",
"is_enabled": False,
},
}

expected = HistogramAndQuantilesOption(
is_enabled=False, bin_count_or_method="doane"
)

expected_string = json.dumps(old_histogram, cls=ProfileEncoder)

expected_warning = (
"HistogramOption will be deprecated in the future. HistogramOption has been "
"mapped to HistogramAndQuantilesOption while JSON decoding. Please begin "
"utilizing the new HistogramAndQuantilesOption class."
clee1152 marked this conversation as resolved.
Show resolved Hide resolved
)

with self.assertWarnsRegex(DeprecationWarning, expected_warning):
deserialized = load_option(json.loads(expected_string))
test_utils.assert_profiles_equal(deserialized, expected)
Original file line number Diff line number Diff line change
Expand Up @@ -86,7 +86,7 @@ def test_json_encode(self):
"data": {"is_enabled": True},
},
"histogram_and_quantiles": {
"class": "HistogramOption",
"class": "HistogramAndQuantilesOption",
"data": mock.ANY,
},
"bias_correction": {
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -422,7 +422,7 @@ def test_json_encode(self):
"data": {"is_enabled": True},
},
"histogram_and_quantiles": {
"class": "HistogramOption",
"class": "HistogramAndQuantilesOption",
"data": mock.ANY,
},
"bias_correction": {
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -95,7 +95,6 @@ def test_numerical_stats_option(self, *mocks):
self.assertIsNone(profile_column["statistics"]["min"])
self.assertIsNone(profile_column["statistics"]["max"])
self.assertTrue(np.isnan(profile_column["statistics"]["variance"]))
self.assertIsNone(profile_column["statistics"]["quantiles"])
self.assertTrue(np.isnan(profile_column["statistics"]["skewness"]))
self.assertTrue(np.isnan(profile_column["statistics"]["kurtosis"]))

Expand Down Expand Up @@ -242,7 +241,6 @@ def test_disabling_all_stats(self, *mocks):
self.assertIsNone(profile_column["statistics"]["min"])
self.assertIsNone(profile_column["statistics"]["max"])
self.assertTrue(np.isnan(profile_column["statistics"]["variance"]))
self.assertIsNone(profile_column["statistics"]["quantiles"])
self.assertTrue(profile_column["statistics"]["skewness"] is np.nan)
self.assertTrue(profile_column["statistics"]["kurtosis"] is np.nan)
self.assertTrue(
Expand Down
Loading