Skip to content

Commit

Permalink
HistogramAndQuantilesOption sync with dev branch (#987)
Browse files Browse the repository at this point in the history
* Changes to HistogramAndQuantilesOption now sync with concurrent updates to dev branch.

* Changes to scipy version, fixing comments

* Slight docstring change

* revert back -- other PR to fix

* empty

* fix
  • Loading branch information
clee1152 authored Aug 2, 2023
1 parent f28c17a commit da09c1d
Show file tree
Hide file tree
Showing 13 changed files with 353 additions and 205 deletions.
5 changes: 3 additions & 2 deletions dataprofiler/profilers/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -28,7 +28,7 @@
DataLabelerOptions,
DateTimeOptions,
FloatOptions,
HistogramOption,
HistogramAndQuantilesOption,
HyperLogLogOptions,
IntOptions,
ModeOption,
Expand Down Expand Up @@ -66,7 +66,8 @@

json_decoder._options = {
BooleanOption.__name__: BooleanOption,
HistogramOption.__name__: HistogramOption,
"HistogramOption": HistogramAndQuantilesOption,
HistogramAndQuantilesOption.__name__: HistogramAndQuantilesOption,
ModeOption.__name__: ModeOption,
BaseInspectorOptions.__name__: BaseInspectorOptions,
NumericalOptions.__name__: NumericalOptions,
Expand Down
9 changes: 9 additions & 0 deletions dataprofiler/profilers/json_decoder.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,7 @@
"""Contains methods to decode components of a Profiler."""
from __future__ import annotations

import warnings
from typing import TYPE_CHECKING

if TYPE_CHECKING:
Expand Down Expand Up @@ -72,6 +73,14 @@ def get_option_class(class_name: str) -> type[BaseOption]:
options_class: type[BaseOption] | None = _options.get(class_name)
if options_class is None:
raise ValueError(f"Invalid option class {class_name} " f"failed to load.")

if class_name == "HistogramOption":
warnings.warn(
f"{class_name} will be deprecated in the future. During the JSON encode"
" process, HistogramOption is mapped to HistogramAndQuantilesOption. "
"Please begin utilizing the new HistogramAndQuantilesOption class.",
DeprecationWarning,
)
return options_class


Expand Down
3 changes: 2 additions & 1 deletion dataprofiler/profilers/numerical_column_stats.py
Original file line number Diff line number Diff line change
Expand Up @@ -82,14 +82,15 @@ def __init__(self, options: NumericalOptions = None) -> None:
self._mode_is_enabled: bool = True
self.num_zeros: int | np.int64 = np.int64(0)
self.num_negatives: int | np.int64 = np.int64(0)
self._num_quantiles: int = 1000 # TODO: add to options
self._num_quantiles: int = 1000 # By default, we use 1000 quantiles

if options:
self.bias_correction = options.bias_correction.is_enabled
self._top_k_modes = options.mode.top_k_modes
self._median_is_enabled = options.median.is_enabled
self._median_abs_dev_is_enabled = options.median_abs_deviation.is_enabled
self._mode_is_enabled = options.mode.is_enabled
self._num_quantiles = options.histogram_and_quantiles.num_quantiles
bin_count_or_method = options.histogram_and_quantiles.bin_count_or_method
if isinstance(bin_count_or_method, str):
self.histogram_bin_method_names = [bin_count_or_method]
Expand Down
20 changes: 17 additions & 3 deletions dataprofiler/profilers/profiler_options.py
Original file line number Diff line number Diff line change
Expand Up @@ -210,13 +210,14 @@ def _validate_helper(self, variable_path: str = "BooleanOption") -> list[str]:
return errors


class HistogramOption(BooleanOption["HistogramOption"]):
class HistogramAndQuantilesOption(BooleanOption["HistogramAndQuantilesOption"]):
"""For setting histogram options."""

def __init__(
self,
is_enabled: bool = True,
bin_count_or_method: str | int | list[str] = "auto",
num_quantiles: int = 1000,
) -> None:
"""
Initialize Options for histograms and quantiles.
Expand All @@ -226,11 +227,16 @@ def __init__(
:ivar bin_count_or_method: bin count or the method with which to
calculate histograms
:vartype bin_count_or_method: Union[str, int, list(str)]
:ivar num_quantiles: number of quantiles to compute (validated as a positive integer)
:vartype num_quantiles: int
"""
self.bin_count_or_method = bin_count_or_method
self.num_quantiles = num_quantiles
super().__init__(is_enabled=is_enabled)

def _validate_helper(self, variable_path: str = "HistogramOption") -> list[str]:
def _validate_helper(
self, variable_path: str = "HistogramAndQuantilesOption"
) -> list[str]:
"""
Validate the options do not conflict and cause errors.
Expand Down Expand Up @@ -260,6 +266,12 @@ def _validate_helper(self, variable_path: str = "HistogramOption") -> list[str]:
"than 1, a string, or list of strings from the "
"following: {}.".format(variable_path, valid_methods)
)

if self.num_quantiles is not None and (
not isinstance(self.num_quantiles, int) or self.num_quantiles < 1
):
errors.append(f"{variable_path}.num_quantiles must be a positive integer.")

return errors


Expand Down Expand Up @@ -396,7 +408,9 @@ def __init__(self) -> None:
self.median_abs_deviation: BooleanOption = BooleanOption(is_enabled=True)
self.num_zeros: BooleanOption = BooleanOption(is_enabled=True)
self.num_negatives: BooleanOption = BooleanOption(is_enabled=True)
self.histogram_and_quantiles: HistogramOption = HistogramOption()
self.histogram_and_quantiles: HistogramAndQuantilesOption = (
HistogramAndQuantilesOption()
)
# By default, we correct for bias
self.bias_correction: BooleanOption = BooleanOption(is_enabled=True)
BaseInspectorOptions.__init__(self)
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -94,7 +94,7 @@ def test_json_encode(self):
"data": {"is_enabled": True},
},
"histogram_and_quantiles": {
"class": "HistogramOption",
"class": "HistogramAndQuantilesOption",
"data": mock.ANY,
},
"bias_correction": {
Expand Down
Loading

0 comments on commit da09c1d

Please sign in to comment.