Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Implemented options for num_quantiles feature in NumericStatsMixin #896

Closed
wants to merge 15 commits into from
Closed
Show file tree
Hide file tree
Changes from 10 commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
5 changes: 3 additions & 2 deletions dataprofiler/profilers/numerical_column_stats.py
Original file line number Diff line number Diff line change
Expand Up @@ -82,9 +82,11 @@ def __init__(self, options: NumericalOptions = None) -> None:
self._mode_is_enabled: bool = True
self.num_zeros: int = 0
self.num_negatives: int = 0
self.num_quantiles: int = 1000 # By default, we use 1000 quantiles
if options:
self.bias_correction = options.bias_correction.is_enabled
self._top_k_modes = options.mode.top_k_modes
self.num_quantiles = options.num_quantiles.num_quantiles
self._median_is_enabled = options.median.is_enabled
self._median_abs_dev_is_enabled = options.median_abs_deviation.is_enabled
self._mode_is_enabled = options.mode.is_enabled
Expand All @@ -111,9 +113,8 @@ def __init__(self, options: NumericalOptions = None) -> None:
"suggested_bin_count": self.min_histogram_bin,
"histogram": {"bin_counts": None, "bin_edges": None},
}
num_quantiles: int = 1000 # TODO: add to options
self.quantiles: list[float] | dict = {
bin_num: None for bin_num in range(num_quantiles - 1)
bin_num: None for bin_num in range(self.num_quantiles - 1)
}
self.__calculations = {
"min": NumericStatsMixin._get_min,
Expand Down
100 changes: 59 additions & 41 deletions dataprofiler/profilers/profile_builder.py
Original file line number Diff line number Diff line change
Expand Up @@ -1582,6 +1582,15 @@ def _add_error_checks( # type: ignore[override]
"Attempting to merge two profiles with unique row "
"count option enabled on one profile but not the other."
)
# Check null_count options
if (
self.options.row_statistics.null_count.is_enabled
!= other.options.row_statistics.null_count.is_enabled
):
raise ValueError(
"Attempting to merge two profiles with null row "
"count option enabled on one profile but not the other."
)
# Check hashing_method options
if (
self.options.row_statistics.unique_count.hashing_method
Expand Down Expand Up @@ -1967,7 +1976,10 @@ def _get_unique_row_ratio(self) -> float | None:

def _get_row_is_null_ratio(self) -> float | None:
"""Return whether row is null ratio."""
if not self.options.row_statistics.is_enabled:
if (
not self.options.row_statistics.is_enabled
or not self.options.row_statistics.null_count.is_enabled
):
return None

if self._min_col_samples_used:
Expand All @@ -1976,7 +1988,10 @@ def _get_row_is_null_ratio(self) -> float | None:

def _get_row_has_null_ratio(self) -> float | None:
"""Return whether row has null ratio."""
if not self.options.row_statistics.is_enabled:
if (
not self.options.row_statistics.is_enabled
or not self.options.row_statistics.null_count.is_enabled
):
return None

if self._min_col_samples_used:
Expand Down Expand Up @@ -2051,48 +2066,51 @@ def _update_row_statistics(
self.hashed_row_object.add(record)

# Calculate Null Column Count
null_rows = set()
null_in_row_count = set()
first_col_flag = True
for column in self._profile:
null_type_dict = column.null_types_index
null_row_indices = set()
if null_type_dict:
null_row_indices = set.union(*null_type_dict.values())

# If sample ids provided, only consider nulls in rows that
# were fully sampled
if sample_ids is not None:
# This is the amount (integer) indices were shifted by in the
# event of overlap
shift = column._index_shift
if shift is None:
# Shift is None if index is str or if no overlap detected
null_row_indices = null_row_indices.intersection(
data.index[sample_ids[: self._min_sampled_from_batch]]
)
if self.options.row_statistics.null_count.is_enabled:
null_rows = set()
null_in_row_count = set()
first_col_flag = True
for column in self._profile:
null_type_dict = column.null_types_index
null_row_indices = set()
if null_type_dict:
null_row_indices = set.union(*null_type_dict.values())

# If sample ids provided, only consider nulls in rows that
# were fully sampled
if sample_ids is not None:
# This is the amount (integer) indices were shifted by in the
# event of overlap
shift = column._index_shift
if shift is None:
# Shift is None if index is str or if no overlap detected
null_row_indices = null_row_indices.intersection(
data.index[sample_ids[: self._min_sampled_from_batch]]
)
else:
# Only shift if index shift detected (must be ints)
null_row_indices = null_row_indices.intersection(
data.index[sample_ids[: self._min_sampled_from_batch]]
+ shift
)

# Find the common null indices between the columns
if first_col_flag:
null_rows = null_row_indices
null_in_row_count = null_row_indices
first_col_flag = False
else:
# Only shift if index shift detected (must be ints)
null_row_indices = null_row_indices.intersection(
data.index[sample_ids[: self._min_sampled_from_batch]] + shift
)
null_rows = null_rows.intersection(null_row_indices)
null_in_row_count = null_in_row_count.union(null_row_indices)

# Find the common null indices between the columns
if first_col_flag:
null_rows = null_row_indices
null_in_row_count = null_row_indices
first_col_flag = False
# If sample_ids provided,
# increment since that means only new data read
if sample_ids is not None:
self.row_has_null_count += len(null_in_row_count)
self.row_is_null_count += len(null_rows)
else:
null_rows = null_rows.intersection(null_row_indices)
null_in_row_count = null_in_row_count.union(null_row_indices)

# If sample_ids provided, increment since that means only new data read
if sample_ids is not None:
self.row_has_null_count += len(null_in_row_count)
self.row_is_null_count += len(null_rows)
else:
self.row_has_null_count = len(null_in_row_count)
self.row_is_null_count = len(null_rows)
self.row_has_null_count = len(null_in_row_count)
self.row_is_null_count = len(null_rows)

def _get_correlation(
self, clean_samples: dict, batch_properties: dict
Expand Down
94 changes: 90 additions & 4 deletions dataprofiler/profilers/profiler_options.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,7 @@
import copy
import re
import warnings
from typing import Any

from ..labelers.base_data_labeler import BaseDataLabeler

Expand Down Expand Up @@ -268,6 +269,41 @@ def _validate_helper(self, variable_path: str = "ModeOption") -> list[str]:
return errors


class NumQuantilesOption(BooleanOption):
clee1152 marked this conversation as resolved.
Show resolved Hide resolved
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I think we should call this QuantilesOption instead. That way it is extensible to more than just the number of quantiles.

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Yeah I agree -- more extensible for future development

"""For setting number of quantile options."""

def __init__(
    self,
    is_enabled: bool = True,
    num_quantiles: int = 1000,
) -> None:
    """
    Set up the quantile-count option.

    The requested quantile count is stored on the instance first, then the
    enabled flag is handed to the parent option initializer.

    :ivar is_enabled: boolean option to enable/disable the option.
    :vartype is_enabled: bool
    :ivar num_quantiles: the number of quantiles to bin the data
        (1000 unless overridden).
    :vartype num_quantiles: int
    """
    # Keep the original ordering: own attribute first, parent init second.
    self.num_quantiles = num_quantiles
    super().__init__(is_enabled=is_enabled)

def _validate_helper(self, variable_path: str = "NumQuantilesOption") -> list[str]:
    """
    Validate the options do not conflict and cause errors.

    ``num_quantiles`` must be an ``int`` greater than or equal to 1.
    ``bool`` values are rejected even though ``bool`` subclasses ``int``,
    and ``None`` is rejected because the value is consumed as a number
    (the numeric stats mixin builds ``range(num_quantiles - 1)`` from it).

    :param variable_path: current path to variable set.
    :type variable_path: str
    :return: list of errors (if raise_error is false)
    :rtype: list(str)
    """
    errors = super()._validate_helper(variable_path=variable_path)

    value = self.num_quantiles
    # isinstance(True, int) is True in Python, so bool must be excluded
    # explicitly; otherwise True/False would pass as 1/0.
    is_positive_int = (
        isinstance(value, int) and not isinstance(value, bool) and value >= 1
    )
    if not is_positive_int:
        errors.append(f"{variable_path}.num_quantiles must be a positive integer.")
    return errors


class BaseInspectorOptions(BooleanOption):
"""For setting Base options."""

Expand Down Expand Up @@ -351,6 +387,9 @@ def __init__(self) -> None:
:vartype num_zeros: BooleanOption
:ivar num_negatives: boolean option to enable/disable num_negatives
:vartype num_negatives: BooleanOption
:ivar num_quantiles: boolean option to enable/disable num_quantiles
clee1152 marked this conversation as resolved.
Show resolved Hide resolved
and set the number of quantiles
:vartype num_quantiles: NumQuantilesOption
:ivar is_numeric_stats_enabled: boolean to enable/disable all numeric
stats
:vartype is_numeric_stats_enabled: bool
Expand All @@ -366,6 +405,7 @@ def __init__(self) -> None:
self.median_abs_deviation = BooleanOption(is_enabled=True)
self.num_zeros = BooleanOption(is_enabled=True)
self.num_negatives = BooleanOption(is_enabled=True)
self.num_quantiles = NumQuantilesOption(is_enabled=True)
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I'm wondering if this should be in NumericalOptions? We already have
self.histogram_and_quantiles = HistogramOption().

Should we rename HistogramOption and insert it in that class?

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I think this makes sense. Does this mean we will not go with your previous suggestion of renaming NumQuantilesOption to QuantileOptions and instead just insert it into HistogramOption?

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Fair question. @taylorfturner may have more thoughts, but if we are keeping `histogram_and_quantiles`, then we should at least rename `HistogramOption` -> `HistogramAndQuantilesOption`. We could then just move the `num_quantiles` setting over into that class.

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

That sounds good to me! Will see what @taylorfturner thinks, then implement.

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Yes, @clee1152 and I just talked about this -- I'm game for `self.histogram_and_quantiles = HistogramAndQuantilesOption()`, where `self.num_quantiles` is set in class `HistogramAndQuantilesOption`. @JGSweets

self.histogram_and_quantiles = HistogramOption()
# By default, we correct for bias
self.bias_correction = BooleanOption(is_enabled=True)
Expand Down Expand Up @@ -395,6 +435,7 @@ def is_numeric_stats_enabled(self) -> bool:
or self.histogram_and_quantiles.is_enabled
or self.num_zeros.is_enabled
or self.num_negatives.is_enabled
or self.num_quantiles.is_enabled
):
return True
return False
Expand Down Expand Up @@ -424,6 +465,7 @@ def is_numeric_stats_enabled(self, value: bool) -> None:
self.num_zeros.is_enabled = value
self.num_negatives.is_enabled = value
self.histogram_and_quantiles.is_enabled = value
self.num_quantiles.is_enabled = value

@property
def properties(self) -> dict[str, BooleanOption]:
Expand Down Expand Up @@ -463,6 +505,7 @@ def _validate_helper(self, variable_path: str = "NumericalOptions") -> list[str]
"bias_correction",
"num_zeros",
"num_negatives",
"num_quantiles",
]:
if not isinstance(self.properties[item], BooleanOption):
errors.append(f"{variable_path}.{item} must be a BooleanOption.")
Expand Down Expand Up @@ -568,6 +611,9 @@ def __init__(self) -> None:
:vartype num_zeros: BooleanOption
:ivar num_negatives: boolean option to enable/disable num_negatives
:vartype num_negatives: BooleanOption
:ivar num_quantiles: boolean option to enable/disable num_quantiles
and set the number of quantiles
:vartype num_quantiles: NumQuantilesOption
:ivar is_numeric_stats_enabled: boolean to enable/disable all numeric
stats
:vartype is_numeric_stats_enabled: bool
Expand Down Expand Up @@ -665,6 +711,9 @@ def __init__(self) -> None:
:vartype num_zeros: BooleanOption
:ivar num_negatives: boolean option to enable/disable num_negatives
:vartype num_negatives: BooleanOption
:ivar num_quantiles: boolean option to enable/disable num_quantiles
and set the number of quantiles
:vartype num_quantiles: NumQuantilesOption
:ivar is_numeric_stats_enabled: boolean to enable/disable all numeric
stats
:vartype is_numeric_stats_enabled: bool
Expand Down Expand Up @@ -724,6 +773,9 @@ def __init__(self) -> None:
:vartype num_zeros: BooleanOption
:ivar num_negatives: boolean option to enable/disable num_negatives
:vartype num_negatives: BooleanOption
:ivar num_quantiles: boolean option to enable/disable num_quantiles
and set the number of quantiles
:vartype num_quantiles: NumQuantilesOption
:ivar is_numeric_stats_enabled: boolean to enable/disable all numeric
stats
:vartype is_numeric_stats_enabled: bool
Expand Down Expand Up @@ -787,6 +839,7 @@ def is_numeric_stats_enabled(self) -> bool:
or self.median.is_enabled
or self.median_abs_deviation.is_enabled
or self.histogram_and_quantiles.is_enabled
or self.num_quantiles.is_enabled
):
return True
return False
Expand All @@ -813,6 +866,7 @@ def is_numeric_stats_enabled(self, value: bool) -> None:
self.kurtosis.is_enabled = value
self.median_abs_deviation.is_enabled = value
self.histogram_and_quantiles.is_enabled = value
self.num_quantiles.is_enabled = value


class DateTimeOptions(BaseInspectorOptions):
Expand Down Expand Up @@ -1070,15 +1124,25 @@ def _validate_helper(self, variable_path: str = "UniqueCountOptions") -> list[st
class RowStatisticsOptions(BooleanOption):
    """For configuring options for row statistics."""

    def __init__(
        self,
        is_enabled: bool = True,
        unique_count: bool = True,
        null_count: bool = True,
    ) -> None:
        """
        Initialize options for row statistics.

        :param is_enabled: enable/disable row statistics as a whole.
        :type is_enabled: bool
        :param unique_count: initial enabled state of the unique_count option.
        :type unique_count: bool
        :param null_count: initial enabled state of the null_count option.
        :type null_count: bool
        :ivar is_enabled: boolean option to enable/disable.
        :vartype is_enabled: bool
        :ivar unique_count: options for the unique row count
        :vartype unique_count: UniqueCountOptions
        :ivar null_count: boolean option to enable/disable null_count
        :vartype null_count: BooleanOption
        """
        BooleanOption.__init__(self, is_enabled=is_enabled)
        self.unique_count = UniqueCountOptions(is_enabled=unique_count)
        self.null_count = BooleanOption(is_enabled=null_count)

    def _validate_helper(
        self, variable_path: str = "RowStatisticsOptions"
    ) -> list[str]:
        """
        Validate the options do not conflict and cause errors.

        :param variable_path: current path to variable set.
        :type variable_path: str
        :return: list of errors (if raise_error is false)
        :rtype: list(str)
        """
        errors = super()._validate_helper(variable_path=variable_path)

        # Only recurse into a sub-option once its type is confirmed; calling
        # _validate_helper on an arbitrary object (e.g. a plain bool assigned
        # by mistake) would raise AttributeError instead of reporting an error.
        if isinstance(self.unique_count, UniqueCountOptions):
            errors += self.unique_count._validate_helper(
                variable_path + ".unique_count"
            )
        else:
            errors.append(
                f"{variable_path}.unique_count must be an UniqueCountOptions."
            )

        if isinstance(self.null_count, BooleanOption):
            errors += self.null_count._validate_helper(variable_path + ".null_count")
        else:
            errors.append(f"{variable_path}.null_count must be an BooleanOption.")

        # Return the accumulated list. Re-running the parent validation here
        # would drop every unique_count / null_count error collected above.
        return errors


Expand Down Expand Up @@ -1557,7 +1626,8 @@ def __init__(self, presets: str = None) -> None:
:ivar unstructured_options: option set for unstructured dataset profiling.
:vartype unstructured_options: UnstructuredOptions
:ivar presets: A pre-configured mapping of a string name to group of options:
"complete", "data_types", and "numeric_stats_disabled". Default: None
"complete", "data_types", "numeric_stats_disabled",
and "lower_memory_sketching". Default: None
:vartype presets: Optional[str]
"""
self.structured_options = StructuredOptions()
Expand All @@ -1570,6 +1640,10 @@ def __init__(self, presets: str = None) -> None:
self._data_types_presets()
elif self.presets == "numeric_stats_disabled":
self._numeric_stats_disabled_presets()
elif self.presets == "lower_memory_sketching":
self._lower_memory_sketching_presets()
else:
raise ValueError("The preset entered is not a valid preset.")

def _complete_presets(self) -> None:
self.set({"*.is_enabled": True})
Expand All @@ -1583,6 +1657,18 @@ def _numeric_stats_disabled_presets(self) -> None:
self.set({"*.float.is_numeric_stats_enabled": False})
self.set({"structured_options.text.is_numeric_stats_enabled": False})

def _lower_memory_sketching_presets(self) -> None:
    """
    Apply the "lower_memory_sketching" preset.

    Issues one ``self.set`` call per option, in this order: the unique-count
    hashing method is set to "hll", then the categorical stop-condition
    sample size is set to 5000 and its unique-value ratio to 0.5.
    """
    preset_values = {
        "row_statistics.unique_count.hashing_method": "hll",
        "structured_options.category.max_sample_size_to_check_stop_condition": 5000,
        "structured_options.category.stop_condition_unique_value_ratio": 0.5,
    }
    # dicts preserve insertion order, so options are applied one at a time
    # in exactly the sequence listed above.
    for option_path, option_value in preset_values.items():
        self.set({option_path: option_value})

def _validate_helper(self, variable_path: str = "ProfilerOptions") -> list[str]:
"""
Validate the options do not conflict and cause errors.
Expand Down Expand Up @@ -1620,7 +1706,7 @@ def _validate_helper(self, variable_path: str = "ProfilerOptions") -> list[str]:

return errors

def set(self, options: dict[str, bool]) -> None:
def set(self, options: dict[str, Any]) -> None:
"""
Overwrite BaseOption.set.

Expand Down
Loading