capitalone · clee1152 · Jun 13, 2023 · Jun 14, 2023 · Jun 15, 2023 · Jun 15, 2023
@@ -82,9 +82,11 @@ def __init__(self, options: NumericalOptions = None) -> None:
         self._mode_is_enabled: bool = True
         self.num_zeros: int = 0
         self.num_negatives: int = 0
+        self.num_quantiles: int = 1000  # By default, we use 1000 quantiles
         if options:
             self.bias_correction = options.bias_correction.is_enabled
             self._top_k_modes = options.mode.top_k_modes
+            self.num_quantiles = options.num_quantiles.num_quantiles
             self._median_is_enabled = options.median.is_enabled
             self._median_abs_dev_is_enabled = options.median_abs_deviation.is_enabled
             self._mode_is_enabled = options.mode.is_enabled
@@ -111,9 +113,8 @@ def __init__(self, options: NumericalOptions = None) -> None:
                 "suggested_bin_count": self.min_histogram_bin,
                 "histogram": {"bin_counts": None, "bin_edges": None},
             }
-        num_quantiles: int = 1000  # TODO: add to options
         self.quantiles: list[float] | dict = {
-            bin_num: None for bin_num in range(num_quantiles - 1)
+            bin_num: None for bin_num in range(self.num_quantiles - 1)
         }
         self.__calculations = {
             "min": NumericStatsMixin._get_min,

@@ -1582,6 +1582,15 @@ def _add_error_checks(  # type: ignore[override]
                 "Attempting to merge two profiles with unique row "
                 "count option enabled on one profile but not the other."
             )
+        # Check null_count options
+        if (
+            self.options.row_statistics.null_count.is_enabled
+            != other.options.row_statistics.null_count.is_enabled
+        ):
+            raise ValueError(
+                "Attempting to merge two profiles with null row "
+                "count option enabled on one profile but not the other."
+            )
         # Check hashing_method options
         if (
             self.options.row_statistics.unique_count.hashing_method
@@ -1967,7 +1976,10 @@ def _get_unique_row_ratio(self) -> float | None:
 
     def _get_row_is_null_ratio(self) -> float | None:
         """Return whether row is null ratio."""
-        if not self.options.row_statistics.is_enabled:
+        if (
+            not self.options.row_statistics.is_enabled
+            or not self.options.row_statistics.null_count.is_enabled
+        ):
             return None
 
         if self._min_col_samples_used:
@@ -1976,7 +1988,10 @@ def _get_row_is_null_ratio(self) -> float | None:
 
     def _get_row_has_null_ratio(self) -> float | None:
         """Return whether row has null ratio."""
-        if not self.options.row_statistics.is_enabled:
+        if (
+            not self.options.row_statistics.is_enabled
+            or not self.options.row_statistics.null_count.is_enabled
+        ):
             return None
 
         if self._min_col_samples_used:
@@ -2051,48 +2066,51 @@ def _update_row_statistics(
                         self.hashed_row_object.add(record)
 
         # Calculate Null Column Count
-        null_rows = set()
-        null_in_row_count = set()
-        first_col_flag = True
-        for column in self._profile:
-            null_type_dict = column.null_types_index
-            null_row_indices = set()
-            if null_type_dict:
-                null_row_indices = set.union(*null_type_dict.values())
-
-            # If sample ids provided, only consider nulls in rows that
-            # were fully sampled
-            if sample_ids is not None:
-                # This is the amount (integer) indices were shifted by in the
-                # event of overlap
-                shift = column._index_shift
-                if shift is None:
-                    # Shift is None if index is str or if no overlap detected
-                    null_row_indices = null_row_indices.intersection(
-                        data.index[sample_ids[: self._min_sampled_from_batch]]
-                    )
+        if self.options.row_statistics.null_count.is_enabled:
+            null_rows = set()
+            null_in_row_count = set()
+            first_col_flag = True
+            for column in self._profile:
+                null_type_dict = column.null_types_index
+                null_row_indices = set()
+                if null_type_dict:
+                    null_row_indices = set.union(*null_type_dict.values())
+
+                # If sample ids provided, only consider nulls in rows that
+                # were fully sampled
+                if sample_ids is not None:
+                    # This is the amount (integer) indices were shifted by in the
+                    # event of overlap
+                    shift = column._index_shift
+                    if shift is None:
+                        # Shift is None if index is str or if no overlap detected
+                        null_row_indices = null_row_indices.intersection(
+                            data.index[sample_ids[: self._min_sampled_from_batch]]
+                        )
+                    else:
+                        # Only shift if index shift detected (must be ints)
+                        null_row_indices = null_row_indices.intersection(
+                            data.index[sample_ids[: self._min_sampled_from_batch]]
+                            + shift
+                        )
+
+                # Find the common null indices between the columns
+                if first_col_flag:
+                    null_rows = null_row_indices
+                    null_in_row_count = null_row_indices
+                    first_col_flag = False
                 else:
-                    # Only shift if index shift detected (must be ints)
-                    null_row_indices = null_row_indices.intersection(
-                        data.index[sample_ids[: self._min_sampled_from_batch]] + shift
-                    )
+                    null_rows = null_rows.intersection(null_row_indices)
+                    null_in_row_count = null_in_row_count.union(null_row_indices)
 
-            # Find the common null indices between the columns
-            if first_col_flag:
-                null_rows = null_row_indices
-                null_in_row_count = null_row_indices
-                first_col_flag = False
+            # If sample_ids provided,
+            # increment since that means only new data read
+            if sample_ids is not None:
+                self.row_has_null_count += len(null_in_row_count)
+                self.row_is_null_count += len(null_rows)
             else:
-                null_rows = null_rows.intersection(null_row_indices)
-                null_in_row_count = null_in_row_count.union(null_row_indices)
-
-        # If sample_ids provided, increment since that means only new data read
-        if sample_ids is not None:
-            self.row_has_null_count += len(null_in_row_count)
-            self.row_is_null_count += len(null_rows)
-        else:
-            self.row_has_null_count = len(null_in_row_count)
-            self.row_is_null_count = len(null_rows)
+                self.row_has_null_count = len(null_in_row_count)
+                self.row_is_null_count = len(null_rows)
 
     def _get_correlation(
         self, clean_samples: dict, batch_properties: dict

@@ -6,6 +6,7 @@
 import copy
 import re
 import warnings
+from typing import Any
 
 from ..labelers.base_data_labeler import BaseDataLabeler
 
@@ -268,6 +269,41 @@ def _validate_helper(self, variable_path: str = "ModeOption") -> list[str]:
         return errors
 
 
+class NumQuantilesOption(BooleanOption):
+    """For setting the number of quantiles."""
+
+    def __init__(self, is_enabled: bool = True, num_quantiles: int = 1000) -> None:
+        """
+        Initialize options for number of quantiles.
+
+        :ivar is_enabled: boolean option to enable/disable the option.
+        :vartype is_enabled: bool
+        :ivar num_quantiles: the number of quantiles to bin the data.
+        :vartype num_quantiles: int
+        """
+        self.num_quantiles = num_quantiles
+        super().__init__(is_enabled=is_enabled)
+
+    def _validate_helper(self, variable_path: str = "NumQuantilesOption") -> list[str]:
+        """
+        Validate the options do not conflict and cause errors.
+
+        :param variable_path: current path to variable set.
+        :type variable_path: str
+        :return: list of errors (if raise_error is false)
+        :rtype: list(str)
+        """
+        errors = super()._validate_helper(variable_path=variable_path)
+
+        value = self.num_quantiles
+        # bool is a subclass of int, so reject it explicitly: otherwise True
+        # would validate as a count of 1. None is still accepted unchanged.
+        is_count = isinstance(value, int) and not isinstance(value, bool)
+        if value is not None and not (is_count and value >= 1):
+            errors.append(f"{variable_path}.num_quantiles must be a positive integer.")
+        return errors
+
+
 class BaseInspectorOptions(BooleanOption):
     """For setting Base options."""
 
@@ -351,6 +387,9 @@ def __init__(self) -> None:
         :vartype num_zeros: BooleanOption
         :ivar num_negatives: boolean option to enable/disable num_negatives
         :vartype num_negatives: BooleanOption
+        :ivar num_quantiles: boolean option to enable/disable num_quantiles
+            and set the number of quantiles
+        :vartype num_quantiles: NumQuantilesOption
         :ivar is_numeric_stats_enabled: boolean to enable/disable all numeric
             stats
         :vartype is_numeric_stats_enabled: bool
@@ -366,6 +405,7 @@ def __init__(self) -> None:
         self.median_abs_deviation = BooleanOption(is_enabled=True)
         self.num_zeros = BooleanOption(is_enabled=True)
         self.num_negatives = BooleanOption(is_enabled=True)
+        self.num_quantiles = NumQuantilesOption(is_enabled=True)
         self.histogram_and_quantiles = HistogramOption()
         # By default, we correct for bias
         self.bias_correction = BooleanOption(is_enabled=True)
@@ -395,6 +435,7 @@ def is_numeric_stats_enabled(self) -> bool:
             or self.histogram_and_quantiles.is_enabled
             or self.num_zeros.is_enabled
             or self.num_negatives.is_enabled
+            or self.num_quantiles.is_enabled
         ):
             return True
         return False
@@ -424,6 +465,7 @@ def is_numeric_stats_enabled(self, value: bool) -> None:
         self.num_zeros.is_enabled = value
         self.num_negatives.is_enabled = value
         self.histogram_and_quantiles.is_enabled = value
+        self.num_quantiles.is_enabled = value
 
     @property
     def properties(self) -> dict[str, BooleanOption]:
@@ -463,6 +505,7 @@ def _validate_helper(self, variable_path: str = "NumericalOptions") -> list[str]
             "bias_correction",
             "num_zeros",
             "num_negatives",
+            "num_quantiles",
         ]:
             if not isinstance(self.properties[item], BooleanOption):
                 errors.append(f"{variable_path}.{item} must be a BooleanOption.")
@@ -568,6 +611,9 @@ def __init__(self) -> None:
         :vartype num_zeros: BooleanOption
         :ivar num_negatives: boolean option to enable/disable num_negatives
         :vartype num_negatives: BooleanOption
+        :ivar num_quantiles: boolean option to enable/disable num_quantiles
+            and set the number of quantiles
+        :vartype num_quantiles: NumQuantilesOption
         :ivar is_numeric_stats_enabled: boolean to enable/disable all numeric
             stats
         :vartype is_numeric_stats_enabled: bool
@@ -665,6 +711,9 @@ def __init__(self) -> None:
         :vartype num_zeros: BooleanOption
         :ivar num_negatives: boolean option to enable/disable num_negatives
         :vartype num_negatives: BooleanOption
+        :ivar num_quantiles: boolean option to enable/disable num_quantiles
+            and set the number of quantiles
+        :vartype num_quantiles: NumQuantilesOption
         :ivar is_numeric_stats_enabled: boolean to enable/disable all numeric
             stats
         :vartype is_numeric_stats_enabled: bool
@@ -724,6 +773,9 @@ def __init__(self) -> None:
         :vartype num_zeros: BooleanOption
         :ivar num_negatives: boolean option to enable/disable num_negatives
         :vartype num_negatives: BooleanOption
+        :ivar num_quantiles: boolean option to enable/disable num_quantiles
+            and set the number of quantiles
+        :vartype num_quantiles: NumQuantilesOption
         :ivar is_numeric_stats_enabled: boolean to enable/disable all numeric
             stats
         :vartype is_numeric_stats_enabled: bool
@@ -787,6 +839,7 @@ def is_numeric_stats_enabled(self) -> bool:
             or self.median.is_enabled
             or self.median_abs_deviation.is_enabled
             or self.histogram_and_quantiles.is_enabled
+            or self.num_quantiles.is_enabled
         ):
             return True
         return False
@@ -813,6 +866,7 @@ def is_numeric_stats_enabled(self, value: bool) -> None:
         self.kurtosis.is_enabled = value
         self.median_abs_deviation.is_enabled = value
         self.histogram_and_quantiles.is_enabled = value
+        self.num_quantiles.is_enabled = value
 
 
 class DateTimeOptions(BaseInspectorOptions):
@@ -1070,15 +1124,25 @@ def _validate_helper(self, variable_path: str = "UniqueCountOptions") -> list[st
 class RowStatisticsOptions(BooleanOption):
     """For configuring options for row statistics."""
 
-    def __init__(self, is_enabled: bool = True, unique_count: bool = True) -> None:
+    def __init__(
+        self,
+        is_enabled: bool = True,
+        unique_count: bool = True,
+        null_count: bool = True,
+    ) -> None:
         """
         Initialize options for row statistics.
 
         :ivar is_enabled: boolean option to enable/disable.
         :vartype is_enabled: bool
+        :ivar unique_count: boolean option to enable/disable unique_count
+        :vartype unique_count: bool
+        :ivar null_count: boolean option to enable/disable null_count
+        :vartype null_count: bool
         """
         BooleanOption.__init__(self, is_enabled=is_enabled)
         self.unique_count = UniqueCountOptions(is_enabled=unique_count)
+        self.null_count = BooleanOption(is_enabled=null_count)
 
     def _validate_helper(
         self, variable_path: str = "RowStatisticsOptions"
@@ -1094,9 +1158,14 @@ def _validate_helper(
         errors = super()._validate_helper(variable_path=variable_path)
         if not isinstance(self.unique_count, UniqueCountOptions):
             errors.append(
-                f"{variable_path}.full_hashing must be an UniqueCountOptions."
+                f"{variable_path}.unique_count must be an UniqueCountOptions."
             )
+
+        if not isinstance(self.null_count, BooleanOption):
+            errors.append(f"{variable_path}.null_count must be an BooleanOption.")
+        # Nested errors are appended too; the accumulated list is what gets returned.
-        errors += self.unique_count._validate_helper(variable_path + ".unique_counts")
+        errors += self.unique_count._validate_helper(variable_path + ".unique_count")
+        errors += self.null_count._validate_helper(variable_path + ".null_count")
-        return super()._validate_helper(variable_path)
+        return errors
 
 
@@ -1557,7 +1626,8 @@ def __init__(self, presets: str = None) -> None:
         :ivar unstructured_options: option set for unstructured dataset profiling.
         :vartype unstructured_options: UnstructuredOptions
         :ivar presets: A pre-configured mapping of a string name to group of options:
-            "complete", "data_types", and "numeric_stats_disabled". Default: None
+            "complete", "data_types", "numeric_stats_disabled",
+            and "lower_memory_sketching". Default: None
         :vartype presets: Optional[str]
         """
         self.structured_options = StructuredOptions()
@@ -1570,6 +1640,10 @@ def __init__(self, presets: str = None) -> None:
                 self._data_types_presets()
             elif self.presets == "numeric_stats_disabled":
                 self._numeric_stats_disabled_presets()
+            elif self.presets == "lower_memory_sketching":
+                self._lower_memory_sketching_presets()
+            else:
+                raise ValueError("The preset entered is not a valid preset.")
 
     def _complete_presets(self) -> None:
         self.set({"*.is_enabled": True})
@@ -1583,6 +1657,18 @@ def _numeric_stats_disabled_presets(self) -> None:
         self.set({"*.float.is_numeric_stats_enabled": False})
         self.set({"structured_options.text.is_numeric_stats_enabled": False})
 
+    def _lower_memory_sketching_presets(self) -> None:
+        """Apply the "lower_memory_sketching" preset values via ``self.set``."""
+        max_sample_key = (
+            "structured_options.category.max_sample_size_to_check_stop_condition"
+        )
+        ratio_key = "structured_options.category.stop_condition_unique_value_ratio"
+        hashing_key = "row_statistics.unique_count.hashing_method"
+        preset = ((hashing_key, "hll"), (max_sample_key, 5000), (ratio_key, 0.5))
+        # Each option gets its own set() call, applied one after another.
+        for option_path, option_value in preset:
+            self.set({option_path: option_value})
+
     def _validate_helper(self, variable_path: str = "ProfilerOptions") -> list[str]:
         """
         Validate the options do not conflict and cause errors.
@@ -1620,7 +1706,7 @@ def _validate_helper(self, variable_path: str = "ProfilerOptions") -> list[str]:
 
         return errors
 
-    def set(self, options: dict[str, bool]) -> None:
+    def set(self, options: dict[str, Any]) -> None:
         """
         Overwrite BaseOption.set.