
rebase staging/seed-validation onto dev #967

Closed
2 changes: 2 additions & 0 deletions .github/workflows/publish-python-package.yml
@@ -7,6 +7,8 @@ name: Publish Python Package
 on:
   release:
     types: [created]
+    branches:
+      - 'release/*'
 
 jobs:
   deploy:
13 changes: 3 additions & 10 deletions dataprofiler/data_readers/data_utils.py
@@ -1,7 +1,5 @@
 """Contains functions for data readers."""
 import json
-import os
-import random
 import re
 import urllib
 from collections import OrderedDict
@@ -28,7 +26,7 @@
 from chardet.universaldetector import UniversalDetector
 from typing_extensions import TypeGuard
 
-from .. import dp_logging, settings
+from .. import dp_logging, rng_utils
 from .._typing import JSONType, Url
 from .filepath_or_buffer import FileOrBufferHandler, is_stream_buffer  # NOQA

@@ -315,11 +313,7 @@ def reservoir(file: TextIOWrapper, sample_nrows: int) -> list:
 
     kinv = 1 / sample_nrows
     W = 1.0
-    rng = random.Random(x=settings._seed)
-    if "DATAPROFILER_SEED" in os.environ and settings._seed is None:
-        seed = os.environ.get("DATAPROFILER_SEED")
-        if seed:
-            rng = random.Random(int(seed))
+    rng = rng_utils.get_random_number_generator()
 
     while True:
         W *= rng.random() ** kinv
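
Note: the body of rng_utils.get_random_number_generator() is not shown in this PR. Judging from the call sites (rng.random() here, rng.integers(...) below), it returns a numpy.random.Generator; a minimal sketch, assuming it centralizes the seed logic the deleted inline block used to perform:

# Hypothetical reconstruction; the real helper lives in dataprofiler/rng_utils.py
# and is not part of this diff.
import os

import numpy as np

from dataprofiler import settings


def get_random_number_generator() -> np.random.Generator:
    """Return a numpy Generator seeded from settings or the environment."""
    seed = settings._seed
    if seed is None and "DATAPROFILER_SEED" in os.environ:
        env_seed = os.environ.get("DATAPROFILER_SEED")
        if env_seed:
            seed = int(env_seed)  # mirrors the deleted inline logic above
    return np.random.default_rng(seed)
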
@@ -334,7 +328,7 @@ def reservoir(file: TextIOWrapper, sample_nrows: int) -> list:
         except StopIteration:
             break
         # Append new, replace old with dummy, and keep track of order
-        remove_index = rng.randrange(sample_nrows)
+        remove_index = rng.integers(0, sample_nrows)
         values[indices[remove_index]] = str(None)
         indices[remove_index] = len(values)
         values.append(newval)
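
For context, the surrounding loop is Li's "Algorithm L" for reservoir sampling, and Generator.integers(0, n) is the numpy counterpart of random.Random.randrange(n): both draw a uniform index in [0, n). A self-contained sketch of the scheme, simplified from the diff and not the library's code:

import math

import numpy as np


def reservoir_sample(stream, k: int, rng: np.random.Generator) -> list:
    """Keep a uniform random sample of k items from a stream of unknown length."""
    iterator = iter(stream)
    values = [next(iterator) for _ in range(k)]  # assumes the stream has >= k items
    w = 1.0
    while True:
        w *= rng.random() ** (1 / k)
        # Geometric skip: how many items to pass over before the next swap.
        skip = math.floor(math.log(rng.random()) / math.log(1 - w))
        try:
            newval = next(iterator)
            for _ in range(skip):
                newval = next(iterator)
        except StopIteration:
            break
        # Replace a uniformly chosen slot, as rng.integers(0, k) does above.
        values[rng.integers(0, k)] = newval
    return values


sample = reservoir_sample(range(10_000), 100, np.random.default_rng(0))
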
@@ -824,7 +818,6 @@ def url_to_bytes(url_as_string: Url, options: Dict) -> BytesIO:
         "Content-length" in url.headers
         and int(url.headers["Content-length"]) >= 1024**3
     ):
-
         raise ValueError(
             "The downloaded file from the url may not be " "larger than 1GB"
         )
6 changes: 3 additions & 3 deletions dataprofiler/labelers/base_model.py
@@ -32,7 +32,7 @@ def __new__(
 class BaseModel(metaclass=abc.ABCMeta):
     """For labeling data."""
 
-    _BaseModel__subclasses: dict[str, type[BaseModel]] = {}
+    __subclasses: dict[str, type[BaseModel]] = {}
     __metaclass__ = abc.ABCMeta
 
     # boolean if the label mapping requires the mapping for index 0 reserved
@@ -90,7 +90,7 @@ def __eq__(self, other: object) -> bool:
     def _register_subclass(cls) -> None:
         """Register a subclass for the class factory."""
         if not inspect.isabstract(cls):
-            cls._BaseModel__subclasses[cls.__name__.lower()] = cls
+            cls.__subclasses[cls.__name__.lower()] = cls
 
     @property
     def label_mapping(self) -> dict[str, int]:
@@ -156,7 +156,7 @@ def get_class(cls, class_name: str) -> type[BaseModel] | None:
         from .column_name_model import ColumnNameModel  # NOQA
         from .regex_model import RegexModel  # NOQA
 
-        return cls._BaseModel__subclasses.get(class_name.lower(), None)
+        return cls.__subclasses.get(class_name.lower(), None)
 
     def get_parameters(self, param_list: list[str] | None = None) -> dict:
         """
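
The _BaseModel__subclasses to __subclasses rename is behavior-preserving: inside a class body, Python name-mangles double-underscore attributes to _ClassName__attr, so the old code was spelling the mangled form out by hand. A minimal illustration with toy classes, not the library's:

class BaseModel:
    __subclasses: dict = {}  # stored on the class as _BaseModel__subclasses

    @classmethod
    def register(cls) -> None:
        # Compiled inside BaseModel, cls.__subclasses mangles to
        # cls._BaseModel__subclasses, so every subclass shares one registry.
        cls.__subclasses[cls.__name__.lower()] = cls


class RegexModel(BaseModel):
    pass


RegexModel.register()
assert BaseModel._BaseModel__subclasses == {"regexmodel": RegexModel}
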
52 changes: 28 additions & 24 deletions dataprofiler/labelers/data_processing.py
@@ -49,16 +49,14 @@ def __init__(self, **parameters: Any) -> None:
     def _register_subclass(cls) -> None:
         """Register a subclass for the class factory."""
         if not inspect.isabstract(cls):
-            cls._BaseDataProcessor__subclasses[  # type: ignore
-                cls.__name__.lower()
-            ] = cls
+            cls.__subclasses[cls.__name__.lower()] = cls
 
     @classmethod
-    def get_class(cls: type[Processor], class_name: str) -> type[Processor] | None:
+    def get_class(
+        cls: type[BaseDataProcessor], class_name: str
+    ) -> type[BaseDataProcessor] | None:
         """Get class of BaseDataProcessor object."""
-        return cls._BaseDataProcessor__subclasses.get(  # type: ignore
-            class_name.lower(), None
-        )
+        return cls.__subclasses.get(class_name.lower(), None)
 
     def __eq__(self, other: object) -> bool:
         """
@@ -129,7 +127,7 @@ def set_params(self, **kwargs: Any) -> None:
             self._parameters[param] = kwargs[param]
 
     @abc.abstractmethod
-    def process(self, *args: Any) -> Any:
+    def process(self, *args: Any, **kwargs: Any) -> Any:
         """Process data."""
         raise NotImplementedError()
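
Widening the abstract signature to (*args: Any, **kwargs: Any) is presumably what lets the overrides below drop their "# type: ignore" comments: mypy accepts any override of a base method typed (*args: Any, **kwargs: Any). A small sketch of that rule, with illustrative names not taken from this PR:

import abc
from typing import Any


class Base(abc.ABC):
    @abc.abstractmethod
    def process(self, *args: Any, **kwargs: Any) -> Any:
        """Process data."""
        raise NotImplementedError()


class Concrete(Base):
    # A narrower, fully typed override; mypy allows it without a
    # "# type: ignore" because the base accepts (*args, **kwargs).
    def process(self, data: list, batch_size: int = 32) -> list:
        return data[:batch_size]
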

@@ -169,13 +167,15 @@ def __init__(self, **parameters: Any) -> None:
         super().__init__(**parameters)
 
     @abc.abstractmethod
-    def process(  # type: ignore
+    def process(
         self,
         data: np.ndarray,
         labels: np.ndarray | None = None,
         label_mapping: dict[str, int] | None = None,
         batch_size: int = 32,
-    ) -> Generator[tuple[np.ndarray, np.ndarray] | np.ndarray, None, None]:
+    ) -> Generator[tuple[np.ndarray, np.ndarray] | np.ndarray, None, None] | tuple[
+        np.ndarray, np.ndarray
+    ] | np.ndarray:
         """Preprocess data."""
         raise NotImplementedError()

@@ -191,7 +191,7 @@ def __init__(self, **parameters: Any) -> None:
         super().__init__(**parameters)
 
     @abc.abstractmethod
-    def process(  # type: ignore
+    def process(
         self,
         data: np.ndarray,
         results: dict,
@@ -240,7 +240,7 @@ def help(cls) -> None:
         )
         print(help_str)
 
-    def process(  # type: ignore
+    def process(
         self,
         data: np.ndarray,
         labels: np.ndarray | None = None,
@@ -668,7 +668,7 @@ def gen_none() -> Generator[None, None, None]:
         if batch_data["samples"]:
             yield batch_data
 
-    def process(  # type: ignore
+    def process(
         self,
         data: np.ndarray,
         labels: np.ndarray | None = None,
@@ -735,8 +735,8 @@ def process(  # type: ignore
             X_train = np.array(
                 [[sentence] for sentence in batch_data["samples"]], dtype=object
             )
-            if labels is not None:
-                num_classes = max(label_mapping.values()) + 1  # type: ignore
+            if labels is not None and label_mapping is not None:
+                num_classes = max(label_mapping.values()) + 1
 
                 Y_train = tf.keras.utils.to_categorical(
                     batch_data["labels"], num_classes
@@ -836,7 +836,7 @@ def _validate_parameters(self, parameters: dict) -> None:
         if errors:
             raise ValueError("\n".join(errors))
 
-    def process(  # type: ignore
+    def process(
         self,
         data: np.ndarray,
         labels: np.ndarray | None = None,
@@ -1269,7 +1269,7 @@ def match_sentence_lengths(
 
         return results
 
-    def process(  # type: ignore
+    def process(
         self,
         data: np.ndarray,
         results: dict,
@@ -1439,7 +1439,7 @@ def convert_to_unstructured_format(
 
         return text, entities
 
-    def process(  # type: ignore
+    def process(
         self,
         data: np.ndarray,
         labels: np.ndarray | None = None,
@@ -1503,8 +1503,12 @@ def process(  # type: ignore
                 unstructured_label_set,
             ) = self.convert_to_unstructured_format(batch_data, batch_labels)
             unstructured_data[ind] = unstructured_text
-            if labels is not None:
-                unstructured_labels[ind] = unstructured_label_set  # type: ignore
+            if (
+                labels is not None
+                and unstructured_labels is not None
+                and unstructured_label_set is not None
+            ):
+                unstructured_labels[ind] = unstructured_label_set
 
         if labels is not None:
             np_unstruct_labels = np.array(unstructured_labels, dtype="object")
@@ -1800,7 +1804,7 @@ def convert_to_structured_analysis(
 
         return results
 
-    def process(  # type: ignore
+    def process(
         self,
         data: np.ndarray,
         results: dict,
@@ -2022,7 +2026,7 @@ def split_prediction(results: dict) -> None:
                 pred, axis=1, ord=1, keepdims=True
             )
 
-    def process(  # type: ignore
+    def process(
         self,
         data: np.ndarray,
         results: dict,
@@ -2160,7 +2164,7 @@ def _save_processor(self, dirpath: str) -> None:
         ) as fp:
             json.dump(params, fp)
 
-    def process(  # type: ignore
+    def process(
         self,
         data: np.ndarray,
         results: dict,
@@ -2253,7 +2257,7 @@ def help(cls) -> None:
         )
         print(help_str)
 
-    def process(  # type: ignore
+    def process(
         self,
         data: np.ndarray,
         results: dict,
6 changes: 3 additions & 3 deletions dataprofiler/profilers/base_column_profilers.py
@@ -11,7 +11,7 @@
 import numpy as np
 import pandas as pd
 
-from . import utils
+from . import profiler_utils
 from .profiler_options import BaseInspectorOptions, BaseOption
 
 BaseColumnProfilerT = TypeVar("BaseColumnProfilerT", bound="BaseColumnProfiler")
@@ -76,7 +76,7 @@ def _timeit(method: Callable = None, name: str = None) -> Callable:
         :param name: key argument for the times dictionary
         :type name: str
         """
-        return utils.method_timeit(method, name)
+        return profiler_utils.method_timeit(method, name)
 
     @staticmethod
     def _filter_properties_w_options(
@@ -173,7 +173,7 @@ def _add_helper(
         else:
             raise ValueError(f"Column names unmatched: {other1.name} != {other2.name}")
 
-        self.times = utils.add_nested_dictionaries(other1.times, other2.times)
+        self.times = profiler_utils.add_nested_dictionaries(other1.times, other2.times)
 
         self.sample_size = other1.sample_size + other2.sample_size
 
34 changes: 20 additions & 14 deletions dataprofiler/profilers/categorical_column_profile.py
@@ -8,7 +8,7 @@
 import datasketches
 from pandas import DataFrame, Series
 
-from . import utils
+from . import profiler_utils
 from .base_column_profilers import BaseColumnProfiler
 from .profiler_options import CategoricalOptions
 
@@ -131,7 +131,7 @@ def __add__(self, other: CategoricalColumn) -> CategoricalColumn:
         elif not self.cms and not other.cms:
             # If both profiles have not met stop condition
             if not (self._stop_condition_is_met or other._stop_condition_is_met):
-                merged_profile._categories = utils.add_nested_dictionaries(
+                merged_profile._categories = profiler_utils.add_nested_dictionaries(
                     self._categories, other._categories
                 )
 
@@ -250,21 +250,21 @@ def diff(self, other_profile: CategoricalColumn, options: dict = None) -> dict:
         # Make sure other_profile's type matches this class
         differences: dict = super().diff(other_profile, options)
 
-        differences["categorical"] = utils.find_diff_of_strings_and_bools(
+        differences["categorical"] = profiler_utils.find_diff_of_strings_and_bools(
             self.is_match, other_profile.is_match
         )
 
         differences["statistics"] = dict(
             [
                 (
                     "unique_count",
-                    utils.find_diff_of_numbers(
+                    profiler_utils.find_diff_of_numbers(
                         self.unique_count, other_profile.unique_count
                     ),
                 ),
                 (
                     "unique_ratio",
-                    utils.find_diff_of_numbers(
+                    profiler_utils.find_diff_of_numbers(
                         self.unique_ratio, other_profile.unique_ratio
                     ),
                 ),
@@ -275,19 +275,25 @@ def diff(self, other_profile: CategoricalColumn, options: dict = None) -> dict:
         if self.is_match and other_profile.is_match:
             differences["statistics"][
                 "chi2-test"
-            ] = utils.perform_chi_squared_test_for_homogeneity(
+            ] = profiler_utils.perform_chi_squared_test_for_homogeneity(
                 self._categories,
                 self.sample_size,
                 other_profile._categories,
                 other_profile.sample_size,
             )
-        differences["statistics"]["categories"] = utils.find_diff_of_lists_and_sets(
+        differences["statistics"][
+            "categories"
+        ] = profiler_utils.find_diff_of_lists_and_sets(
             self.categories, other_profile.categories
         )
-        differences["statistics"]["gini_impurity"] = utils.find_diff_of_numbers(
+        differences["statistics"][
+            "gini_impurity"
+        ] = profiler_utils.find_diff_of_numbers(
             self.gini_impurity, other_profile.gini_impurity
         )
-        differences["statistics"]["unalikeability"] = utils.find_diff_of_numbers(
+        differences["statistics"][
+            "unalikeability"
+        ] = profiler_utils.find_diff_of_numbers(
             self.unalikeability, other_profile.unalikeability
         )
         cat_count1 = dict(
@@ -299,9 +305,9 @@ def diff(self, other_profile: CategoricalColumn, options: dict = None) -> dict:
             )
         )
 
-        differences["statistics"]["categorical_count"] = utils.find_diff_of_dicts(
-            cat_count1, cat_count2
-        )
+        differences["statistics"][
+            "categorical_count"
+        ] = profiler_utils.find_diff_of_dicts(cat_count1, cat_count2)
 
         return differences
 
@@ -532,7 +538,7 @@ def _merge_categories_cms(
         for k in (x for x in heavy_hitter_dict2 if x not in heavy_hitter_dict1):
             heavy_hitter_dict1[k] = cms1.get_estimate(k)
 
-        categories = utils.add_nested_dictionaries(
+        categories = profiler_utils.add_nested_dictionaries(
             heavy_hitter_dict2, heavy_hitter_dict1
         )
 
@@ -604,7 +610,7 @@ def _update_categories(
             )
         else:
             category_count = self._get_categories_full(df_series)
-            self._categories = utils.add_nested_dictionaries(
+            self._categories = profiler_utils.add_nested_dictionaries(
                 self._categories, category_count
             )
            self._update_stop_condition(df_series)
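
profiler_utils.add_nested_dictionaries (previously utils.add_nested_dictionaries) is used throughout this file to merge category-count dictionaries. A sketch of the expected behavior, assuming it recursively sums values under shared keys; the actual implementation lives in profiler_utils and is not shown in this diff:

def add_nested_dictionaries(first: dict, second: dict) -> dict:
    """Merge two dicts, summing leaves and recursing into nested dicts."""
    merged = dict(first)
    for key, value in second.items():
        if key not in merged:
            merged[key] = value
        elif isinstance(value, dict) and isinstance(merged[key], dict):
            merged[key] = add_nested_dictionaries(merged[key], value)
        else:
            merged[key] = merged[key] + value
    return merged


# Merging two category-count profiles:
assert add_nested_dictionaries({"a": 2, "b": 1}, {"a": 3, "c": 4}) == {"a": 5, "b": 1, "c": 4}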