
rebase staging/seed-validation onto dev #967

Closed
2 changes: 2 additions & 0 deletions .github/workflows/publish-python-package.yml
@@ -7,6 +7,8 @@ name: Publish Python Package
 on:
   release:
     types: [created]
+    branches:
+      - 'release/*'
 
 jobs:
   deploy:
13 changes: 3 additions & 10 deletions dataprofiler/data_readers/data_utils.py
@@ -1,7 +1,5 @@
 """Contains functions for data readers."""
 import json
-import os
-import random
 import re
 import urllib
 from collections import OrderedDict
@@ -28,7 +26,7 @@
 from chardet.universaldetector import UniversalDetector
 from typing_extensions import TypeGuard
 
-from .. import dp_logging, settings
+from .. import dp_logging, rng_utils
 from .._typing import JSONType, Url
 from .filepath_or_buffer import FileOrBufferHandler, is_stream_buffer  # NOQA

@@ -315,11 +313,7 @@ def reservoir(file: TextIOWrapper, sample_nrows: int) -> list:
 
     kinv = 1 / sample_nrows
     W = 1.0
-    rng = random.Random(x=settings._seed)
-    if "DATAPROFILER_SEED" in os.environ and settings._seed is None:
-        seed = os.environ.get("DATAPROFILER_SEED")
-        if seed:
-            rng = random.Random(int(seed))
+    rng = rng_utils.get_random_number_generator()
 
     while True:
         W *= rng.random() ** kinv
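
Note: the body of rng_utils.get_random_number_generator() is not shown in this PR. Judging from the call sites (rng.random() here, rng.integers(...) below), it returns a numpy.random.Generator; a minimal sketch, assuming it centralizes the seed logic the deleted inline block used to perform:

# Hypothetical reconstruction; the real helper lives in dataprofiler/rng_utils.py
# and is not part of this diff.
import os

import numpy as np

from dataprofiler import settings


def get_random_number_generator() -> np.random.Generator:
    """Return a numpy Generator seeded from settings or the environment."""
    seed = settings._seed
    if seed is None and "DATAPROFILER_SEED" in os.environ:
        env_seed = os.environ.get("DATAPROFILER_SEED")
        if env_seed:
            seed = int(env_seed)  # mirrors the deleted inline logic above
    return np.random.default_rng(seed)
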
@@ -334,7 +328,7 @@ def reservoir(file: TextIOWrapper, sample_nrows: int) -> list:
         except StopIteration:
             break
         # Append new, replace old with dummy, and keep track of order
-        remove_index = rng.randrange(sample_nrows)
+        remove_index = rng.integers(0, sample_nrows)
         values[indices[remove_index]] = str(None)
         indices[remove_index] = len(values)
         values.append(newval)
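
For context, the surrounding loop is Li's "Algorithm L" for reservoir sampling, and Generator.integers(0, n) is the numpy counterpart of random.Random.randrange(n): both draw a uniform index in [0, n). A self-contained sketch of the scheme, simplified from the diff and not the library's code:

import math

import numpy as np


def reservoir_sample(stream, k: int, rng: np.random.Generator) -> list:
    """Keep a uniform random sample of k items from a stream of unknown length."""
    iterator = iter(stream)
    values = [next(iterator) for _ in range(k)]  # assumes the stream has >= k items
    w = 1.0
    while True:
        w *= rng.random() ** (1 / k)
        # Geometric skip: how many items to pass over before the next swap.
        skip = math.floor(math.log(rng.random()) / math.log(1 - w))
        try:
            newval = next(iterator)
            for _ in range(skip):
                newval = next(iterator)
        except StopIteration:
            break
        # Replace a uniformly chosen slot, as rng.integers(0, k) does above.
        values[rng.integers(0, k)] = newval
    return values


sample = reservoir_sample(range(10_000), 100, np.random.default_rng(0))
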
@@ -824,7 +818,6 @@ def url_to_bytes(url_as_string: Url, options: Dict) -> BytesIO:
         "Content-length" in url.headers
         and int(url.headers["Content-length"]) >= 1024**3
     ):
-
         raise ValueError(
             "The downloaded file from the url may not be " "larger than 1GB"
         )
6 changes: 3 additions & 3 deletions dataprofiler/labelers/base_model.py
@@ -32,7 +32,7 @@ def __new__(
 class BaseModel(metaclass=abc.ABCMeta):
     """For labeling data."""
 
-    _BaseModel__subclasses: dict[str, type[BaseModel]] = {}
+    __subclasses: dict[str, type[BaseModel]] = {}
     __metaclass__ = abc.ABCMeta
 
     # boolean if the label mapping requires the mapping for index 0 reserved
@@ -90,7 +90,7 @@ def __eq__(self, other: object) -> bool:
     def _register_subclass(cls) -> None:
         """Register a subclass for the class factory."""
         if not inspect.isabstract(cls):
-            cls._BaseModel__subclasses[cls.__name__.lower()] = cls
+            cls.__subclasses[cls.__name__.lower()] = cls
 
     @property
     def label_mapping(self) -> dict[str, int]:
@@ -156,7 +156,7 @@ def get_class(cls, class_name: str) -> type[BaseModel] | None:
         from .column_name_model import ColumnNameModel  # NOQA
         from .regex_model import RegexModel  # NOQA
 
-        return cls._BaseModel__subclasses.get(class_name.lower(), None)
+        return cls.__subclasses.get(class_name.lower(), None)
 
     def get_parameters(self, param_list: list[str] | None = None) -> dict:
         """
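
The _BaseModel__subclasses to __subclasses rename is behavior-preserving: inside a class body, Python name-mangles double-underscore attributes to _ClassName__attr, so the old code was spelling the mangled form out by hand. A minimal illustration with toy classes, not the library's:

class BaseModel:
    __subclasses: dict = {}  # stored on the class as _BaseModel__subclasses

    @classmethod
    def register(cls) -> None:
        # Compiled inside BaseModel, cls.__subclasses mangles to
        # cls._BaseModel__subclasses, so every subclass shares one registry.
        cls.__subclasses[cls.__name__.lower()] = cls


class RegexModel(BaseModel):
    pass


RegexModel.register()
assert BaseModel._BaseModel__subclasses == {"regexmodel": RegexModel}
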
52 changes: 28 additions & 24 deletions dataprofiler/labelers/data_processing.py
@@ -49,16 +49,14 @@ def __init__(self, **parameters: Any) -> None:
     def _register_subclass(cls) -> None:
         """Register a subclass for the class factory."""
         if not inspect.isabstract(cls):
-            cls._BaseDataProcessor__subclasses[  # type: ignore
-                cls.__name__.lower()
-            ] = cls
+            cls.__subclasses[cls.__name__.lower()] = cls
 
     @classmethod
-    def get_class(cls: type[Processor], class_name: str) -> type[Processor] | None:
+    def get_class(
+        cls: type[BaseDataProcessor], class_name: str
+    ) -> type[BaseDataProcessor] | None:
         """Get class of BaseDataProcessor object."""
-        return cls._BaseDataProcessor__subclasses.get(  # type: ignore
-            class_name.lower(), None
-        )
+        return cls.__subclasses.get(class_name.lower(), None)
 
     def __eq__(self, other: object) -> bool:
         """
@@ -129,7 +127,7 @@ def set_params(self, **kwargs: Any) -> None:
             self._parameters[param] = kwargs[param]
 
     @abc.abstractmethod
-    def process(self, *args: Any) -> Any:
+    def process(self, *args: Any, **kwargs: Any) -> Any:
         """Process data."""
         raise NotImplementedError()
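
Widening the abstract signature to (*args: Any, **kwargs: Any) is presumably what lets the overrides below drop their "# type: ignore" comments: mypy accepts any override of a base method typed (*args: Any, **kwargs: Any). A small sketch of that rule, with illustrative names not taken from this PR:

import abc
from typing import Any


class Base(abc.ABC):
    @abc.abstractmethod
    def process(self, *args: Any, **kwargs: Any) -> Any:
        """Process data."""
        raise NotImplementedError()


class Concrete(Base):
    # A narrower, fully typed override; mypy allows it without a
    # "# type: ignore" because the base accepts (*args, **kwargs).
    def process(self, data: list, batch_size: int = 32) -> list:
        return data[:batch_size]
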

@@ -169,13 +167,15 @@ def __init__(self, **parameters: Any) -> None:
         super().__init__(**parameters)
 
     @abc.abstractmethod
-    def process(  # type: ignore
+    def process(
         self,
         data: np.ndarray,
         labels: np.ndarray | None = None,
         label_mapping: dict[str, int] | None = None,
         batch_size: int = 32,
-    ) -> Generator[tuple[np.ndarray, np.ndarray] | np.ndarray, None, None]:
+    ) -> Generator[tuple[np.ndarray, np.ndarray] | np.ndarray, None, None] | tuple[
+        np.ndarray, np.ndarray
+    ] | np.ndarray:
         """Preprocess data."""
         raise NotImplementedError()

@@ -191,7 +191,7 @@ def __init__(self, **parameters: Any) -> None:
         super().__init__(**parameters)
 
     @abc.abstractmethod
-    def process(  # type: ignore
+    def process(
         self,
         data: np.ndarray,
         results: dict,
@@ -240,7 +240,7 @@ def help(cls) -> None:
         )
         print(help_str)
 
-    def process(  # type: ignore
+    def process(
         self,
         data: np.ndarray,
         labels: np.ndarray | None = None,
@@ -668,7 +668,7 @@ def gen_none() -> Generator[None, None, None]:
         if batch_data["samples"]:
             yield batch_data
 
-    def process(  # type: ignore
+    def process(
         self,
         data: np.ndarray,
         labels: np.ndarray | None = None,
@@ -735,8 +735,8 @@ def process(  # type: ignore
             X_train = np.array(
                 [[sentence] for sentence in batch_data["samples"]], dtype=object
             )
-            if labels is not None:
-                num_classes = max(label_mapping.values()) + 1  # type: ignore
+            if labels is not None and label_mapping is not None:
+                num_classes = max(label_mapping.values()) + 1
 
                 Y_train = tf.keras.utils.to_categorical(
                     batch_data["labels"], num_classes
@@ -836,7 +836,7 @@ def _validate_parameters(self, parameters: dict) -> None:
         if errors:
             raise ValueError("\n".join(errors))
 
-    def process(  # type: ignore
+    def process(
         self,
         data: np.ndarray,
         labels: np.ndarray | None = None,
@@ -1269,7 +1269,7 @@ def match_sentence_lengths(
 
         return results
 
-    def process(  # type: ignore
+    def process(
         self,
         data: np.ndarray,
         results: dict,
@@ -1439,7 +1439,7 @@ def convert_to_unstructured_format(
 
         return text, entities
 
-    def process(  # type: ignore
+    def process(
         self,
         data: np.ndarray,
         labels: np.ndarray | None = None,
@@ -1503,8 +1503,12 @@ def process(  # type: ignore
                 unstructured_label_set,
             ) = self.convert_to_unstructured_format(batch_data, batch_labels)
             unstructured_data[ind] = unstructured_text
-            if labels is not None:
-                unstructured_labels[ind] = unstructured_label_set  # type: ignore
+            if (
+                labels is not None
+                and unstructured_labels is not None
+                and unstructured_label_set is not None
+            ):
+                unstructured_labels[ind] = unstructured_label_set
 
         if labels is not None:
             np_unstruct_labels = np.array(unstructured_labels, dtype="object")
@@ -1800,7 +1804,7 @@ def convert_to_structured_analysis(
 
         return results
 
-    def process(  # type: ignore
+    def process(
         self,
         data: np.ndarray,
         results: dict,
@@ -2022,7 +2026,7 @@ def split_prediction(results: dict) -> None:
                 pred, axis=1, ord=1, keepdims=True
             )
 
-    def process(  # type: ignore
+    def process(
         self,
         data: np.ndarray,
         results: dict,
@@ -2160,7 +2164,7 @@ def _save_processor(self, dirpath: str) -> None:
         ) as fp:
             json.dump(params, fp)
 
-    def process(  # type: ignore
+    def process(
         self,
         data: np.ndarray,
         results: dict,
@@ -2253,7 +2257,7 @@ def help(cls) -> None:
         )
         print(help_str)
 
-    def process(  # type: ignore
+    def process(
         self,
         data: np.ndarray,
         results: dict,
6 changes: 3 additions & 3 deletions dataprofiler/profilers/base_column_profilers.py
@@ -11,7 +11,7 @@
 import numpy as np
 import pandas as pd
 
-from . import utils
+from . import profiler_utils
 from .profiler_options import BaseInspectorOptions, BaseOption
 
 BaseColumnProfilerT = TypeVar("BaseColumnProfilerT", bound="BaseColumnProfiler")
@@ -76,7 +76,7 @@ def _timeit(method: Callable = None, name: str = None) -> Callable:
         :param name: key argument for the times dictionary
         :type name: str
         """
-        return utils.method_timeit(method, name)
+        return profiler_utils.method_timeit(method, name)
 
     @staticmethod
     def _filter_properties_w_options(
@@ -173,7 +173,7 @@ def _add_helper(
         else:
             raise ValueError(f"Column names unmatched: {other1.name} != {other2.name}")
 
-        self.times = utils.add_nested_dictionaries(other1.times, other2.times)
+        self.times = profiler_utils.add_nested_dictionaries(other1.times, other2.times)
 
         self.sample_size = other1.sample_size + other2.sample_size
 
34 changes: 20 additions & 14 deletions dataprofiler/profilers/categorical_column_profile.py
@@ -8,7 +8,7 @@
 import datasketches
 from pandas import DataFrame, Series
 
-from . import utils
+from . import profiler_utils
 from .base_column_profilers import BaseColumnProfiler
 from .profiler_options import CategoricalOptions
 
@@ -131,7 +131,7 @@ def __add__(self, other: CategoricalColumn) -> CategoricalColumn:
         elif not self.cms and not other.cms:
             # If both profiles have not met stop condition
             if not (self._stop_condition_is_met or other._stop_condition_is_met):
-                merged_profile._categories = utils.add_nested_dictionaries(
+                merged_profile._categories = profiler_utils.add_nested_dictionaries(
                     self._categories, other._categories
                 )
 
@@ -250,21 +250,21 @@ def diff(self, other_profile: CategoricalColumn, options: dict = None) -> dict:
         # Make sure other_profile's type matches this class
         differences: dict = super().diff(other_profile, options)
 
-        differences["categorical"] = utils.find_diff_of_strings_and_bools(
+        differences["categorical"] = profiler_utils.find_diff_of_strings_and_bools(
             self.is_match, other_profile.is_match
         )
 
         differences["statistics"] = dict(
             [
                 (
                     "unique_count",
-                    utils.find_diff_of_numbers(
+                    profiler_utils.find_diff_of_numbers(
                         self.unique_count, other_profile.unique_count
                     ),
                 ),
                 (
                     "unique_ratio",
-                    utils.find_diff_of_numbers(
+                    profiler_utils.find_diff_of_numbers(
                         self.unique_ratio, other_profile.unique_ratio
                     ),
                 ),
@@ -275,19 +275,25 @@ def diff(self, other_profile: CategoricalColumn, options: dict = None) -> dict:
         if self.is_match and other_profile.is_match:
             differences["statistics"][
                 "chi2-test"
-            ] = utils.perform_chi_squared_test_for_homogeneity(
+            ] = profiler_utils.perform_chi_squared_test_for_homogeneity(
                 self._categories,
                 self.sample_size,
                 other_profile._categories,
                 other_profile.sample_size,
             )
-        differences["statistics"]["categories"] = utils.find_diff_of_lists_and_sets(
+        differences["statistics"][
+            "categories"
+        ] = profiler_utils.find_diff_of_lists_and_sets(
             self.categories, other_profile.categories
         )
-        differences["statistics"]["gini_impurity"] = utils.find_diff_of_numbers(
+        differences["statistics"][
+            "gini_impurity"
+        ] = profiler_utils.find_diff_of_numbers(
             self.gini_impurity, other_profile.gini_impurity
         )
-        differences["statistics"]["unalikeability"] = utils.find_diff_of_numbers(
+        differences["statistics"][
+            "unalikeability"
+        ] = profiler_utils.find_diff_of_numbers(
             self.unalikeability, other_profile.unalikeability
         )
         cat_count1 = dict(
@@ -299,9 +305,9 @@ def diff(self, other_profile: CategoricalColumn, options: dict = None) -> dict:
             )
         )
 
-        differences["statistics"]["categorical_count"] = utils.find_diff_of_dicts(
-            cat_count1, cat_count2
-        )
+        differences["statistics"][
+            "categorical_count"
+        ] = profiler_utils.find_diff_of_dicts(cat_count1, cat_count2)
 
         return differences
 
@@ -532,7 +538,7 @@ def _merge_categories_cms(
         for k in (x for x in heavy_hitter_dict2 if x not in heavy_hitter_dict1):
             heavy_hitter_dict1[k] = cms1.get_estimate(k)
 
-        categories = utils.add_nested_dictionaries(
+        categories = profiler_utils.add_nested_dictionaries(
             heavy_hitter_dict2, heavy_hitter_dict1
         )
 
@@ -604,7 +610,7 @@ def _update_categories(
             )
         else:
             category_count = self._get_categories_full(df_series)
-            self._categories = utils.add_nested_dictionaries(
+            self._categories = profiler_utils.add_nested_dictionaries(
                 self._categories, category_count
             )
            self._update_stop_condition(df_series)
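
profiler_utils.add_nested_dictionaries (previously utils.add_nested_dictionaries) is used throughout this file to merge category-count dictionaries. A sketch of the expected behavior, assuming it recursively sums values under shared keys; the actual implementation lives in profiler_utils and is not shown in this diff:

def add_nested_dictionaries(first: dict, second: dict) -> dict:
    """Merge two dicts, summing leaves and recursing into nested dicts."""
    merged = dict(first)
    for key, value in second.items():
        if key not in merged:
            merged[key] = value
        elif isinstance(value, dict) and isinstance(merged[key], dict):
            merged[key] = add_nested_dictionaries(merged[key], value)
        else:
            merged[key] = merged[key] + value
    return merged


# Merging two category-count profiles:
assert add_nested_dictionaries({"a": 2, "b": 1}, {"a": 3, "c": 4}) == {"a": 5, "b": 1, "c": 4}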