Skip to content

Commit

Permalink
update data labeler (#1129)
Browse files Browse the repository at this point in the history
  • Loading branch information
atl1502 committed Apr 15, 2024
1 parent 4ca4502 commit c9e0254
Show file tree
Hide file tree
Showing 2 changed files with 33 additions and 25 deletions.
16 changes: 11 additions & 5 deletions dataprofiler/profilers/data_labeler_column_profile.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,7 +5,9 @@
from typing import Dict, cast

import numpy as np
from pandas import DataFrame, Series
import pandas as pd
import polars as pl
from polars import Series

from ..labelers.base_data_labeler import BaseDataLabeler
from ..labelers.data_labelers import DataLabeler
Expand Down Expand Up @@ -394,7 +396,7 @@ def diff(self, other_profile: DataLabelerColumn, options: dict = None) -> dict:
@BaseColumnProfiler._timeit(name="data_labeler_predict")
def _update_predictions(
self,
df_series: DataFrame,
df_series: Series,
prev_dependent_properties: dict = None,
subset_properties: dict = None,
) -> None:
Expand All @@ -411,8 +413,9 @@ def _update_predictions(
:type df_series: polars.Series
:return: None
"""
df_series_pd = df_series.to_pandas()
predictions = self.data_labeler.predict(
df_series, predict_options=dict(show_confidences=True)
df_series_pd, predict_options=dict(show_confidences=True)
)
# remove PAD from output (reserved zero index)
if self.data_labeler.model.requires_zero_mapping:
Expand Down Expand Up @@ -441,7 +444,7 @@ def _update_helper(self, df_series_clean: Series, profile: dict) -> None:
Update the column profile properties.
:param df_series_clean: df series with nulls removed
:type df_series_clean: pandas.core.series.Series
:type df_series_clean: polars.Series
:param profile: float profile dictionary
:type profile: dict
:return: None
Expand All @@ -453,10 +456,13 @@ def update(self, df_series: Series) -> DataLabelerColumn:
Update the column profile.
:param df_series: df series
:type df_series: pandas.core.series.Series
:type df_series: polars.Series
:return: updated DataLabelerColumn
:rtype: DataLabelerColumn
"""
# TODO remove once profiler builder is updated
if type(df_series) == pd.Series:
df_series = pl.from_pandas(df_series) # type: ignore
if len(df_series) == 0:
return self

Expand Down
42 changes: 22 additions & 20 deletions dataprofiler/tests/profilers/test_data_labeler_column_profile.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,7 @@
from unittest import mock

import numpy as np
import pandas as pd
import polars as pl

from dataprofiler.labelers import BaseDataLabeler
from dataprofiler.profilers import profiler_utils
Expand All @@ -21,6 +21,8 @@
spec=BaseDataLabeler,
)
class TestDataLabelerColumnProfiler(unittest.TestCase):
maxDiff = None

@staticmethod
def _setup_data_labeler_mock(mock_instance):
mock_DataLabeler = mock_instance.return_value
Expand All @@ -46,7 +48,7 @@ def mock_predict(data, *args, **kwargs):
def test_base_case(self, mock_instance):
self._setup_data_labeler_mock(mock_instance)

data = pd.Series([], dtype=object)
data = pl.Series([], dtype=object)
profiler = DataLabelerColumn(data.name)

time_array = [float(i) for i in range(4, 0, -1)]
Expand Down Expand Up @@ -74,7 +76,7 @@ def test_base_case(self, mock_instance):
def test_update(self, mock_instance):
self._setup_data_labeler_mock(mock_instance)

data = pd.Series(["1", "2", "3"])
data = pl.Series(["1", "2", "3"])
profiler = DataLabelerColumn(data.name)
profiler.update(data)

Expand All @@ -93,7 +95,7 @@ def test_update_reserve_label_mapping(self, mock_instance):
mock_DataLabeler.reverse_label_mapping = {0: "PAD", 1: "a", 2: "b"}
mock_DataLabeler.model.num_labels = 3

data = pd.Series(["1", "2", "3"])
data = pl.Series(["1", "2", "3"])
profiler = DataLabelerColumn(data.name)
profiler.update(data)

Expand All @@ -111,7 +113,7 @@ def mock_low_predict(data, *args, **kwargs):

mock_instance.return_value.predict.side_effect = mock_low_predict

data = pd.Series(["1"])
data = pl.Series(["1"])
profiler = DataLabelerColumn(data.name)
profiler.update(data)
self.assertEqual("could not determine", profiler.data_label)
Expand Down Expand Up @@ -144,15 +146,15 @@ def mock_low_predict(data, *args, **kwargs):

mock_instance.return_value.predict.side_effect = mock_low_predict

data = pd.Series(["1"] * 10)
data = pl.Series(["1"] * 10)
profiler = DataLabelerColumn(data.name)
profiler.update(data)
self.assertEqual("a|c|b", profiler.data_label)

def test_profile(self, mock_instance):
self._setup_data_labeler_mock(mock_instance)

data = pd.Series(["1", "2", "3"])
data = pl.Series(["1", "2", "3"])
profiler = DataLabelerColumn(data.name)

expected_profile = {
Expand Down Expand Up @@ -180,7 +182,7 @@ def test_profile(self, mock_instance):
def test_report(self, mock_instance):
self._setup_data_labeler_mock(mock_instance)

data = pd.Series(["1", "2", "3"])
data = pl.Series(["1", "2", "3"])
profile = DataLabelerColumn(data.name)

report1 = profile.profile
Expand All @@ -206,7 +208,7 @@ def test_label_match(self, mock_instance):
mock_DataLabeler.model.num_labels = 4
mock_DataLabeler.model.requires_zero_mapping = False

data = pd.Series(["1", "2", "3", "4", "5", "6"])
data = pl.Series(["1", "2", "3", "4", "5", "6"])
profiler = DataLabelerColumn(data.name)
profiler.sample_size = 1

Expand All @@ -217,8 +219,8 @@ def test_label_match(self, mock_instance):
def test_profile_merge(self, mock_instance):
self._setup_data_labeler_mock(mock_instance)

data = pd.Series(["1", "2", "3", "11"])
data2 = pd.Series(["4", "5", "6", "7", "9", "10", "12"])
data = pl.Series(["1", "2", "3", "11"])
data2 = pl.Series(["4", "5", "6", "7", "9", "10", "12"])

expected_profile = {
"data_label": "a|b",
Expand Down Expand Up @@ -313,8 +315,8 @@ def test_profile_merge_with_different_options(self, mock_instance):
self._setup_data_labeler_mock(mock_instance)

# Different max_sample_size values
data = pd.Series(["1", "2", "3", "11"])
data2 = pd.Series(["4", "5", "6", "7", "9", "10", "12"])
data = pl.Series(["1", "2", "3", "11"])
data2 = pl.Series(["4", "5", "6", "7", "9", "10", "12"])
options = DataLabelerOptions()
options.max_sample_size = 20
profiler = DataLabelerColumn(data.name, options=options)
Expand Down Expand Up @@ -392,8 +394,8 @@ def test_empty_data(self, *mocks):

# Mock out the data_label, avg_predictions, and label_representation
# properties
profiler1.update(pd.Series())
profiler2.update(pd.Series())
profiler1.update(pl.Series())
profiler2.update(pl.Series())

merge_profile = profiler1 + profiler2
self.assertIsNone(merge_profile._rank_distribution)
Expand Down Expand Up @@ -447,7 +449,7 @@ def test_json_encode(self, mock_instance):

def test_json_encode_after_update(self, mock_instance):
self._setup_data_labeler_mock(mock_instance)
data = pd.Series(["1", "2", "3", "4"], dtype=object)
data = pl.Series(["1", "2", "3", "4"], dtype=object)
profiler = DataLabelerColumn(data.name)
profiler.data_labeler._default_model_loc = "this is a test model loc"
with test_utils.mock_timeit():
Expand All @@ -458,7 +460,7 @@ def test_json_encode_after_update(self, mock_instance):
{
"class": "DataLabelerColumn",
"data": {
"name": None,
"name": "",
"col_index": float("nan"),
"sample_size": 4,
"metadata": {},
Expand Down Expand Up @@ -492,7 +494,7 @@ def test_json_decode(self, mock_utils_DataLabeler, mock_BaseDataLabeler):
self._setup_data_labeler_mock(mock_BaseDataLabeler)
mock_utils_DataLabeler.load_from_library.side_effect = mock_BaseDataLabeler

data = pd.Series(["1", "2", "3", "4"], dtype=object)
data = pl.Series(["1", "2", "3", "4"], dtype=object)
expected = DataLabelerColumn(data.name)
expected.data_labeler._default_model_loc = "structured_model"
serialized = json.dumps(expected, cls=ProfileEncoder)
Expand Down Expand Up @@ -537,7 +539,7 @@ def test_json_decode_after_update(
self._setup_data_labeler_mock(mock_BaseDataLabeler)
mock_utils_DataLabeler.load_from_library.side_effect = mock_BaseDataLabeler

data = pd.Series(["1", "2", "3", "4"], dtype=object)
data = pl.Series(["1", "2", "3", "4"], dtype=object)
expected = DataLabelerColumn(data.name)
expected.data_labeler._default_model_loc = "structured_model"
with test_utils.mock_timeit():
Expand All @@ -547,7 +549,7 @@ def test_json_decode_after_update(
deserialized = load_column_profile(json.loads(serialized))

test_utils.assert_profiles_equal(deserialized, expected)
update_data = pd.Series(["4", "5", "6", "7"], dtype=object)
update_data = pl.Series(["4", "5", "6", "7"], dtype=object)
deserialized.update(update_data)

assert deserialized.sample_size == 8
Expand Down

0 comments on commit c9e0254

Please sign in to comment.