You signed in with another tab or window. Reload to refresh your session. You signed out in another tab or window. Reload to refresh your session. You switched accounts on another tab or window. Reload to refresh your session. Dismiss alert
On occasion, some categorical columns produce the following error when profiling
TypeError Traceback (most recent call last)
<ipython-input-16-5eb7460647d7> in <module>
11 })
12
---> 13 profile = Profiler(data=df, options=profiler_options)
14
15 report = profile.report()
/opt/conda/lib/python3.8/site-packages/dataprofiler/__init__.py in profiler_tracking_wrapper(wrapped, instance, args, kwargs)
160 _issue_warning_and_notify()
161
--> 162 return wrapped(*args, **kwargs)
163
164
/opt/conda/lib/python3.8/site-packages/dataprofiler/profilers/profile_builder.py in __new__(cls, data, samples_per_update, min_true_samples, options, profiler_type)
3124 return profile
3125 elif profiler_type == "structured":
-> 3126 return StructuredProfiler(
3127 data, samples_per_update, min_true_samples, options
3128 )
/opt/conda/lib/python3.8/site-packages/dataprofiler/__init__.py in sub_profiler_tracking_wrapper(wrapped, instance, args, kwargs)
191 _issue_warning_and_notify()
192
--> 193 wrapped(*args, **kwargs)
194
195 _wrapt.wrap_function_wrapper(
/opt/conda/lib/python3.8/site-packages/dataprofiler/profilers/profile_builder.py in __init__(self, data, samples_per_update, min_true_samples, options)
1678 )
1679 if data is not None:
-> 1680 self.update_profile(data)
1681
1682 def _add_error_checks( # type: ignore[override]
/opt/conda/lib/python3.8/site-packages/dataprofiler/profilers/profile_builder.py in update_profile(self, data, sample_size, min_true_samples)
982 sample_size = self._get_sample_size(data)
983
--> 984 self._update_profile_from_chunk(data, sample_size, min_true_samples)
985
986 # set file properties since data will be processed
/opt/conda/lib/python3.8/site-packages/dataprofiler/profilers/profile_builder.py in _update_profile_from_chunk(self, data, sample_size, min_true_samples)
3004
3005 for prof_idx in tqdm(clean_sampled_dict.keys()):
-> 3006 self._profile[prof_idx].update_column_profilers(
3007 clean_sampled_dict[prof_idx], pool
3008 )
/opt/conda/lib/python3.8/site-packages/dataprofiler/profilers/profile_builder.py in update_column_profilers(self, clean_sampled_df, pool)
164 if self.profiles is None or len(self.profiles) == 0:
165 self.profiles = {
--> 166 "data_type_profile": ColumnPrimitiveTypeProfileCompiler(
167 clean_sampled_df, self.options, pool
168 ),
/opt/conda/lib/python3.8/site-packages/dataprofiler/profilers/column_profile_compilers.py in __init__(self, df_series, options, pool)
54 if df_series is not None:
55 self.name = df_series.name
---> 56 self._create_profile(df_series, options, pool)
57
58 @abc.abstractmethod
/opt/conda/lib/python3.8/site-packages/dataprofiler/profilers/column_profile_compilers.py in _create_profile(self, df_series, options, pool)
110
111 # Update profile after creation
--> 112 self.update_profile(df_series, pool)
113
114 def __add__(self, other: BaseCompiler) -> BaseCompiler:
/opt/conda/lib/python3.8/site-packages/dataprofiler/profilers/column_profile_compilers.py in update_profile(self, df_series, pool)
180 if pool is None:
181 for profile_type in self._profiles:
--> 182 self._profiles[profile_type].update(df_series)
183 return self
184
/opt/conda/lib/python3.8/site-packages/dataprofiler/profilers/float_column_profile.py in update(self, df_series)
448 profile = dict(match_count=float_count, sample_size=sample_size)
449
--> 450 BaseColumnProfiler._perform_property_calcs(
451 self,
452 self.__calculations,
/opt/conda/lib/python3.8/site-packages/dataprofiler/profilers/base_column_profilers.py in _perform_property_calcs(self, calculations, df_series, prev_dependent_properties, subset_properties)
118 """
119 for prop in calculations:
--> 120 calculations[prop](
121 self, df_series, prev_dependent_properties, subset_properties
122 )
/opt/conda/lib/python3.8/site-packages/dataprofiler/profilers/profiler_utils.py in wrapper(self, *args, **kw)
709 name_dec = method.__name__
710 ts = time.time()
--> 711 result = method(self, *args, **kw)
712 te = time.time()
713 self.times[name_dec] += te - ts
/opt/conda/lib/python3.8/site-packages/dataprofiler/profilers/float_column_profile.py in _update_precision(self, df_series, prev_dependent_properties, subset_properties)
366
367 # (min, max, var, sum, sample_size)
--> 368 subset_precision = self._get_float_precision(df_series, sample_ratio)
369 if subset_precision is None:
370 return
/opt/conda/lib/python3.8/site-packages/dataprofiler/profilers/float_column_profile.py in _get_float_precision(cls, df_series_clean, sample_ratio)
306 # length of sampled cells after all punctuation removed
307 len_per_float = (
--> 308 df_series_clean.sample(sample_size).replace(to_replace=r, value="").map(len)
309 ).astype(float)
310
/opt/conda/lib/python3.8/site-packages/pandas/util/_decorators.py in wrapper(*args, **kwargs)
315 stacklevel=find_stack_level(inspect.currentframe()),
316 )
--> 317 return func(*args, **kwargs)
318
319 return wrapper
/opt/conda/lib/python3.8/site-packages/pandas/core/series.py in replace(self, to_replace, value, inplace, limit, regex, method)
5381 method: Literal["pad", "ffill", "bfill"] | lib.NoDefault = lib.no_default,
5382 ) -> Series | None:
-> 5383 return super().replace(
5384 to_replace=to_replace,
5385 value=value,
/opt/conda/lib/python3.8/site-packages/pandas/util/_decorators.py in wrapper(*args, **kwargs)
315 stacklevel=find_stack_level(inspect.currentframe()),
316 )
--> 317 return func(*args, **kwargs)
318
319 return wrapper
/opt/conda/lib/python3.8/site-packages/pandas/core/generic.py in replace(self, to_replace, value, inplace, limit, regex, method)
7277 regex = should_use_regex(regex, to_replace)
7278 if regex:
-> 7279 new_data = self._mgr.replace_regex(
7280 to_replace=to_replace,
7281 value=value,
/opt/conda/lib/python3.8/site-packages/pandas/core/internals/managers.py in replace_regex(self, **kwargs)
471
472 def replace_regex(self, **kwargs):
--> 473 return self.apply("_replace_regex", **kwargs)
474
475 def replace_list(
/opt/conda/lib/python3.8/site-packages/pandas/core/internals/managers.py in apply(self, f, align_keys, ignore_failures, **kwargs)
346 applied = b.apply(f, **kwargs)
347 else:
--> 348 applied = getattr(b, f)(**kwargs)
349 except (TypeError, NotImplementedError):
350 if not ignore_failures:
/opt/conda/lib/python3.8/site-packages/pandas/core/internals/blocks.py in _replace_regex(self, to_replace, value, inplace, convert, mask)
670
671 new_values = self.values if inplace else self.values.copy()
--> 672 replace_regex(new_values, rx, value, mask)
673
674 block = self.make_block(new_values)
/opt/conda/lib/python3.8/site-packages/pandas/core/array_algos/replace.py in replace_regex(values, rx, value, mask)
157
158 if mask is None:
--> 159 values[:] = f(values)
160 else:
161 values[mask] = f(values[mask])
/opt/conda/lib/python3.8/site-packages/pandas/core/arrays/_mixins.py in __setitem__(self, key, value)
264 def __setitem__(self, key, value) -> None:
265 key = check_array_indexer(self, key)
--> 266 value = self._validate_setitem_value(value)
267 self._ndarray[key] = value
268
/opt/conda/lib/python3.8/site-packages/pandas/core/arrays/categorical.py in _validate_setitem_value(self, value)
1557 if not is_hashable(value):
1558 # wrap scalars and hashable-listlikes in list
-> 1559 return self._validate_listlike(value)
1560 else:
1561 return self._validate_scalar(value)
/opt/conda/lib/python3.8/site-packages/pandas/core/arrays/categorical.py in _validate_listlike(self, value)
2245 # something to np.nan
2246 if len(to_add) and not isna(to_add).all():
-> 2247 raise TypeError(
2248 "Cannot setitem on a Categorical with a new "
2249 "category, set the categories first"
TypeError: Cannot setitem on a Categorical with a new category, set the categories first
To Reproduce:
from dataprofiler import Profiler, ProfilerOptions
import pandas as pd
import os
s = pd.Series([202210, 202210, 202210], dtype="category")
df = pd.DataFrame({"category": s})
profiler_options = ProfilerOptions()
profiler_options.set({
"structured_options.data_labeler.is_enabled": False,
})
profile = Profiler(data=df, options=profiler_options)
report = profile.report()
This error is coming from a categorical variable detailing the year + month that the data is coming from. Weirdly enough, we are only seeing this error occur in October. If you update the 202210 value to 202211 then it profiles successfully. And updating to 202110 also errors.
Expected behavior:
Profile these specific categorical columns successfully.
Screenshots:
Additional context:
The text was updated successfully, but these errors were encountered:
@taylorfturner Running the latest release against this code snippet still errors. For what it's worth, the change from @SchadtJ gets this sample script to work fine, although I haven't tested beyond that.
I thought another change that was recently merged addressed this, but I might have been mistaken.
My change is just a type-cast, so that we remove the categorical dtype when we calculate precision. Doing this felt somewhat hacky. However, since it's done for data that is only used in the precision calculation, it seems fine.
I'm still getting familiar with the codebase, but one thing I was wondering is whether it makes sense to output precision at all when the data is categorical.
My change is just a type-cast, so that we remove the categorical dtype when we calculate precision. Doing this felt somewhat hacky. However, since it's done for data that is only used in the precision calculation, it seems fine.
Got it -- yeah, slightly hacky but I see what you're doing for sure, @SchadtJ
General Information:
Describe the bug:
On occasion, some categorical columns produce the following error when profiling
To Reproduce:
This error is coming from a categorical variable detailing the year + month that the data is coming from. Weirdly enough, we are only seeing this error occur in October. If you update the `202210` value to `202211` then it profiles successfully. And updating to `202110` also errors.
Expected behavior:
Profile these specific categorical columns successfully.
Screenshots:
Additional context:
The text was updated successfully, but these errors were encountered: