From f0dbf997c589afb38b50572bfd64811f7eb71901 Mon Sep 17 00:00:00 2001
From: SchadtJ
Date: Sun, 24 Mar 2024 22:11:58 -0400
Subject: [PATCH] Bug fix for float precision calculation using categorical data with trailing zeros.

---
 dataprofiler/profilers/float_column_profile.py            | 5 ++++-
 dataprofiler/tests/profilers/test_float_column_profile.py | 7 +++++++
 2 files changed, 11 insertions(+), 1 deletion(-)

diff --git a/dataprofiler/profilers/float_column_profile.py b/dataprofiler/profilers/float_column_profile.py
index bc426a447..29417584e 100644
--- a/dataprofiler/profilers/float_column_profile.py
+++ b/dataprofiler/profilers/float_column_profile.py
@@ -305,7 +305,10 @@ def _get_float_precision(
 
         # length of sampled cells after all punctuation removed
         len_per_float = (
-            df_series_clean.sample(sample_size).replace(to_replace=r, value="").map(len)
+            df_series_clean.sample(sample_size)
+            .astype(object)
+            .replace(to_replace=r, value="")
+            .map(len)
         ).astype(float)
 
         # Determine statistics precision
diff --git a/dataprofiler/tests/profilers/test_float_column_profile.py b/dataprofiler/tests/profilers/test_float_column_profile.py
index d79fdd641..06441dcb7 100644
--- a/dataprofiler/tests/profilers/test_float_column_profile.py
+++ b/dataprofiler/tests/profilers/test_float_column_profile.py
@@ -211,6 +211,13 @@ def test_profiled_precision(self):
                 msg=f"Errored for: {sample[0]}",
             )
 
+        # Validate categorical series with trailing zeros supported
+        categorical_series = pd.Series(
+            [202209, 202210, 202211], dtype="category"
+        ).apply(str)
+        float_profiler = FloatColumn("Name")
+        float_profiler.update(categorical_series)
+
     def test_profiled_min(self):
         # test with multiple values
         data = np.linspace(-5, 5, 11)