Skip to content

Commit

Permalink
Added a warning to the psi calculation for when psi is not calculated
Browse files Browse the repository at this point in the history
  • Loading branch information
ksneab7 committed Sep 20, 2023
1 parent 026b2ef commit 86aa39a
Show file tree
Hide file tree
Showing 2 changed files with 20 additions and 5 deletions.
14 changes: 11 additions & 3 deletions dataprofiler/profilers/categorical_column_profile.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,7 @@
from __future__ import annotations

import math
import warnings
from collections import defaultdict
from operator import itemgetter
from typing import cast
Expand Down Expand Up @@ -305,14 +306,21 @@ def diff(self, other_profile: CategoricalColumn, options: dict = None) -> dict:
other_profile._categories.items(), key=itemgetter(1), reverse=True
)
)
total_psi = 0.0
if cat_count1.keys() == cat_count2.keys():
total_psi = 0.0
for key in cat_count1.keys():
perc_A = cat_count1[key] / self.sample_size
perc_B = cat_count2[key] / other_profile.sample_size
total_psi += (perc_B - perc_A) * math.log(perc_B / perc_A)

differences["statistics"]["psi"] = total_psi
else:
warnings.warn(
"psi was not calculated due to the differences in categories "
"of the profiles. Differences:\n"
f"{set(cat_count1.keys()) ^ set(cat_count2.keys())}\n"
"defaulting psi value to 0...",
RuntimeWarning,
)
differences["statistics"]["psi"] = total_psi
differences["statistics"][
"categorical_count"
] = profiler_utils.find_diff_of_dicts(cat_count1, cat_count2)
Expand Down
11 changes: 9 additions & 2 deletions dataprofiler/tests/profilers/test_categorical_column_profile.py
Original file line number Diff line number Diff line change
Expand Up @@ -726,10 +726,17 @@ def test_categorical_diff(self):
"df": 2,
"p-value": 0.3099238764710244,
},
"psi": 0,
},
}

self.assertDictEqual(expected_diff, profile.diff(profile2))
with self.assertWarnsRegex(
RuntimeWarning,
"psi was not calculated due to the differences in categories "
"of the profiles. Differences:\n{'maybe'}\n"
"defaulting psi value to 0...",
):
test_profile_diff = profile.diff(profile2)
self.assertDictEqual(expected_diff, test_profile_diff)

# Test with one categorical column matching
df_not_categorical = pd.Series(
Expand Down

0 comments on commit 86aa39a

Please sign in to comment.