Skip to content

Commit

Permalink
added tests for expectations_median,expectations_quantile
Browse files: browse the repository at this point in the history
  • Loading branch information
bvolodarskiy committed Aug 1, 2023
1 parent 4b4c42a commit ac7bd90
Show file tree
Hide file tree
Showing 2 changed files with 66 additions and 14 deletions.
27 changes: 15 additions & 12 deletions functions/data_test/data_test/profiling.py
Original file line number Diff line number Diff line change
Expand Up @@ -50,9 +50,8 @@ def expectations_mean(name, summary, batch, *args):

def expectations_median(name, summary, batch, *args):
    """Register a median-range expectation for column ``name`` on ``batch``.

    The bounds are obtained from ``calculate_median(summary)``. The
    expectation is added exactly once, and only when both bounds are
    available. Bounds are compared against ``None`` explicitly so that a
    legitimate bound of ``0`` is not mistaken for a missing value.

    Args:
        name: Column name passed to the expectation.
        summary: Profiling summary for the column, forwarded unchanged to
            ``calculate_median``.
        batch: Object exposing ``expect_column_median_to_be_between``.
        *args: Unused here.

    Returns:
        The tuple ``(name, summary, batch)`` with the same objects that were
        passed in.
    """
    min_median, max_median = calculate_median(summary)
    if min_median is not None and max_median is not None:
        batch.expect_column_median_to_be_between(
            column=name, min_value=min_median, max_value=max_median)
    return name, summary, batch


Expand All @@ -64,11 +63,10 @@ def expectations_stdev(name, summary, batch, *args):


def expectations_quantile(name, summary, batch, *args):
value_ranges = calculate_q_ranges(summary)
q_ranges = {
"quantiles": [0.05, 0.25, 0.5, 0.75, 0.95],
"value_ranges": [[summary["5%"], summary["25%"]], [summary["25%"], summary["50%"]],
[summary["50%"], summary["75%"]], [summary["75%"], summary["95%"]],
[summary["95%"], summary["max"]]]
"value_ranges": value_ranges
}
batch.expect_column_quantile_values_to_be_between(
column=name, quantile_ranges=q_ranges)
Expand Down Expand Up @@ -209,19 +207,18 @@ def calculate_mean(summary):


def calculate_median(summary):
    """Return a ``(lower, upper)`` pair of observed values around the median.

    ``summary["value_counts_index_sorted"]`` maps each distinct value to its
    number of occurrences. The mapping is expanded into a flat list in
    iteration order, and two positions around the middle of that list are
    picked as ``n * q -/+ 2.58 * sqrt(n * q * (1 - q))`` with ``q = 0.5``.

    NOTE(review): the list is never sorted here, so the result is only
    meaningful if the mapping already iterates in ascending value order, as
    the key name suggests - confirm against the producer of ``summary``.
    NOTE(review): 2.58 is approximately the two-sided 99% critical value of
    the standard normal distribution; presumably that is the intended
    confidence level - confirm.

    Args:
        summary: Mapping with a ``"value_counts_index_sorted"`` entry that
            supports ``.items()`` (a dict or a pandas Series both work).

    Returns:
        ``(min_median, max_median)`` taken from the expanded values, or
        ``(None, None)`` when there are no values at all.
    """
    raw_values = summary["value_counts_index_sorted"]
    values = []
    for value, count in raw_values.items():
        values.extend([value] * count)

    # Nothing observed: there is no position to index, so report "no bounds"
    # instead of failing with IndexError.
    if not values:
        return None, None

    q = 0.5
    center = len(values) * q
    spread = 2.58 * math.sqrt(len(values) * q * (1 - q))

    # int() truncates toward zero, so a slightly negative lower position
    # becomes index 0 rather than wrapping around to the end of the list.
    j = int(center - spread)
    # The computed positions are treated as 1-based ranks; shift both to
    # 0-based list indexes (the lower one only when it is not already 0).
    k = int(center + spread) - 1
    if j >= 1:
        j -= 1

    return values[j], values[k]


Expand All @@ -248,6 +245,12 @@ def calculate_z_score(summary):
return threshold + significance_level


def calculate_q_ranges(summary):
    """Build the five ``[low, high]`` value ranges from a column summary.

    Each range pairs one summary entry with the next one in the sequence
    ``"5%"``, ``"25%"``, ``"50%"``, ``"75%"``, ``"95%"``, ``"max"``, so the
    upper bound of one range is the lower bound of the following range.

    Args:
        summary: Mapping that contains all six keys listed above.

    Returns:
        A list of five two-element lists, in the order given above.

    Raises:
        KeyError: If any of the six keys is missing from ``summary``.
    """
    bounds = ("5%", "25%", "50%", "75%", "95%", "max")
    # Pair every key with its successor instead of spelling out each pair.
    return [[summary[low], summary[high]]
            for low, high in zip(bounds, bounds[1:])]


def profile_data(df, suite_name, cloudfront, datasource_root, source_covered,
mapping_config, run_name):
qa_bucket = s3.Bucket(qa_bucket_name)
Expand Down
53 changes: 51 additions & 2 deletions functions/data_test/tests/test_profiling.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,7 +9,11 @@
calculate_stdev,
expectations_stdev,
calculate_z_score,
expectations_z_score)
expectations_z_score,
expectations_quantile,
calculate_q_ranges,
calculate_median,
expectations_median)
import great_expectations as gx
import pandas as pd

Expand All @@ -23,7 +27,7 @@
"type": "Numeric",
"hashable": True,
"value_counts_without_nan": "892",
"value_counts_index_sorted": "892 1 \nName: PassengerId, Length: 418, dtype: int64",
"value_counts_index_sorted": pd.Series({892: 1, 893: 1, 894: 1, 1004: 2, 1500: 1}),
"ordering": True,
"n_missing": 0,
"n": 418,
Expand Down Expand Up @@ -160,6 +164,7 @@ def test_expectations_mean(n, std, mean, max_mean, min_mean, before_and_after_te
assert name == name_expected
assert expectation_type in str(batch.expectation_suite)


@pytest.mark.parametrize("n,std,max_std,min_std",
[(418, 120.81045760473994, 136.10108739120102, 105.51982781827887)])
def test_expectations_stdev(n, std, max_std, min_std, before_and_after_test):
Expand All @@ -178,6 +183,7 @@ def test_expectations_stdev(n, std, max_std, min_std, before_and_after_test):
assert name == name_expected
assert expectation_type in str(batch.expectation_suite)


@pytest.mark.parametrize("mean,std,max,threshold,applied",
[(418, 120.81045760473994, 1309, 7.380189347557294, True),
(418, np.nan, 1309, None, False)])
Expand All @@ -197,3 +203,46 @@ def test_expectations_z_score(mean, std, max, threshold, applied, before_and_aft
assert threshold == threshold_expected
assert name == name_expected
assert (expectation_type in str(batch.expectation_suite)) == applied


@pytest.mark.parametrize("q1,q2,q3,q4,q5,q6",
                         [(912.85, 996.25, 1100.5, 1204.75, 1288.15, 1309)])
def test_expectations_quantile(q1, q2, q3, q4, q5, q6, before_and_after_test):
    """Check the quantile ranges and that the quantile expectation is added.

    The parametrized values are used directly; no ``eval`` round-trip is
    needed because pytest already passes them in as Python objects.
    """
    expected_ranges = [[q1, q2], [q2, q3],
                       [q3, q4], [q4, q5],
                       [q5, q6]]
    # presumably change_template returns a (column name, summary) pair with
    # the listed keys overridden - verify against its definition.
    name_expected, summary_expected = change_template(
        [q1, q2, q3, q4, q5, q6],
        ["5%", "25%", "50%", "75%", "95%", "max"])
    expectation_type = "expect_column_quantile_values_to_be_between"
    batch_empty = before_and_after_test

    q_ranges = calculate_q_ranges(summary_expected)
    name, summary, batch = expectations_quantile(name_expected, summary_expected, batch_empty)

    assert expected_ranges == q_ranges
    assert name == name_expected
    assert expectation_type in str(batch.expectation_suite)

@pytest.mark.parametrize("min_median,max_median,value_counts_index_sorted,applied",
                         [(892, 1500, pd.Series({892: 1, 893: 1, 894: 1, 1004: 2, 1500: 1}), True)])
def test_expectations_median(min_median, max_median, value_counts_index_sorted, applied,
                             before_and_after_test):
    """Check the median bounds and that the median expectation is added.

    The parametrized values are used directly (no ``eval``), and the computed
    bounds get their own names so the expected values are never overwritten.
    """
    # presumably change_template returns a (column name, summary) pair with
    # the listed keys overridden - verify against its definition.
    name_expected, summary_expected = change_template(
        [value_counts_index_sorted], ["value_counts_index_sorted"])
    expectation_type = "expect_column_median_to_be_between"
    batch_empty = before_and_after_test

    min_median_actual, max_median_actual = calculate_median(summary_expected)
    name, summary, batch = expectations_median(name_expected, summary_expected, batch_empty)

    # Separate asserts so a failure shows which bound is wrong.
    assert min_median_actual == min_median
    assert max_median_actual == max_median
    assert name == name_expected
    assert (expectation_type in str(batch.expectation_suite)) == applied

0 comments on commit ac7bd90

Please sign in to comment.