Skip to content

Commit

Permalink
init
Browse files Browse the repository at this point in the history
  • Loading branch information
jurajmajerik committed Aug 7, 2024
1 parent b97089b commit c9b77de
Show file tree
Hide file tree
Showing 2 changed files with 191 additions and 20 deletions.
35 changes: 35 additions & 0 deletions ee/clickhouse/queries/experiments/funnel_experiment_result.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,7 @@

from numpy.random import default_rng
from rest_framework.exceptions import ValidationError
import scipy.stats as stats

from ee.clickhouse.queries.experiments import (
CONTROL_VARIANT_KEY,
Expand Down Expand Up @@ -111,6 +112,8 @@ def get_results(self, validate: bool = True):
}

significance_code, loss = self.are_results_significant(control_variant, test_variants, probabilities)

credible_intervals = calculate_credible_intervals([control_variant, *test_variants])
except ValidationError:
if validate:
raise
Expand All @@ -124,6 +127,7 @@ def get_results(self, validate: bool = True):
"significance_code": significance_code,
"expected_loss": loss,
"variants": [asdict(variant) for variant in [control_variant, *test_variants]],
"credible_intervals": credible_intervals,
}

def get_variants(self, funnel_results):
Expand Down Expand Up @@ -320,6 +324,37 @@ def calculate_probability_of_winning_for_each(variants: list[Variant]) -> list[P
return [max(0, 1 - total_test_probabilities), *probabilities[1:]]


def calculate_credible_intervals(variants, lower_bound=0.025, upper_bound=0.975):
    """
    Calculate the Bayesian credible interval of the conversion rate for each variant.

    Each variant's conversion rate is modelled with a
    Beta(success_count + 1, failure_count + 1) distribution, and the interval
    is read off its quantile function at ``lower_bound`` and ``upper_bound``.
    With the default bounds this is the central 95% credible interval.

    Args:
        variants: Iterable of objects exposing ``key``, ``success_count`` and
            ``failure_count`` attributes.
        lower_bound: Lower quantile of the interval (default 0.025).
        upper_bound: Upper quantile of the interval (default 0.975).

    Returns:
        A dict mapping each variant's ``key`` to a ``(lower, upper)`` tuple.
        An empty ``variants`` yields an empty dict.

    Raises:
        ValidationError: with code ``invalid_data`` if a variant has a negative
            success or failure count, or with code ``calculation_error`` if
            the interval could not be computed for any other reason.
    """
    intervals = {}

    for variant in variants:
        try:
            if variant.success_count < 0 or variant.failure_count < 0:
                raise ValidationError(
                    f"Success and failure counts must be non-negative for variant {variant.key}.",
                    code="invalid_data",
                )

            # Laplace smoothing: we add 1 to alpha and beta to avoid division errors if either is zero
            alpha = variant.success_count + 1
            beta = variant.failure_count + 1
            lower, upper = stats.beta.ppf([lower_bound, upper_bound], alpha, beta)

            intervals[variant.key] = (lower, upper)
        except ValidationError:
            # Our own validation failure: propagate it untouched so the caller
            # sees the original message and the "invalid_data" code instead of
            # a re-wrapped "calculation_error".
            raise
        except Exception as e:
            # Anything else is converted to the error type callers already
            # handle. The underlying exception text is deliberately NOT put in
            # the message, since it may reach an external user; it stays
            # available for debugging through exception chaining.
            raise ValidationError(
                f"Error calculating credible interval for variant {variant.key}.",
                code="calculation_error",
            ) from e

    return intervals


def validate_event_variants(funnel_results, variants):
errors = {
ExperimentNoResultsErrorKeys.NO_EVENTS: True,
Expand Down
176 changes: 156 additions & 20 deletions ee/clickhouse/queries/experiments/test_experiment_result.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,6 +8,7 @@
ClickhouseFunnelExperimentResult,
Variant,
calculate_expected_loss,
calculate_credible_intervals,
)
from ee.clickhouse.queries.experiments.trend_experiment_result import (
ClickhouseTrendExperimentResult,
Expand Down Expand Up @@ -162,6 +163,13 @@ def test_calculate_results(self):
self.assertAlmostEqual(loss, 0.0016, places=3)
self.assertEqual(significant, ExperimentSignificanceCode.SIGNIFICANT)

credible_intervals = calculate_credible_intervals([variant_control, variant_test])
# Cross-checked with: https://www.causascientia.org/math_stat/ProportionCI.html
self.assertAlmostEqual(credible_intervals[variant_control.key][0], 0.7715, places=3)
self.assertAlmostEqual(credible_intervals[variant_control.key][1], 0.9010, places=3)
self.assertAlmostEqual(credible_intervals[variant_test.key][0], 0.8405, places=3)
self.assertAlmostEqual(credible_intervals[variant_test.key][1], 0.9494, places=3)

def test_simulation_result_is_close_to_closed_form_solution(self):
variant_test = Variant("A", 100, 10)
variant_control = Variant("B", 100, 18)
Expand All @@ -174,8 +182,8 @@ def test_simulation_result_is_close_to_closed_form_solution(self):

def test_calculate_results_for_two_test_variants(self):
variant_test_1 = Variant("A", 100, 10)
variant_test_2 = Variant("A", 100, 3)
variant_control = Variant("B", 100, 18)
variant_test_2 = Variant("B", 100, 3)
variant_control = Variant("C", 100, 18)

probabilities = ClickhouseFunnelExperimentResult.calculate_results(
variant_control, [variant_test_1, variant_test_2]
Expand Down Expand Up @@ -203,10 +211,19 @@ def test_calculate_results_for_two_test_variants(self):
self.assertAlmostEqual(loss, 0.00000, places=3)
self.assertEqual(significant, ExperimentSignificanceCode.SIGNIFICANT)

credible_intervals = calculate_credible_intervals([variant_control, variant_test_1, variant_test_2])
# Cross-checked with: https://www.causascientia.org/math_stat/ProportionCI.html
self.assertAlmostEqual(credible_intervals[variant_control.key][0], 0.7715, places=3)
self.assertAlmostEqual(credible_intervals[variant_control.key][1], 0.9010, places=3)
self.assertAlmostEqual(credible_intervals[variant_test_1.key][0], 0.8405, places=3)
self.assertAlmostEqual(credible_intervals[variant_test_1.key][1], 0.9494, places=3)
self.assertAlmostEqual(credible_intervals[variant_test_2.key][0], 0.9180, places=3)
self.assertAlmostEqual(credible_intervals[variant_test_2.key][1], 0.9894, places=3)

def test_calculate_results_for_two_test_variants_almost_equal(self):
variant_test_1 = Variant("A", 120, 60)
variant_test_2 = Variant("A", 110, 52)
variant_control = Variant("B", 130, 65)
variant_test_2 = Variant("B", 110, 52)
variant_control = Variant("C", 130, 65)

probabilities = ClickhouseFunnelExperimentResult.calculate_results(
variant_control, [variant_test_1, variant_test_2]
Expand All @@ -233,6 +250,15 @@ def test_calculate_results_for_two_test_variants_almost_equal(self):
self.assertAlmostEqual(loss, 1, places=3)
self.assertEqual(significant, ExperimentSignificanceCode.LOW_WIN_PROBABILITY)

credible_intervals = calculate_credible_intervals([variant_control, variant_test_1, variant_test_2])
# Cross-checked with: https://www.causascientia.org/math_stat/ProportionCI.html
self.assertAlmostEqual(credible_intervals[variant_control.key][0], 0.5977, places=3)
self.assertAlmostEqual(credible_intervals[variant_control.key][1], 0.7290, places=3)
self.assertAlmostEqual(credible_intervals[variant_test_1.key][0], 0.5948, places=3)
self.assertAlmostEqual(credible_intervals[variant_test_1.key][1], 0.7314, places=3)
self.assertAlmostEqual(credible_intervals[variant_test_2.key][0], 0.6035, places=3)
self.assertAlmostEqual(credible_intervals[variant_test_2.key][1], 0.7460, places=3)

def test_absolute_loss_less_than_one_percent_but_not_significant(self):
variant_test_1 = Variant("A", 286, 2014)
variant_control = Variant("B", 267, 2031)
Expand All @@ -250,11 +276,18 @@ def test_absolute_loss_less_than_one_percent_but_not_significant(self):
self.assertAlmostEqual(loss, 1, places=3)
self.assertEqual(significant, ExperimentSignificanceCode.LOW_WIN_PROBABILITY)

credible_intervals = calculate_credible_intervals([variant_control, variant_test_1])
# Cross-checked with: https://www.causascientia.org/math_stat/ProportionCI.html
self.assertAlmostEqual(credible_intervals[variant_control.key][0], 0.1037, places=3)
self.assertAlmostEqual(credible_intervals[variant_control.key][1], 0.1299, places=3)
self.assertAlmostEqual(credible_intervals[variant_test_1.key][0], 0.1114, places=3)
self.assertAlmostEqual(credible_intervals[variant_test_1.key][1], 0.1384, places=3)

def test_calculate_results_for_three_test_variants(self):
variant_test_1 = Variant("A", 100, 10)
variant_test_2 = Variant("A", 100, 3)
variant_test_3 = Variant("A", 100, 30)
variant_control = Variant("B", 100, 18)
variant_test_2 = Variant("B", 100, 3)
variant_test_3 = Variant("C", 100, 30)
variant_control = Variant("D", 100, 18)

probabilities = ClickhouseFunnelExperimentResult.calculate_results(
variant_control, [variant_test_1, variant_test_2, variant_test_3]
Expand Down Expand Up @@ -285,11 +318,24 @@ def test_calculate_results_for_three_test_variants(self):
self.assertAlmostEqual(loss, 0.0004, places=2)
self.assertEqual(significant, ExperimentSignificanceCode.SIGNIFICANT)

credible_intervals = calculate_credible_intervals(
[variant_control, variant_test_1, variant_test_2, variant_test_3]
)
# Cross-checked with: https://www.causascientia.org/math_stat/ProportionCI.html
self.assertAlmostEqual(credible_intervals[variant_control.key][0], 0.7715, places=3)
self.assertAlmostEqual(credible_intervals[variant_control.key][1], 0.9010, places=3)
self.assertAlmostEqual(credible_intervals[variant_test_1.key][0], 0.8405, places=3)
self.assertAlmostEqual(credible_intervals[variant_test_1.key][1], 0.9494, places=3)
self.assertAlmostEqual(credible_intervals[variant_test_2.key][0], 0.9180, places=3)
self.assertAlmostEqual(credible_intervals[variant_test_2.key][1], 0.9894, places=3)
self.assertAlmostEqual(credible_intervals[variant_test_3.key][0], 0.6894, places=3)
self.assertAlmostEqual(credible_intervals[variant_test_3.key][1], 0.8332, places=3)

def test_calculate_results_for_three_test_variants_almost_equal(self):
variant_control = Variant("B", 130, 65)
variant_test_1 = Variant("A", 120, 60)
variant_test_2 = Variant("A", 110, 52)
variant_test_3 = Variant("A", 100, 46)
variant_test_2 = Variant("B", 110, 52)
variant_test_3 = Variant("C", 100, 46)
variant_control = Variant("D", 130, 65)

probabilities = ClickhouseFunnelExperimentResult.calculate_results(
variant_control, [variant_test_1, variant_test_2, variant_test_3]
Expand Down Expand Up @@ -318,11 +364,24 @@ def test_calculate_results_for_three_test_variants_almost_equal(self):
self.assertAlmostEqual(loss, 0.012, places=2)
self.assertEqual(significant, ExperimentSignificanceCode.HIGH_LOSS)

credible_intervals = calculate_credible_intervals(
[variant_control, variant_test_1, variant_test_2, variant_test_3]
)
# Cross-checked with: https://www.causascientia.org/math_stat/ProportionCI.html
self.assertAlmostEqual(credible_intervals[variant_control.key][0], 0.5977, places=3)
self.assertAlmostEqual(credible_intervals[variant_control.key][1], 0.7290, places=3)
self.assertAlmostEqual(credible_intervals[variant_test_1.key][0], 0.5948, places=3)
self.assertAlmostEqual(credible_intervals[variant_test_1.key][1], 0.7314, places=3)
self.assertAlmostEqual(credible_intervals[variant_test_2.key][0], 0.6035, places=3)
self.assertAlmostEqual(credible_intervals[variant_test_2.key][1], 0.7460, places=3)
self.assertAlmostEqual(credible_intervals[variant_test_3.key][0], 0.6054, places=3)
self.assertAlmostEqual(credible_intervals[variant_test_3.key][1], 0.7547, places=3)

def test_calculate_results_for_three_test_variants_much_better_than_control(self):
variant_control = Variant("B", 80, 65)
variant_test_1 = Variant("A", 130, 60)
variant_test_2 = Variant("A", 135, 62)
variant_test_3 = Variant("A", 132, 60)
variant_test_2 = Variant("B", 135, 62)
variant_test_3 = Variant("C", 132, 60)
variant_control = Variant("D", 80, 65)

probabilities = ClickhouseFunnelExperimentResult.calculate_results(
variant_control, [variant_test_1, variant_test_2, variant_test_3]
Expand All @@ -342,15 +401,28 @@ def test_calculate_results_for_three_test_variants_much_better_than_control(self
self.assertAlmostEqual(loss, 0, places=2)
self.assertEqual(significant, ExperimentSignificanceCode.SIGNIFICANT)

credible_intervals = calculate_credible_intervals(
[variant_control, variant_test_1, variant_test_2, variant_test_3]
)
# Cross-checked with: https://www.causascientia.org/math_stat/ProportionCI.html
self.assertAlmostEqual(credible_intervals[variant_control.key][0], 0.4703, places=3)
self.assertAlmostEqual(credible_intervals[variant_control.key][1], 0.6303, places=3)
self.assertAlmostEqual(credible_intervals[variant_test_1.key][0], 0.6148, places=3)
self.assertAlmostEqual(credible_intervals[variant_test_1.key][1], 0.7460, places=3)
self.assertAlmostEqual(credible_intervals[variant_test_2.key][0], 0.6172, places=3)
self.assertAlmostEqual(credible_intervals[variant_test_2.key][1], 0.7460, places=3)
self.assertAlmostEqual(credible_intervals[variant_test_3.key][0], 0.6186, places=3)
self.assertAlmostEqual(credible_intervals[variant_test_3.key][1], 0.7488, places=3)

def test_calculate_results_for_seven_test_variants(self):
variant_test_1 = Variant("A", 100, 17)
variant_test_2 = Variant("A", 100, 16)
variant_test_3 = Variant("A", 100, 30)
variant_test_4 = Variant("A", 100, 31)
variant_test_5 = Variant("A", 100, 29)
variant_test_6 = Variant("A", 100, 32)
variant_test_7 = Variant("A", 100, 33)
variant_control = Variant("B", 100, 18)
variant_test_2 = Variant("B", 100, 16)
variant_test_3 = Variant("C", 100, 30)
variant_test_4 = Variant("D", 100, 31)
variant_test_5 = Variant("E", 100, 29)
variant_test_6 = Variant("F", 100, 32)
variant_test_7 = Variant("G", 100, 33)
variant_control = Variant("H", 100, 18)

probabilities = ClickhouseFunnelExperimentResult.calculate_results(
variant_control,
Expand Down Expand Up @@ -407,6 +479,36 @@ def test_calculate_results_for_seven_test_variants(self):
self.assertAlmostEqual(loss, 1, places=2)
self.assertEqual(significant, ExperimentSignificanceCode.LOW_WIN_PROBABILITY)

credible_intervals = calculate_credible_intervals(
[
variant_control,
variant_test_1,
variant_test_2,
variant_test_3,
variant_test_4,
variant_test_5,
variant_test_6,
variant_test_7,
]
)
# Cross-checked with: https://www.causascientia.org/math_stat/ProportionCI.html
self.assertAlmostEqual(credible_intervals[variant_control.key][0], 0.7715, places=3)
self.assertAlmostEqual(credible_intervals[variant_control.key][1], 0.9010, places=3)
self.assertAlmostEqual(credible_intervals[variant_test_1.key][0], 0.7793, places=3)
self.assertAlmostEqual(credible_intervals[variant_test_1.key][1], 0.9070, places=3)
self.assertAlmostEqual(credible_intervals[variant_test_2.key][0], 0.7874, places=3)
self.assertAlmostEqual(credible_intervals[variant_test_2.key][1], 0.9130, places=3)
self.assertAlmostEqual(credible_intervals[variant_test_3.key][0], 0.6894, places=3)
self.assertAlmostEqual(credible_intervals[variant_test_3.key][1], 0.8332, places=3)
self.assertAlmostEqual(credible_intervals[variant_test_4.key][0], 0.6835, places=3)
self.assertAlmostEqual(credible_intervals[variant_test_4.key][1], 0.8278, places=3)
self.assertAlmostEqual(credible_intervals[variant_test_5.key][0], 0.6955, places=3)
self.assertAlmostEqual(credible_intervals[variant_test_5.key][1], 0.8385, places=3)
self.assertAlmostEqual(credible_intervals[variant_test_6.key][0], 0.6776, places=3)
self.assertAlmostEqual(credible_intervals[variant_test_6.key][1], 0.8226, places=3)
self.assertAlmostEqual(credible_intervals[variant_test_7.key][0], 0.6718, places=3)
self.assertAlmostEqual(credible_intervals[variant_test_7.key][1], 0.8174, places=3)

def test_calculate_results_control_is_significant(self):
variant_test = Variant("test", 100, 18)
variant_control = Variant("control", 100, 10)
Expand All @@ -422,6 +524,13 @@ def test_calculate_results_control_is_significant(self):
self.assertAlmostEqual(loss, 0.0016, places=3)
self.assertEqual(significant, ExperimentSignificanceCode.SIGNIFICANT)

credible_intervals = calculate_credible_intervals([variant_control, variant_test])
# Cross-checked with: https://www.causascientia.org/math_stat/ProportionCI.html
self.assertAlmostEqual(credible_intervals[variant_control.key][0], 0.8405, places=3)
self.assertAlmostEqual(credible_intervals[variant_control.key][1], 0.9494, places=3)
self.assertAlmostEqual(credible_intervals[variant_test.key][0], 0.7715, places=3)
self.assertAlmostEqual(credible_intervals[variant_test.key][1], 0.9010, places=3)

def test_calculate_results_many_variants_control_is_significant(self):
variant_test_1 = Variant("test_1", 100, 20)
variant_test_2 = Variant("test_2", 100, 21)
Expand Down Expand Up @@ -451,6 +560,33 @@ def test_calculate_results_many_variants_control_is_significant(self):
self.assertAlmostEqual(loss, 0.0008, places=3)
self.assertEqual(significant, ExperimentSignificanceCode.SIGNIFICANT)

credible_intervals = calculate_credible_intervals(
[
variant_control,
variant_test_1,
variant_test_2,
variant_test_3,
variant_test_4,
variant_test_5,
variant_test_6,
]
)
# Cross-checked with: https://www.causascientia.org/math_stat/ProportionCI.html
self.assertAlmostEqual(credible_intervals[variant_control.key][0], 0.8405, places=3)
self.assertAlmostEqual(credible_intervals[variant_control.key][1], 0.9494, places=3)
self.assertAlmostEqual(credible_intervals[variant_test_1.key][0], 0.7563, places=3)
self.assertAlmostEqual(credible_intervals[variant_test_1.key][1], 0.8892, places=3)
self.assertAlmostEqual(credible_intervals[variant_test_2.key][0], 0.7489, places=3)
self.assertAlmostEqual(credible_intervals[variant_test_2.key][1], 0.8834, places=3)
self.assertAlmostEqual(credible_intervals[variant_test_3.key][0], 0.7418, places=3)
self.assertAlmostEqual(credible_intervals[variant_test_3.key][1], 0.8776, places=3)
self.assertAlmostEqual(credible_intervals[variant_test_4.key][0], 0.7347, places=3)
self.assertAlmostEqual(credible_intervals[variant_test_4.key][1], 0.8718, places=3)
self.assertAlmostEqual(credible_intervals[variant_test_5.key][0], 0.7279, places=3)
self.assertAlmostEqual(credible_intervals[variant_test_5.key][1], 0.8661, places=3)
self.assertAlmostEqual(credible_intervals[variant_test_6.key][0], 0.7211, places=3)
self.assertAlmostEqual(credible_intervals[variant_test_6.key][1], 0.8605, places=3)


# calculation: https://www.evanmiller.org/bayesian-ab-testing.html#count_ab
def calculate_probability_of_winning_for_target_count_data(
Expand Down

0 comments on commit c9b77de

Please sign in to comment.