From 17ea023f733c984b2ca64ac6a679c21be5f75a94 Mon Sep 17 00:00:00 2001
From: Daniel Bachhuber
Date: Thu, 19 Dec 2024 14:44:48 -0800
Subject: [PATCH] Apply expected loss to trends continuous

---
 .../test/test_trends_statistics_continuous.py | 57 ++++++++++++
 .../trends_statistics_v2_continuous.py        | 88 +++++++++++++++++--
 2 files changed, 140 insertions(+), 5 deletions(-)

diff --git a/posthog/hogql_queries/experiments/test/test_trends_statistics_continuous.py b/posthog/hogql_queries/experiments/test/test_trends_statistics_continuous.py
index 0ed9d43a5684a..a2b3c0a54fa7e 100644
--- a/posthog/hogql_queries/experiments/test/test_trends_statistics_continuous.py
+++ b/posthog/hogql_queries/experiments/test/test_trends_statistics_continuous.py
@@ -474,3 +474,60 @@ def run_test(stats_version, calculate_probabilities, are_results_significant, ca
         self.assertAlmostEqual(intervals["test"][1], 0.0025, delta=0.0025)  # Upper bound ratio
 
         self.run_test_for_both_implementations(run_test)
+
+    def test_expected_loss_minimal_difference(self):
+        """Test expected loss when variants have very similar performance"""
+
+        def run_test(stats_version, calculate_probabilities, are_results_significant, calculate_credible_intervals):
+            control_absolute_exposure = 600
+            control = create_variant("control", mean=100.0, exposure=1, absolute_exposure=control_absolute_exposure)
+            test_absolute_exposure = 600
+            test = create_variant(
+                "test",
+                mean=120.0,  # Slightly higher mean
+                exposure=test_absolute_exposure / control_absolute_exposure,
+                absolute_exposure=test_absolute_exposure,
+            )
+
+            probabilities = calculate_probabilities(control, [test])
+            significance, expected_loss = are_results_significant(control, [test], probabilities)
+
+            if stats_version == 2:
+                self.assertEqual(significance, ExperimentSignificanceCode.SIGNIFICANT)
+                # Expected loss should be relatively small
+                self.assertLess(expected_loss, 3.0)  # Less than $3 expected loss
+                self.assertGreater(expected_loss, 0)  # But still some loss
+            else:
+                # Original implementation behavior (returns p_value in expected_loss)
+                self.assertEqual(significance, ExperimentSignificanceCode.HIGH_P_VALUE)
+                self.assertAlmostEqual(expected_loss, 0.2, delta=0.1)
+
+        self.run_test_for_both_implementations(run_test)
+
+    def test_expected_loss_test_variant_clear_winner(self):
+        """Test expected loss when one variant is clearly better"""
+
+        def run_test(stats_version, calculate_probabilities, are_results_significant, calculate_credible_intervals):
+            control_absolute_exposure = 10000
+            control = create_variant("control", mean=100.0, exposure=1, absolute_exposure=control_absolute_exposure)
+            test_absolute_exposure = 10000
+            test = create_variant(
+                "test",
+                mean=200.0,  # Much higher mean
+                exposure=test_absolute_exposure / control_absolute_exposure,
+                absolute_exposure=test_absolute_exposure,
+            )
+
+            probabilities = calculate_probabilities(control, [test])
+            significance, expected_loss = are_results_significant(control, [test], probabilities)
+
+            if stats_version == 2:
+                self.assertEqual(significance, ExperimentSignificanceCode.SIGNIFICANT)
+                # Expected loss should be very close to zero since test is clearly better
+                self.assertLess(expected_loss, 0.1)  # Essentially zero loss
+            else:
+                # Original implementation behavior
+                self.assertEqual(significance, ExperimentSignificanceCode.SIGNIFICANT)
+                self.assertLess(expected_loss, 0.001)
+
+        self.run_test_for_both_implementations(run_test)
diff --git a/posthog/hogql_queries/experiments/trends_statistics_v2_continuous.py b/posthog/hogql_queries/experiments/trends_statistics_v2_continuous.py
index 78d10ec9ace81..c0894302143ec 100644
--- a/posthog/hogql_queries/experiments/trends_statistics_v2_continuous.py
+++ b/posthog/hogql_queries/experiments/trends_statistics_v2_continuous.py
@@ -1,6 +1,10 @@
 from rest_framework.exceptions import ValidationError
 from sentry_sdk import capture_exception
-from posthog.hogql_queries.experiments import FF_DISTRIBUTION_THRESHOLD, MIN_PROBABILITY_FOR_SIGNIFICANCE
+from posthog.hogql_queries.experiments import (
+    FF_DISTRIBUTION_THRESHOLD,
+    MIN_PROBABILITY_FOR_SIGNIFICANCE,
+    EXPECTED_LOSS_SIGNIFICANCE_LEVEL,
+)
 from posthog.schema import ExperimentSignificanceCode, ExperimentVariantTrendsBaseStats
 from scipy.stats import t
 import numpy as np
@@ -123,7 +127,7 @@ def are_results_significant_v2_continuous(
     --------
     tuple[ExperimentSignificanceCode, float]
         - ExperimentSignificanceCode indicating the significance status
-        - Probability value
+        - Expected loss value for significant results, 1.0 for non-significant results
     """
     # Check exposure thresholds
     for variant in test_variants:
@@ -137,10 +141,22 @@
     max_probability = max(probabilities)
 
     # Check if any variant has a high enough probability of being best
-    if max_probability < MIN_PROBABILITY_FOR_SIGNIFICANCE:
-        return ExperimentSignificanceCode.LOW_WIN_PROBABILITY, 1.0
+    if max_probability >= MIN_PROBABILITY_FOR_SIGNIFICANCE:
+        # Find best performing variant
+        all_variants = [control_variant, *test_variants]
+        means = [v.count for v in all_variants]  # count field stores mean value
+        best_idx = np.argmax(means)
+        best_variant = all_variants[best_idx]
+        other_variants = all_variants[:best_idx] + all_variants[best_idx + 1 :]
 
-    return ExperimentSignificanceCode.SIGNIFICANT, 0.0
+        expected_loss = calculate_expected_loss_v2_continuous(best_variant, other_variants)
+
+        if expected_loss >= EXPECTED_LOSS_SIGNIFICANCE_LEVEL:
+            return ExperimentSignificanceCode.HIGH_LOSS, expected_loss
+
+        return ExperimentSignificanceCode.SIGNIFICANT, expected_loss
+
+    return ExperimentSignificanceCode.LOW_WIN_PROBABILITY, 1.0
 
 
 def calculate_credible_intervals_v2_continuous(variants, lower_bound=0.025, upper_bound=0.975):
@@ -193,3 +209,65 @@ def calculate_credible_intervals_v2_continuous(variants, lower_bound=0.025, uppe
         return {}
 
     return intervals
+
+
+def calculate_expected_loss_v2_continuous(
+    target_variant: ExperimentVariantTrendsBaseStats, variants: list[ExperimentVariantTrendsBaseStats]
+) -> float:
+    """
+    Calculates expected loss in mean value using Normal-Inverse-Gamma conjugate prior.
+
+    This implementation uses a Bayesian approach with Normal-Inverse-Gamma model
+    to estimate the expected loss when choosing the target variant over others.
+    The data is log-transformed to handle typical revenue/continuous metric distributions.
+
+    Parameters:
+    -----------
+    target_variant : ExperimentVariantTrendsBaseStats
+        The variant being evaluated for loss
+    variants : list[ExperimentVariantTrendsBaseStats]
+        List of other variants to compare against
+
+    Returns:
+    --------
+    float
+        Expected loss in mean value if choosing the target variant
+    """
+    # Calculate posterior parameters for target variant
+    log_target_mean = np.log(target_variant.count + EPSILON)
+
+    # Update parameters for target variant
+    kappa_n_target = KAPPA_0 + target_variant.absolute_exposure
+    mu_n_target = (KAPPA_0 * MU_0 + target_variant.absolute_exposure * log_target_mean) / kappa_n_target
+    alpha_n_target = ALPHA_0 + target_variant.absolute_exposure / 2
+    beta_n_target = BETA_0 + 0.5 * target_variant.absolute_exposure * LOG_VARIANCE
+
+    # Draw samples from target variant's posterior
+    target_posterior = t(
+        df=2 * alpha_n_target, loc=mu_n_target, scale=np.sqrt(beta_n_target / (kappa_n_target * alpha_n_target))
+    )
+    target_samples = target_posterior.rvs(SAMPLE_SIZE)
+
+    # Draw samples from each comparison variant's posterior
+    variant_samples = []
+    for variant in variants:
+        log_variant_mean = np.log(variant.count + EPSILON)
+
+        kappa_n = KAPPA_0 + variant.absolute_exposure
+        mu_n = (KAPPA_0 * MU_0 + variant.absolute_exposure * log_variant_mean) / kappa_n
+        alpha_n = ALPHA_0 + variant.absolute_exposure / 2
+        beta_n = BETA_0 + 0.5 * variant.absolute_exposure * LOG_VARIANCE
+
+        variant_posterior = t(df=2 * alpha_n, loc=mu_n, scale=np.sqrt(beta_n / (kappa_n * alpha_n)))
+        variant_samples.append(variant_posterior.rvs(SAMPLE_SIZE))
+
+    # Transform samples back from log space
+    target_samples = np.exp(target_samples) - EPSILON
+    variant_samples = [np.exp(samples) - EPSILON for samples in variant_samples]
+
+    # Calculate loss
+    variant_max = np.maximum.reduce(variant_samples)
+    losses = np.maximum(0, variant_max - target_samples)
+    expected_loss = float(np.mean(losses))
+
+    return expected_loss
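
For intuition, the loss calculation can be exercised outside the experiment pipeline. The sketch below mirrors the posterior updates and Monte Carlo loss estimate from calculate_expected_loss_v2_continuous for the two-variant case in test_expected_loss_minimal_difference. The prior constants (KAPPA_0, MU_0, ALPHA_0, BETA_0, LOG_VARIANCE, EPSILON, SAMPLE_SIZE) live at module scope and are not shown in this diff, so the values below are illustrative placeholders, not the ones PostHog ships.

import numpy as np
from scipy.stats import t

# Placeholder priors -- the real module-level constants are outside this diff.
KAPPA_0 = 1.0        # prior pseudo-observations backing MU_0
MU_0 = 0.0           # prior mean of the log-transformed metric
ALPHA_0 = 1.0        # prior shape for the variance term
BETA_0 = 1.0         # prior scale for the variance term
LOG_VARIANCE = 0.75  # assumed variance of the log-transformed data
EPSILON = 1e-10      # keeps log() finite for zero means
SAMPLE_SIZE = 10000  # Monte Carlo draws per variant


def posterior_mean_samples(mean, absolute_exposure, seed):
    """Posterior draws for one variant, following the same updates as the patch."""
    log_mean = np.log(mean + EPSILON)
    kappa_n = KAPPA_0 + absolute_exposure
    mu_n = (KAPPA_0 * MU_0 + absolute_exposure * log_mean) / kappa_n
    alpha_n = ALPHA_0 + absolute_exposure / 2
    beta_n = BETA_0 + 0.5 * absolute_exposure * LOG_VARIANCE
    # Marginal posterior of the mean is a Student's t in log space.
    posterior = t(df=2 * alpha_n, loc=mu_n, scale=np.sqrt(beta_n / (kappa_n * alpha_n)))
    return np.exp(posterior.rvs(SAMPLE_SIZE, random_state=seed)) - EPSILON


control = posterior_mean_samples(mean=100.0, absolute_exposure=600, seed=1)
test = posterior_mean_samples(mean=120.0, absolute_exposure=600, seed=2)

# Expected loss of shipping "test": the average shortfall in the draws where
# control actually beats test; it approaches zero when test clearly wins.
expected_loss = float(np.mean(np.maximum(0, control - test)))
print(f"expected loss of choosing test: {expected_loss:.2f}")

With the 100-vs-120 means and 600 exposures from the first test, the loss comes out small but positive; rerunning with the clear-winner inputs (means 100 vs 200, exposures of 10000) drives it toward zero, which is what the second test asserts.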
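The control-flow change in are_results_significant_v2_continuous also changes the meaning of the second return value: it used to be a constant (1.0 or 0.0) and now carries the expected loss whenever a winner is probable. Below is a toy mirror of the new decision order, with a stand-in enum for ExperimentSignificanceCode and placeholder thresholds; the real MIN_PROBABILITY_FOR_SIGNIFICANCE and EXPECTED_LOSS_SIGNIFICANCE_LEVEL are defined in posthog/hogql_queries/experiments, outside this diff.

from enum import Enum

# Placeholder thresholds; the shipped constants are imported at the top of the patched file.
MIN_PROBABILITY_FOR_SIGNIFICANCE = 0.9
EXPECTED_LOSS_SIGNIFICANCE_LEVEL = 0.01


class Code(Enum):  # stand-in for ExperimentSignificanceCode
    LOW_WIN_PROBABILITY = "low_win_probability"
    HIGH_LOSS = "high_loss"
    SIGNIFICANT = "significant"


def decide(max_probability: float, expected_loss: float) -> tuple[Code, float]:
    """Mirrors the patched branch order: win probability first, then loss size."""
    if max_probability < MIN_PROBABILITY_FOR_SIGNIFICANCE:
        return Code.LOW_WIN_PROBABILITY, 1.0  # sentinel, not a real loss
    if expected_loss >= EXPECTED_LOSS_SIGNIFICANCE_LEVEL:
        return Code.HIGH_LOSS, expected_loss
    return Code.SIGNIFICANT, expected_loss


assert decide(0.5, 0.0)[0] is Code.LOW_WIN_PROBABILITY
assert decide(0.95, 0.5)[0] is Code.HIGH_LOSS
assert decide(0.95, 0.001)[0] is Code.SIGNIFICANT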