diff --git a/bin/mprocs.yaml b/bin/mprocs.yaml
index cb3761250d035..f57efc845d834 100644
--- a/bin/mprocs.yaml
+++ b/bin/mprocs.yaml
@@ -19,8 +19,8 @@ procs:
     shell: 'bin/check_kafka_clickhouse_up && bin/check_temporal_up && python manage.py start_temporal_worker'
 
   docker-compose:
-    shell: 'docker compose -f docker-compose.dev.yml up'
-    stop:
-      send-keys: ['']
+    # docker-compose makes sure the stack is up, and then follows its logs - but doesn't tear down on exit for speed
+    shell: 'docker compose -f docker-compose.dev.yml up -d && docker compose -f docker-compose.dev.yml logs --tail=0 -f'
 
 mouse_scroll_speed: 1
+scrollback: 10000
diff --git a/bin/start b/bin/start
index ceaddede2140f..7ec881e57b81c 100755
--- a/bin/start
+++ b/bin/start
@@ -9,4 +9,14 @@ export HOG_HOOK_URL=${HOG_HOOK_URL:-http://localhost:3300/hoghook}
 
 [ ! -f ./share/GeoLite2-City.mmdb ] && ( curl -L "https://mmdbcdn.posthog.net/" --http1.1 | brotli --decompress --output=./share/GeoLite2-City.mmdb )
 
+if ! command -v mprocs &> /dev/null; then
+    if command -v brew &> /dev/null; then
+        echo "🔁 Installing mprocs via Homebrew..."
+        brew install mprocs
+    else
+        echo "👉 To run bin/start, install mprocs: https://github.com/pvolok/mprocs#installation"
+        exit 1
+    fi
+fi
+
 exec mprocs --config bin/mprocs.yaml
diff --git a/cypress/e2e/experiments.cy.ts b/cypress/e2e/experiments.cy.ts
index 34898f3146536..e0e8339920bee 100644
--- a/cypress/e2e/experiments.cy.ts
+++ b/cypress/e2e/experiments.cy.ts
@@ -1,5 +1,3 @@
-import { setupFeatureFlags } from '../support/decide'
-
 describe('Experiments', () => {
     let randomNum
     let experimentName
@@ -47,10 +45,6 @@ describe('Experiments', () => {
     })
 
     const createExperimentInNewUi = (): void => {
-        setupFeatureFlags({
-            'new-experiments-ui': true,
-        })
-
         cy.visit('/experiments')
 
         // Name, flag key, description
diff --git a/cypress/e2e/insights-reload-query.ts b/cypress/e2e/insights-reload-query.ts
index 2f944f8993da7..55685ae309fb8 100644
--- a/cypress/e2e/insights-reload-query.ts
+++ b/cypress/e2e/insights-reload-query.ts
@@ -1,5 +1,3 @@
-import JSONCrush from 'jsoncrush'
-
 describe('ReloadInsight component', () => {
     beforeEach(() => {
         // Clear local storage before each test to ensure a clean state
@@ -21,8 +19,7 @@ describe('ReloadInsight component', () => {
         const draftQuery = window.localStorage.getItem(`draft-query-${currentTeamId}`)
         expect(draftQuery).to.not.be.null
 
-        const draftQueryObjUncrushed = JSONCrush.uncrush(draftQuery)
-        const draftQueryObj = JSON.parse(draftQueryObjUncrushed)
+        const draftQueryObj = JSON.parse(draftQuery)
 
         expect(draftQueryObj).to.have.property('query')
diff --git a/ee/hogai/eval/conftest.py b/ee/hogai/eval/conftest.py
index d0bc75348eeac..1a88ebffa2e33 100644
--- a/ee/hogai/eval/conftest.py
+++ b/ee/hogai/eval/conftest.py
@@ -1,28 +1,107 @@
+import functools
+from collections.abc import Generator
+from pathlib import Path
+
 import pytest
+from django.conf import settings
+from django.test import override_settings
+from langchain_core.runnables import RunnableConfig
+
+from ee.models import Conversation
+from posthog.demo.matrix.manager import MatrixManager
+from posthog.models import Organization, Project, Team, User
+from posthog.tasks.demo_create_data import HedgeboxMatrix
+from posthog.test.base import BaseTest
+
+
+# Flaky is a handy tool, but it always runs setup fixtures for retries.
+# This decorator will just retry without re-running setup.
+def retry_test_only(max_retries=3):
+    def decorator(func):
+        @functools.wraps(func)
+        def wrapper(*args, **kwargs):
+            last_error: Exception | None = None
+            for attempt in range(max_retries):
+                try:
+                    return func(*args, **kwargs)
+                except Exception as e:
+                    last_error = e
+                    print(f"\nRetrying test (attempt {attempt + 1}/{max_retries})...")  # noqa
+            if last_error:
+                raise last_error
+
+        return wrapper
+
+    return decorator
+
+
+# Apply decorators to all tests in the package.
+def pytest_collection_modifyitems(items):
+    current_dir = Path(__file__).parent
+    for item in items:
+        if Path(item.fspath).is_relative_to(current_dir):
+            item.add_marker(
+                pytest.mark.skipif(not settings.IN_EVAL_TESTING, reason="Only runs for the assistant evaluation")
+            )
+            # Apply our custom retry decorator to the test function
+            item.obj = retry_test_only(max_retries=3)(item.obj)
+
+
+@pytest.fixture(scope="package")
+def team(django_db_blocker) -> Generator[Team, None, None]:
+    with django_db_blocker.unblock():
+        organization = Organization.objects.create(name=BaseTest.CONFIG_ORGANIZATION_NAME)
+        project = Project.objects.create(id=Team.objects.increment_id_sequence(), organization=organization)
+        team = Team.objects.create(
+            id=project.id,
+            project=project,
+            organization=organization,
+            test_account_filters=[
+                {
+                    "key": "email",
+                    "value": "@posthog.com",
+                    "operator": "not_icontains",
+                    "type": "person",
+                }
+            ],
+            has_completed_onboarding_for={"product_analytics": True},
+        )
+        yield team
+        organization.delete()
 
-from posthog.test.base import run_clickhouse_statement_in_parallel
 
+@pytest.fixture(scope="package")
+def user(team, django_db_blocker) -> Generator[User, None, None]:
+    with django_db_blocker.unblock():
+        user = User.objects.create_and_join(team.organization, "eval@posthog.com", "password1234")
+        yield user
+        user.delete()
 
-@pytest.fixture(scope="module", autouse=True)
-def setup_kafka_tables(django_db_setup):
-    from posthog.clickhouse.client import sync_execute
-    from posthog.clickhouse.schema import (
-        CREATE_KAFKA_TABLE_QUERIES,
-        build_query,
-    )
-    from posthog.settings import CLICKHOUSE_CLUSTER, CLICKHOUSE_DATABASE
 
-    kafka_queries = list(map(build_query, CREATE_KAFKA_TABLE_QUERIES))
-    run_clickhouse_statement_in_parallel(kafka_queries)
+@pytest.mark.django_db(transaction=True)
+@pytest.fixture
+def runnable_config(team, user) -> Generator[RunnableConfig, None, None]:
+    conversation = Conversation.objects.create(team=team, user=user)
+    yield {
+        "configurable": {
+            "thread_id": conversation.id,
+        }
+    }
+    conversation.delete()
 
-    yield
 
-    kafka_tables = sync_execute(
-        f"""
-        SELECT name
-        FROM system.tables
-        WHERE database = '{CLICKHOUSE_DATABASE}' AND name LIKE 'kafka_%'
-        """,
-    )
-    kafka_truncate_queries = [f"DROP TABLE {table[0]} ON CLUSTER '{CLICKHOUSE_CLUSTER}'" for table in kafka_tables]
-    run_clickhouse_statement_in_parallel(kafka_truncate_queries)
+@pytest.fixture(scope="package", autouse=True)
+def setup_test_data(django_db_setup, team, user, django_db_blocker):
+    with django_db_blocker.unblock():
+        matrix = HedgeboxMatrix(
+            seed="b1ef3c66-5f43-488a-98be-6b46d92fbcef",  # this seed generates all events
+            days_past=120,
+            days_future=30,
+            n_clusters=500,
+            group_type_index_offset=0,
+        )
+        matrix_manager = MatrixManager(matrix, print_steps=True)
+        with override_settings(TEST=False):
+            # Simulation saving should occur in non-test mode, so that Kafka isn't mocked. Normally in tests we don't
+            # want to ingest via Kafka, but simulation saving is specifically designed to use that route for speed
+            matrix_manager.run_on_team(team, user)
diff --git a/ee/hogai/eval/tests/test_eval_funnel_generator.py b/ee/hogai/eval/tests/test_eval_funnel_generator.py
index 4d7876ca6f73c..5f0f29243296a 100644
--- a/ee/hogai/eval/tests/test_eval_funnel_generator.py
+++ b/ee/hogai/eval/tests/test_eval_funnel_generator.py
@@ -1,40 +1,46 @@
+from collections.abc import Callable
 from typing import cast
 
+import pytest
 from langgraph.graph.state import CompiledStateGraph
 
 from ee.hogai.assistant import AssistantGraph
-from ee.hogai.eval.utils import EvalBaseTest
 from ee.hogai.utils.types import AssistantNodeName, AssistantState
 from posthog.schema import AssistantFunnelsQuery, HumanMessage, VisualizationMessage
 
 
-class TestEvalFunnelGenerator(EvalBaseTest):
-    def _call_node(self, query: str, plan: str) -> AssistantFunnelsQuery:
-        graph: CompiledStateGraph = (
-            AssistantGraph(self.team)
-            .add_edge(AssistantNodeName.START, AssistantNodeName.FUNNEL_GENERATOR)
-            .add_funnel_generator(AssistantNodeName.END)
-            .compile()
-        )
+@pytest.fixture
+def call_node(team, runnable_config) -> Callable[[str, str], AssistantFunnelsQuery]:
+    graph: CompiledStateGraph = (
+        AssistantGraph(team)
+        .add_edge(AssistantNodeName.START, AssistantNodeName.FUNNEL_GENERATOR)
+        .add_funnel_generator(AssistantNodeName.END)
+        .compile()
+    )
+
+    def callable(query: str, plan: str) -> AssistantFunnelsQuery:
         state = graph.invoke(
             AssistantState(messages=[HumanMessage(content=query)], plan=plan),
-            self._get_config(),
+            runnable_config,
         )
         return cast(VisualizationMessage, AssistantState.model_validate(state).messages[-1]).answer
 
-    def test_node_replaces_equals_with_contains(self):
-        query = "what is the conversion rate from a page view to sign up for users with name John?"
-        plan = """Sequence:
-        1. $pageview
-        - property filter 1
-            - person
-            - name
-            - equals
-            - John
-        2. signed_up
-        """
-        actual_output = self._call_node(query, plan).model_dump_json(exclude_none=True)
-        assert "exact" not in actual_output
-        assert "icontains" in actual_output
-        assert "John" not in actual_output
-        assert "john" in actual_output
+    return callable
+
+
+def test_node_replaces_equals_with_contains(call_node):
+    query = "what is the conversion rate from a page view to sign up for users with name John?"
+    plan = """Sequence:
+    1. $pageview
+    - property filter 1
+        - person
+        - name
+        - equals
+        - John
+    2. 
signed_up + """ + actual_output = call_node(query, plan).model_dump_json(exclude_none=True) + assert "exact" not in actual_output + assert "icontains" in actual_output + assert "John" not in actual_output + assert "john" in actual_output diff --git a/ee/hogai/eval/tests/test_eval_funnel_planner.py b/ee/hogai/eval/tests/test_eval_funnel_planner.py index 9adbd75e77c6c..c8bc25bc0b5dc 100644 --- a/ee/hogai/eval/tests/test_eval_funnel_planner.py +++ b/ee/hogai/eval/tests/test_eval_funnel_planner.py @@ -1,208 +1,224 @@ +from collections.abc import Callable + +import pytest from deepeval import assert_test from deepeval.metrics import GEval from deepeval.test_case import LLMTestCase, LLMTestCaseParams +from langchain_core.runnables.config import RunnableConfig from langgraph.graph.state import CompiledStateGraph from ee.hogai.assistant import AssistantGraph -from ee.hogai.eval.utils import EvalBaseTest from ee.hogai.utils.types import AssistantNodeName, AssistantState from posthog.schema import HumanMessage -class TestEvalFunnelPlanner(EvalBaseTest): - def _get_plan_correctness_metric(self): - return GEval( - name="Funnel Plan Correctness", - criteria="You will be given expected and actual generated plans to provide a taxonomy to answer a user's question with a funnel insight. Compare the plans to determine whether the taxonomy of the actual plan matches the expected plan. Do not apply general knowledge about funnel insights.", - evaluation_steps=[ - "A plan must define at least two series in the sequence, but it is not required to define any filters, exclusion steps, or a breakdown.", - "Compare events, properties, math types, and property values of 'expected output' and 'actual output'. Do not penalize if the actual output does not include a timeframe.", - "Check if the combination of events, properties, and property values in 'actual output' can answer the user's question according to the 'expected output'.", - # The criteria for aggregations must be more specific because there isn't a way to bypass them. - "Check if the math types in 'actual output' match those in 'expected output.' If the aggregation type is specified by a property, user, or group in 'expected output', the same property, user, or group must be used in 'actual output'.", - "If 'expected output' contains exclusion steps, check if 'actual output' contains those, and heavily penalize if the exclusion steps are not present or different.", - "If 'expected output' contains a breakdown, check if 'actual output' contains a similar breakdown, and heavily penalize if the breakdown is not present or different. Plans may only have one breakdown.", - # We don't want to see in the output unnecessary property filters. The assistant tries to use them all the time. - "Heavily penalize if the 'actual output' contains any excessive output not present in the 'expected output'. For example, the `is set` operator in filters should not be used unless the user explicitly asks for it.", - ], - evaluation_params=[ - LLMTestCaseParams.INPUT, - LLMTestCaseParams.EXPECTED_OUTPUT, - LLMTestCaseParams.ACTUAL_OUTPUT, - ], - threshold=0.7, - ) +@pytest.fixture(scope="module") +def metric(): + return GEval( + name="Funnel Plan Correctness", + criteria="You will be given expected and actual generated plans to provide a taxonomy to answer a user's question with a funnel insight. Compare the plans to determine whether the taxonomy of the actual plan matches the expected plan. 
Do not apply general knowledge about funnel insights.", + evaluation_steps=[ + "A plan must define at least two series in the sequence, but it is not required to define any filters, exclusion steps, or a breakdown.", + "Compare events, properties, math types, and property values of 'expected output' and 'actual output'. Do not penalize if the actual output does not include a timeframe.", + "Check if the combination of events, properties, and property values in 'actual output' can answer the user's question according to the 'expected output'.", + # The criteria for aggregations must be more specific because there isn't a way to bypass them. + "Check if the math types in 'actual output' match those in 'expected output.' If the aggregation type is specified by a property, user, or group in 'expected output', the same property, user, or group must be used in 'actual output'.", + "If 'expected output' contains exclusion steps, check if 'actual output' contains those, and heavily penalize if the exclusion steps are not present or different.", + "If 'expected output' contains a breakdown, check if 'actual output' contains a similar breakdown, and heavily penalize if the breakdown is not present or different. Plans may only have one breakdown.", + # We don't want to see in the output unnecessary property filters. The assistant tries to use them all the time. + "Heavily penalize if the 'actual output' contains any excessive output not present in the 'expected output'. For example, the `is set` operator in filters should not be used unless the user explicitly asks for it.", + ], + evaluation_params=[ + LLMTestCaseParams.INPUT, + LLMTestCaseParams.EXPECTED_OUTPUT, + LLMTestCaseParams.ACTUAL_OUTPUT, + ], + threshold=0.7, + ) - def _call_node(self, query): - graph: CompiledStateGraph = ( - AssistantGraph(self.team) - .add_edge(AssistantNodeName.START, AssistantNodeName.FUNNEL_PLANNER) - .add_funnel_planner(AssistantNodeName.END) - .compile() - ) + +@pytest.fixture +def call_node(team, runnable_config: RunnableConfig) -> Callable[[str], str]: + graph: CompiledStateGraph = ( + AssistantGraph(team) + .add_edge(AssistantNodeName.START, AssistantNodeName.FUNNEL_PLANNER) + .add_funnel_planner(AssistantNodeName.END) + .compile() + ) + + def callable(query: str) -> str: state = graph.invoke( AssistantState(messages=[HumanMessage(content=query)]), - self._get_config(), + runnable_config, ) return AssistantState.model_validate(state).plan or "" - def test_basic_funnel(self): - query = "what was the conversion from a page view to sign up?" - test_case = LLMTestCase( - input=query, - expected_output=""" - Sequence: - 1. $pageview - 2. signed_up - """, - actual_output=self._call_node(query), - ) - assert_test(test_case, [self._get_plan_correctness_metric()]) - - def test_outputs_at_least_two_events(self): - """ - Ambigious query. The funnel must return at least two events. - """ - query = "how many users paid a bill?" - test_case = LLMTestCase( - input=query, - expected_output=""" - Sequence: - 1. any event - 2. upgrade_plan - """, - actual_output=self._call_node(query), - ) - assert_test(test_case, [self._get_plan_correctness_metric()]) - - def test_no_excessive_property_filters(self): - query = "Show the user conversion from a sign up to a file download" - test_case = LLMTestCase( - input=query, - expected_output=""" - Sequence: - 1. signed_up - 2. 
downloaded_file - """, - actual_output=self._call_node(query), - ) - assert_test(test_case, [self._get_plan_correctness_metric()]) + return callable - def test_basic_filtering(self): - query = ( - "What was the conversion from uploading a file to downloading it from Chrome and Safari in the last 30d?" - ) - test_case = LLMTestCase( - input=query, - expected_output=""" - Sequence: - 1. uploaded_file - - property filter 1: - - entity: event - - property name: $browser - - property type: String - - operator: equals - - property value: Chrome - - property filter 2: - - entity: event - - property name: $browser - - property type: String - - operator: equals - - property value: Safari - 2. downloaded_file - - property filter 1: - - entity: event - - property name: $browser - - property type: String - - operator: equals - - property value: Chrome - - property filter 2: - - entity: event - - property name: $browser - - property type: String - - operator: equals - - property value: Safari - """, - actual_output=self._call_node(query), - ) - assert_test(test_case, [self._get_plan_correctness_metric()]) - - def test_exclusion_steps(self): - query = "What was the conversion from uploading a file to downloading it in the last 30d excluding users that deleted a file?" - test_case = LLMTestCase( - input=query, - expected_output=""" - Sequence: - 1. uploaded_file - 2. downloaded_file - - Exclusions: - - deleted_file - - start index: 0 - - end index: 1 - """, - actual_output=self._call_node(query), - ) - assert_test(test_case, [self._get_plan_correctness_metric()]) - - def test_breakdown(self): - query = "Show a conversion from uploading a file to downloading it segmented by a browser" - test_case = LLMTestCase( - input=query, - expected_output=""" - Sequence: - 1. uploaded_file - 2. downloaded_file - - Breakdown by: - - entity: event - - property name: $browser - """, - actual_output=self._call_node(query), - ) - assert_test(test_case, [self._get_plan_correctness_metric()]) - - def test_needle_in_a_haystack(self): - query = "What was the conversion from a sign up to a paying customer on the personal-pro plan?" - test_case = LLMTestCase( - input=query, - expected_output=""" - Sequence: - 1. signed_up - 2. paid_bill - - property filter 1: - - entity: event - - property name: plan - - property type: String - - operator: equals - - property value: personal/pro - """, - actual_output=self._call_node(query), - ) - assert_test(test_case, [self._get_plan_correctness_metric()]) - - def test_planner_outputs_multiple_series_from_a_single_series_question(self): - query = "What's our sign-up funnel?" - test_case = LLMTestCase( - input=query, - expected_output=""" - Sequence: - 1. $pageview - 2. signed_up - """, - actual_output=self._call_node(query), - ) - assert_test(test_case, [self._get_plan_correctness_metric()]) - - def test_funnel_does_not_include_timeframe(self): - query = "what was the conversion from a page view to sign up for event time before 2024-01-01?" - test_case = LLMTestCase( - input=query, - expected_output=""" - Sequence: - 1. $pageview - 2. signed_up - """, - actual_output=self._call_node(query), - ) - assert_test(test_case, [self._get_plan_correctness_metric()]) + +def test_basic_funnel(metric, call_node): + query = "what was the conversion from a page view to sign up?" + test_case = LLMTestCase( + input=query, + expected_output=""" + Sequence: + 1. $pageview + 2. 
signed_up
+    """
+    actual_output = call_node(query, plan).model_dump_json(exclude_none=True)
+    assert "exact" not in actual_output
+    assert "icontains" in actual_output
+    assert "John" not in actual_output
+    assert "john" in actual_output
+
+
+def test_outputs_at_least_two_events(metric, call_node):
+    """
+    Ambiguous query. The funnel must return at least two events.
+    """
+    query = "how many users paid a bill?"
+    test_case = LLMTestCase(
+        input=query,
+        expected_output="""
+        Sequence:
+        1. any event
+        2. upgrade_plan
+        """,
+        actual_output=call_node(query),
+    )
+    assert_test(test_case, [metric])
+
+
+def test_no_excessive_property_filters(metric, call_node):
+    query = "Show the user conversion from a sign up to a file download"
+    test_case = LLMTestCase(
+        input=query,
+        expected_output="""
+        Sequence:
+        1. signed_up
+        2. downloaded_file
+        """,
+        actual_output=call_node(query),
+    )
+    assert_test(test_case, [metric])
+
+
+def test_basic_filtering(metric, call_node):
+    query = "What was the conversion from uploading a file to downloading it from Chrome and Safari in the last 30d?"
+    test_case = LLMTestCase(
+        input=query,
+        expected_output="""
+        Sequence:
+        1. uploaded_file
+            - property filter 1:
+                - entity: event
+                - property name: $browser
+                - property type: String
+                - operator: equals
+                - property value: Chrome
+            - property filter 2:
+                - entity: event
+                - property name: $browser
+                - property type: String
+                - operator: equals
+                - property value: Safari
+        2. downloaded_file
+            - property filter 1:
+                - entity: event
+                - property name: $browser
+                - property type: String
+                - operator: equals
+                - property value: Chrome
+            - property filter 2:
+                - entity: event
+                - property name: $browser
+                - property type: String
+                - operator: equals
+                - property value: Safari
+        """,
+        actual_output=call_node(query),
+    )
+    assert_test(test_case, [metric])
+
+
+def test_exclusion_steps(metric, call_node):
+    query = "What was the conversion from uploading a file to downloading it in the last 30d excluding users that deleted a file?"
+    test_case = LLMTestCase(
+        input=query,
+        expected_output="""
+        Sequence:
+        1. uploaded_file
+        2. downloaded_file
+
+        Exclusions:
+        - deleted_file
+            - start index: 0
+            - end index: 1
+        """,
+        actual_output=call_node(query),
+    )
+    assert_test(test_case, [metric])
+
+
+def test_breakdown(metric, call_node):
+    query = "Show a conversion from uploading a file to downloading it segmented by a browser"
+    test_case = LLMTestCase(
+        input=query,
+        expected_output="""
+        Sequence:
+        1. uploaded_file
+        2. downloaded_file
+
+        Breakdown by:
+        - entity: event
+        - property name: $browser
+        """,
+        actual_output=call_node(query),
+    )
+    assert_test(test_case, [metric])
+
+
+def test_needle_in_a_haystack(metric, call_node):
+    query = "What was the conversion from a sign up to a paying customer on the personal-pro plan?"
+    test_case = LLMTestCase(
+        input=query,
+        expected_output="""
+        Sequence:
+        1. signed_up
+        2. paid_bill
+            - property filter 1:
+                - entity: event
+                - property name: plan
+                - property type: String
+                - operator: equals
+                - property value: personal/pro
+        """,
+        actual_output=call_node(query),
+    )
+    assert_test(test_case, [metric])
+
+
+def test_planner_outputs_multiple_series_from_a_single_series_question(metric, call_node):
+    query = "What's our sign-up funnel?"
+    test_case = LLMTestCase(
+        input=query,
+        expected_output="""
+        Sequence:
+        1. $pageview
+        2. signed_up
+        """,
+        actual_output=call_node(query),
+    )
+    assert_test(test_case, [metric])
+
+
+def test_funnel_does_not_include_timeframe(metric, call_node):
+    query = "what was the conversion from a page view to sign up for event time before 2024-01-01?"
+ test_case = LLMTestCase( + input=query, + expected_output=""" + Sequence: + 1. $pageview + 2. signed_up + """, + actual_output=call_node(query), + ) + assert_test(test_case, [metric]) diff --git a/ee/hogai/eval/tests/test_eval_router.py b/ee/hogai/eval/tests/test_eval_router.py index c1307e9d40f00..84e5c4c809972 100644 --- a/ee/hogai/eval/tests/test_eval_router.py +++ b/ee/hogai/eval/tests/test_eval_router.py @@ -1,69 +1,80 @@ +from collections.abc import Callable from typing import cast +import pytest from langgraph.graph.state import CompiledStateGraph from ee.hogai.assistant import AssistantGraph -from ee.hogai.eval.utils import EvalBaseTest from ee.hogai.utils.types import AssistantNodeName, AssistantState from posthog.schema import HumanMessage, RouterMessage -class TestEvalRouter(EvalBaseTest): - def _call_node(self, query: str | list): - graph: CompiledStateGraph = ( - AssistantGraph(self.team) - .add_start() - .add_router(path_map={"trends": AssistantNodeName.END, "funnel": AssistantNodeName.END}) - .compile() - ) +@pytest.fixture +def call_node(team, runnable_config) -> Callable[[str | list], str]: + graph: CompiledStateGraph = ( + AssistantGraph(team) + .add_start() + .add_router(path_map={"trends": AssistantNodeName.END, "funnel": AssistantNodeName.END}) + .compile() + ) + + def callable(query: str | list) -> str: messages = [HumanMessage(content=query)] if isinstance(query, str) else query state = graph.invoke( AssistantState(messages=messages), - self._get_config(), + runnable_config, ) return cast(RouterMessage, AssistantState.model_validate(state).messages[-1]).content - def test_outputs_basic_trends_insight(self): - query = "Show the $pageview trend" - res = self._call_node(query) - self.assertEqual(res, "trends") - - def test_outputs_basic_funnel_insight(self): - query = "What is the conversion rate of users who uploaded a file to users who paid for a plan?" - res = self._call_node(query) - self.assertEqual(res, "funnel") - - def test_converts_trends_to_funnel(self): - conversation = [ - HumanMessage(content="Show trends of $pageview and $identify"), - RouterMessage(content="trends"), - HumanMessage(content="Convert this insight to a funnel"), - ] - res = self._call_node(conversation[:1]) - self.assertEqual(res, "trends") - res = self._call_node(conversation) - self.assertEqual(res, "funnel") - - def test_converts_funnel_to_trends(self): - conversation = [ - HumanMessage(content="What is the conversion from a page view to a sign up?"), - RouterMessage(content="funnel"), - HumanMessage(content="Convert this insight to a trends"), - ] - res = self._call_node(conversation[:1]) - self.assertEqual(res, "funnel") - res = self._call_node(conversation) - self.assertEqual(res, "trends") - - def test_outputs_single_trends_insight(self): - """ - Must display a trends insight because it's not possible to build a funnel with a single series. - """ - query = "how many users upgraded their plan to personal pro?" - res = self._call_node(query) - self.assertEqual(res, "trends") - - def test_classifies_funnel_with_single_series(self): - query = "What's our sign-up funnel?" - res = self._call_node(query) - self.assertEqual(res, "funnel") + return callable + + +def test_outputs_basic_trends_insight(call_node): + query = "Show the $pageview trend" + res = call_node(query) + assert res == "trends" + + +def test_outputs_basic_funnel_insight(call_node): + query = "What is the conversion rate of users who uploaded a file to users who paid for a plan?" 
+ res = call_node(query) + assert res == "funnel" + + +def test_converts_trends_to_funnel(call_node): + conversation = [ + HumanMessage(content="Show trends of $pageview and $identify"), + RouterMessage(content="trends"), + HumanMessage(content="Convert this insight to a funnel"), + ] + res = call_node(conversation[:1]) + assert res == "trends" + res = call_node(conversation) + assert res == "funnel" + + +def test_converts_funnel_to_trends(call_node): + conversation = [ + HumanMessage(content="What is the conversion from a page view to a sign up?"), + RouterMessage(content="funnel"), + HumanMessage(content="Convert this insight to a trends"), + ] + res = call_node(conversation[:1]) + assert res == "funnel" + res = call_node(conversation) + assert res == "trends" + + +def test_outputs_single_trends_insight(call_node): + """ + Must display a trends insight because it's not possible to build a funnel with a single series. + """ + query = "how many users upgraded their plan to personal pro?" + res = call_node(query) + assert res == "trends" + + +def test_classifies_funnel_with_single_series(call_node): + query = "What's our sign-up funnel?" + res = call_node(query) + assert res == "funnel" diff --git a/ee/hogai/eval/tests/test_eval_trends_generator.py b/ee/hogai/eval/tests/test_eval_trends_generator.py index 496bbf0100b51..c8491957c868f 100644 --- a/ee/hogai/eval/tests/test_eval_trends_generator.py +++ b/ee/hogai/eval/tests/test_eval_trends_generator.py @@ -1,58 +1,65 @@ +from collections.abc import Callable from typing import cast +import pytest from langgraph.graph.state import CompiledStateGraph from ee.hogai.assistant import AssistantGraph -from ee.hogai.eval.utils import EvalBaseTest from ee.hogai.utils.types import AssistantNodeName, AssistantState from posthog.schema import AssistantTrendsQuery, HumanMessage, VisualizationMessage -class TestEvalTrendsGenerator(EvalBaseTest): - def _call_node(self, query: str, plan: str) -> AssistantTrendsQuery: - graph: CompiledStateGraph = ( - AssistantGraph(self.team) - .add_edge(AssistantNodeName.START, AssistantNodeName.TRENDS_GENERATOR) - .add_trends_generator(AssistantNodeName.END) - .compile() - ) +@pytest.fixture +def call_node(team, runnable_config) -> Callable[[str, str], AssistantTrendsQuery]: + graph: CompiledStateGraph = ( + AssistantGraph(team) + .add_edge(AssistantNodeName.START, AssistantNodeName.TRENDS_GENERATOR) + .add_trends_generator(AssistantNodeName.END) + .compile() + ) + + def callable(query: str, plan: str) -> AssistantTrendsQuery: state = graph.invoke( AssistantState(messages=[HumanMessage(content=query)], plan=plan), - self._get_config(), + runnable_config, ) return cast(VisualizationMessage, AssistantState.model_validate(state).messages[-1]).answer - def test_node_replaces_equals_with_contains(self): - query = "what is pageview trend for users with name John?" - plan = """Events: - - $pageview - - math operation: total count - - property filter 1 - - person - - name - - equals - - John - """ - actual_output = self._call_node(query, plan).model_dump_json(exclude_none=True) - assert "exact" not in actual_output - assert "icontains" in actual_output - assert "John" not in actual_output - assert "john" in actual_output - - def test_node_leans_towards_line_graph(self): - query = "How often do users download files?" 
- # We ideally want to consider both total count of downloads per period, as well as how often a median user downloads - plan = """Events: - - downloaded_file - - math operation: total count - - downloaded_file - - math operation: median count per user - """ - actual_output = self._call_node(query, plan) - assert actual_output.trendsFilter.display == "ActionsLineGraph" - assert actual_output.series[0].kind == "EventsNode" - assert actual_output.series[0].event == "downloaded_file" - assert actual_output.series[0].math == "total" - assert actual_output.series[1].kind == "EventsNode" - assert actual_output.series[1].event == "downloaded_file" - assert actual_output.series[1].math == "median_count_per_actor" + return callable + + +def test_node_replaces_equals_with_contains(call_node): + query = "what is pageview trend for users with name John?" + plan = """Events: + - $pageview + - math operation: total count + - property filter 1 + - person + - name + - equals + - John + """ + actual_output = call_node(query, plan).model_dump_json(exclude_none=True) + assert "exact" not in actual_output + assert "icontains" in actual_output + assert "John" not in actual_output + assert "john" in actual_output + + +def test_node_leans_towards_line_graph(call_node): + query = "How often do users download files?" + # We ideally want to consider both total count of downloads per period, as well as how often a median user downloads + plan = """Events: + - downloaded_file + - math operation: total count + - downloaded_file + - math operation: median count per user + """ + actual_output = call_node(query, plan) + assert actual_output.trendsFilter.display == "ActionsLineGraph" + assert actual_output.series[0].kind == "EventsNode" + assert actual_output.series[0].event == "downloaded_file" + assert actual_output.series[0].math == "total" + assert actual_output.series[1].kind == "EventsNode" + assert actual_output.series[1].event == "downloaded_file" + assert actual_output.series[1].math == "median_count_per_actor" diff --git a/ee/hogai/eval/tests/test_eval_trends_planner.py b/ee/hogai/eval/tests/test_eval_trends_planner.py index d4fbff456a91c..4d4ea4c41dfbf 100644 --- a/ee/hogai/eval/tests/test_eval_trends_planner.py +++ b/ee/hogai/eval/tests/test_eval_trends_planner.py @@ -1,179 +1,196 @@ +from collections.abc import Callable + +import pytest from deepeval import assert_test from deepeval.metrics import GEval from deepeval.test_case import LLMTestCase, LLMTestCaseParams +from langchain_core.runnables.config import RunnableConfig from langgraph.graph.state import CompiledStateGraph from ee.hogai.assistant import AssistantGraph -from ee.hogai.eval.utils import EvalBaseTest from ee.hogai.utils.types import AssistantNodeName, AssistantState from posthog.schema import HumanMessage -class TestEvalTrendsPlanner(EvalBaseTest): - def _get_plan_correctness_metric(self): - return GEval( - name="Trends Plan Correctness", - criteria="You will be given expected and actual generated plans to provide a taxonomy to answer a user's question with a trends insight. Compare the plans to determine whether the taxonomy of the actual plan matches the expected plan. Do not apply general knowledge about trends insights.", - evaluation_steps=[ - "A plan must define at least one event and a math type, but it is not required to define any filters, breakdowns, or formulas.", - "Compare events, properties, math types, and property values of 'expected output' and 'actual output'. 
Do not penalize if the actual output does not include a timeframe.", - "Check if the combination of events, properties, and property values in 'actual output' can answer the user's question according to the 'expected output'.", - # The criteria for aggregations must be more specific because there isn't a way to bypass them. - "Check if the math types in 'actual output' match those in 'expected output'. Math types sometimes are interchangeable, so use your judgement. If the aggregation type is specified by a property, user, or group in 'expected output', the same property, user, or group must be used in 'actual output'.", - "If 'expected output' contains a breakdown, check if 'actual output' contains a similar breakdown, and heavily penalize if the breakdown is not present or different.", - "If 'expected output' contains a formula, check if 'actual output' contains a similar formula, and heavily penalize if the formula is not present or different.", - # We don't want to see in the output unnecessary property filters. The assistant tries to use them all the time. - "Heavily penalize if the 'actual output' contains any excessive output not present in the 'expected output'. For example, the `is set` operator in filters should not be used unless the user explicitly asks for it.", - ], - evaluation_params=[ - LLMTestCaseParams.INPUT, - LLMTestCaseParams.EXPECTED_OUTPUT, - LLMTestCaseParams.ACTUAL_OUTPUT, - ], - threshold=0.7, - ) +@pytest.fixture(scope="module") +def metric(): + return GEval( + name="Trends Plan Correctness", + criteria="You will be given expected and actual generated plans to provide a taxonomy to answer a user's question with a trends insight. Compare the plans to determine whether the taxonomy of the actual plan matches the expected plan. Do not apply general knowledge about trends insights.", + evaluation_steps=[ + "A plan must define at least one event and a math type, but it is not required to define any filters, breakdowns, or formulas.", + "Compare events, properties, math types, and property values of 'expected output' and 'actual output'. Do not penalize if the actual output does not include a timeframe.", + "Check if the combination of events, properties, and property values in 'actual output' can answer the user's question according to the 'expected output'.", + # The criteria for aggregations must be more specific because there isn't a way to bypass them. + "Check if the math types in 'actual output' match those in 'expected output'. Math types sometimes are interchangeable, so use your judgement. If the aggregation type is specified by a property, user, or group in 'expected output', the same property, user, or group must be used in 'actual output'.", + "If 'expected output' contains a breakdown, check if 'actual output' contains a similar breakdown, and heavily penalize if the breakdown is not present or different.", + "If 'expected output' contains a formula, check if 'actual output' contains a similar formula, and heavily penalize if the formula is not present or different.", + # We don't want to see in the output unnecessary property filters. The assistant tries to use them all the time. + "Heavily penalize if the 'actual output' contains any excessive output not present in the 'expected output'. 
For example, the `is set` operator in filters should not be used unless the user explicitly asks for it.", + ], + evaluation_params=[ + LLMTestCaseParams.INPUT, + LLMTestCaseParams.EXPECTED_OUTPUT, + LLMTestCaseParams.ACTUAL_OUTPUT, + ], + threshold=0.7, + ) - def _call_node(self, query): - graph: CompiledStateGraph = ( - AssistantGraph(self.team) - .add_edge(AssistantNodeName.START, AssistantNodeName.TRENDS_PLANNER) - .add_trends_planner(AssistantNodeName.END) - .compile() - ) + +@pytest.fixture +def call_node(team, runnable_config: RunnableConfig) -> Callable[[str], str]: + graph: CompiledStateGraph = ( + AssistantGraph(team) + .add_edge(AssistantNodeName.START, AssistantNodeName.TRENDS_PLANNER) + .add_trends_planner(AssistantNodeName.END) + .compile() + ) + + def callable(query: str) -> str: state = graph.invoke( AssistantState(messages=[HumanMessage(content=query)]), - self._get_config(), + runnable_config, ) return AssistantState.model_validate(state).plan or "" - def test_no_excessive_property_filters(self): - query = "Show the $pageview trend" - test_case = LLMTestCase( - input=query, - expected_output=""" - Events: - - $pageview - - math operation: total count - """, - actual_output=self._call_node(query), - ) - assert_test(test_case, [self._get_plan_correctness_metric()]) - - def test_no_excessive_property_filters_for_a_defined_math_type(self): - query = "What is the MAU?" - test_case = LLMTestCase( - input=query, - expected_output=""" - Events: - - $pageview - - math operation: unique users - """, - actual_output=self._call_node(query), - ) - assert_test(test_case, [self._get_plan_correctness_metric()]) - - def test_basic_filtering(self): - query = "can you compare how many Chrome vs Safari users uploaded a file in the last 30d?" - test_case = LLMTestCase( - input=query, - expected_output=""" - Events: - - uploaded_file - - math operation: total count - - property filter 1: - - entity: event - - property name: $browser - - property type: String - - operator: equals - - property value: Chrome - - property filter 2: - - entity: event - - property name: $browser - - property type: String - - operator: equals - - property value: Safari - - Breakdown by: - - breakdown 1: + return callable + + +def test_no_excessive_property_filters(metric, call_node): + query = "Show the $pageview trend" + test_case = LLMTestCase( + input=query, + expected_output=""" + Events: + - $pageview + - math operation: total count + """, + actual_output=call_node(query), + ) + assert_test(test_case, [metric]) + + +def test_no_excessive_property_filters_for_a_defined_math_type(metric, call_node): + query = "What is the MAU?" + test_case = LLMTestCase( + input=query, + expected_output=""" + Events: + - $pageview + - math operation: unique users + """, + actual_output=call_node(query), + ) + assert_test(test_case, [metric]) + + +def test_basic_filtering(metric, call_node): + query = "can you compare how many Chrome vs Safari users uploaded a file in the last 30d?" 
+ test_case = LLMTestCase( + input=query, + expected_output=""" + Events: + - uploaded_file + - math operation: total count + - property filter 1: - entity: event - property name: $browser - """, - actual_output=self._call_node(query), - ) - assert_test(test_case, [self._get_plan_correctness_metric()]) - - def test_formula_mode(self): - query = "i want to see a ratio of identify divided by page views" - test_case = LLMTestCase( - input=query, - expected_output=""" - Events: - - $identify - - math operation: total count - - $pageview - - math operation: total count - - Formula: - `A/B`, where `A` is the total count of `$identify` and `B` is the total count of `$pageview` - """, - actual_output=self._call_node(query), - ) - assert_test(test_case, [self._get_plan_correctness_metric()]) - - def test_math_type_by_a_property(self): - query = "what is the average session duration?" - test_case = LLMTestCase( - input=query, - expected_output=""" - Events: - - All Events - - math operation: average by `$session_duration` - """, - actual_output=self._call_node(query), - ) - assert_test(test_case, [self._get_plan_correctness_metric()]) - - def test_math_type_by_a_user(self): - query = "What is the median page view count for a user?" - test_case = LLMTestCase( - input=query, - expected_output=""" - Events: - - $pageview - - math operation: median by users - """, - actual_output=self._call_node(query), - ) - assert_test(test_case, [self._get_plan_correctness_metric()]) - - def test_needle_in_a_haystack(self): - query = "How frequently do people pay for a personal-pro plan?" - test_case = LLMTestCase( - input=query, - expected_output=""" - Events: - - paid_bill - - math operation: total count - - property filter 1: - - entity: event - - property name: plan - - property type: String - - operator: contains - - property value: personal/pro - """, - actual_output=self._call_node(query), - ) - assert_test(test_case, [self._get_plan_correctness_metric()]) - - def test_funnel_does_not_include_timeframe(self): - query = "what is the pageview trend for event time before 2024-01-01?" - test_case = LLMTestCase( - input=query, - expected_output=""" - Events: - - $pageview - - math operation: total count - """, - actual_output=self._call_node(query), - ) - assert_test(test_case, [self._get_plan_correctness_metric()]) + - property type: String + - operator: equals + - property value: Chrome + - property filter 2: + - entity: event + - property name: $browser + - property type: String + - operator: equals + - property value: Safari + + Breakdown by: + - breakdown 1: + - entity: event + - property name: $browser + """, + actual_output=call_node(query), + ) + assert_test(test_case, [metric]) + + +def test_formula_mode(metric, call_node): + query = "i want to see a ratio of identify divided by page views" + test_case = LLMTestCase( + input=query, + expected_output=""" + Events: + - $identify + - math operation: total count + - $pageview + - math operation: total count + + Formula: + `A/B`, where `A` is the total count of `$identify` and `B` is the total count of `$pageview` + """, + actual_output=call_node(query), + ) + assert_test(test_case, [metric]) + + +def test_math_type_by_a_property(metric, call_node): + query = "what is the average session duration?" 
+ test_case = LLMTestCase( + input=query, + expected_output=""" + Events: + - All Events + - math operation: average by `$session_duration` + """, + actual_output=call_node(query), + ) + assert_test(test_case, [metric]) + + +def test_math_type_by_a_user(metric, call_node): + query = "What is the median page view count for a user?" + test_case = LLMTestCase( + input=query, + expected_output=""" + Events: + - $pageview + - math operation: median by users + """, + actual_output=call_node(query), + ) + assert_test(test_case, [metric]) + + +def test_needle_in_a_haystack(metric, call_node): + query = "How frequently do people pay for a personal-pro plan?" + test_case = LLMTestCase( + input=query, + expected_output=""" + Events: + - paid_bill + - math operation: total count + - property filter 1: + - entity: event + - property name: plan + - property type: String + - operator: contains + - property value: personal/pro + """, + actual_output=call_node(query), + ) + assert_test(test_case, [metric]) + + +def test_trends_does_not_include_timeframe(metric, call_node): + query = "what is the pageview trend for event time before 2024-01-01?" + test_case = LLMTestCase( + input=query, + expected_output=""" + Events: + - $pageview + - math operation: total count + """, + actual_output=call_node(query), + ) + assert_test(test_case, [metric]) diff --git a/ee/hogai/eval/utils.py b/ee/hogai/eval/utils.py deleted file mode 100644 index 6e03c4cfafa9f..0000000000000 --- a/ee/hogai/eval/utils.py +++ /dev/null @@ -1,40 +0,0 @@ -import os - -import pytest -from django.test import override_settings -from flaky import flaky -from langchain_core.runnables import RunnableConfig - -from ee.models.assistant import Conversation -from posthog.demo.matrix.manager import MatrixManager -from posthog.tasks.demo_create_data import HedgeboxMatrix -from posthog.test.base import NonAtomicBaseTest - - -@pytest.mark.skipif(os.environ.get("DEEPEVAL") != "YES", reason="Only runs for the assistant evaluation") -@flaky(max_runs=3, min_passes=1) -class EvalBaseTest(NonAtomicBaseTest): - def _get_config(self) -> RunnableConfig: - conversation = Conversation.objects.create(team=self.team, user=self.user) - return { - "configurable": { - "thread_id": conversation.id, - } - } - - @classmethod - def setUpTestData(cls): - super().setUpTestData() - matrix = HedgeboxMatrix( - seed="b1ef3c66-5f43-488a-98be-6b46d92fbcef", # this seed generates all events - days_past=120, - days_future=30, - n_clusters=500, - group_type_index_offset=0, - ) - matrix_manager = MatrixManager(matrix, print_steps=True) - existing_user = cls.team.organization.members.first() - with override_settings(TEST=False): - # Simulation saving should occur in non-test mode, so that Kafka isn't mocked. 
Normally in tests we don't - # want to ingest via Kafka, but simulation saving is specifically designed to use that route for speed - matrix_manager.run_on_team(cls.team, existing_user) diff --git a/frontend/__snapshots__/scenes-app-insights--funnel-top-to-bottom-breakdown-edit--dark.png b/frontend/__snapshots__/scenes-app-insights--funnel-top-to-bottom-breakdown-edit--dark.png index c97834c6d7fb1..4b405550412a0 100644 Binary files a/frontend/__snapshots__/scenes-app-insights--funnel-top-to-bottom-breakdown-edit--dark.png and b/frontend/__snapshots__/scenes-app-insights--funnel-top-to-bottom-breakdown-edit--dark.png differ diff --git a/frontend/__snapshots__/scenes-app-insights--funnel-top-to-bottom-edit--dark.png b/frontend/__snapshots__/scenes-app-insights--funnel-top-to-bottom-edit--dark.png index 9f3435ec919df..248d1f34b318a 100644 Binary files a/frontend/__snapshots__/scenes-app-insights--funnel-top-to-bottom-edit--dark.png and b/frontend/__snapshots__/scenes-app-insights--funnel-top-to-bottom-edit--dark.png differ diff --git a/frontend/src/lib/components/CommandBar/SearchResultPreview.tsx b/frontend/src/lib/components/CommandBar/SearchResultPreview.tsx index af4e70df0a08a..7fe2a6313bfdd 100644 --- a/frontend/src/lib/components/CommandBar/SearchResultPreview.tsx +++ b/frontend/src/lib/components/CommandBar/SearchResultPreview.tsx @@ -40,18 +40,10 @@ export const SearchResultPreview = (): JSX.Element | null => { onClick={() => { openResult(activeResultIndex) }} - tooltip={ - <> - Open - - } aria-label="Open search result" > - Open + Open -
- Open -
diff --git a/frontend/src/lib/constants.tsx b/frontend/src/lib/constants.tsx index ae93860525488..4ba7745915b41 100644 --- a/frontend/src/lib/constants.tsx +++ b/frontend/src/lib/constants.tsx @@ -181,7 +181,6 @@ export const FEATURE_FLAGS = { SQL_EDITOR: 'sql-editor', // owner: @EDsCODE #team-data-warehouse SESSION_REPLAY_DOCTOR: 'session-replay-doctor', // owner: #team-replay SAVED_NOT_PINNED: 'saved-not-pinned', // owner: #team-replay - NEW_EXPERIMENTS_UI: 'new-experiments-ui', // owner: @jurajmajerik #team-feature-success AUDIT_LOGS_ACCESS: 'audit-logs-access', // owner: #team-growth SUBSCRIBE_FROM_PAYGATE: 'subscribe-from-paygate', // owner: #team-growth HEATMAPS_UI: 'heatmaps-ui', // owner: @benjackwhite diff --git a/frontend/src/lib/lemon-ui/LemonButton/More.tsx b/frontend/src/lib/lemon-ui/LemonButton/More.tsx index 6eff15a377572..d4e61ade7cbf9 100644 --- a/frontend/src/lib/lemon-ui/LemonButton/More.tsx +++ b/frontend/src/lib/lemon-ui/LemonButton/More.tsx @@ -2,12 +2,14 @@ import { IconEllipsis } from '@posthog/icons' import { PopoverProps } from '../Popover/Popover' import { LemonButtonWithDropdown } from '.' -import { LemonButtonProps } from './LemonButton' +import { LemonButtonDropdown, LemonButtonProps } from './LemonButton' -export type MoreProps = Partial> & LemonButtonProps +export type MoreProps = Partial> & + LemonButtonProps & { dropdown?: Partial } export function More({ overlay, + dropdown, 'data-attr': dataAttr, placement = 'bottom-end', ...buttonProps @@ -17,11 +19,14 @@ export function More({ aria-label="more" data-attr={dataAttr ?? 'more-button'} icon={} - dropdown={{ - placement: placement, - actionable: true, - overlay, - }} + dropdown={ + { + placement: placement, + actionable: true, + ...dropdown, + overlay, + } as LemonButtonDropdown + } size="small" {...buttonProps} disabled={!overlay} diff --git a/frontend/src/lib/lemon-ui/LemonInput/LemonInput.scss b/frontend/src/lib/lemon-ui/LemonInput/LemonInput.scss index 166c7ff50528a..fa2ddaea8d2fd 100644 --- a/frontend/src/lib/lemon-ui/LemonInput/LemonInput.scss +++ b/frontend/src/lib/lemon-ui/LemonInput/LemonInput.scss @@ -98,11 +98,6 @@ // NOTE Design: Search inputs are given a specific small width max-width: 240px; border-radius: 0; - - // Add a focus ring to the element (not the input) when focused - &:has(:focus-visible) { - outline: -webkit-focus-ring-color auto 1px; - } } &.LemonInput--type-number { diff --git a/frontend/src/queries/schema.json b/frontend/src/queries/schema.json index 1d2a4d94012aa..005fb78c94497 100644 --- a/frontend/src/queries/schema.json +++ b/frontend/src/queries/schema.json @@ -2132,6 +2132,9 @@ "significant": { "type": "boolean" }, + "stats_version": { + "type": "integer" + }, "timezone": { "type": "string" }, @@ -4511,6 +4514,9 @@ "significant": { "type": "boolean" }, + "stats_version": { + "type": "integer" + }, "variants": { "items": { "$ref": "#/definitions/ExperimentVariantFunnelsBaseStats" @@ -5991,6 +5997,9 @@ }, "response": { "$ref": "#/definitions/ExperimentFunnelsQueryResponse" + }, + "stats_version": { + "type": "integer" } }, "required": ["funnels_query", "kind"], @@ -6041,6 +6050,9 @@ "significant": { "type": "boolean" }, + "stats_version": { + "type": "integer" + }, "variants": { "items": { "$ref": "#/definitions/ExperimentVariantFunnelsBaseStats" @@ -9787,6 +9799,9 @@ "significant": { "type": "boolean" }, + "stats_version": { + "type": "integer" + }, "variants": { "items": { "$ref": "#/definitions/ExperimentVariantFunnelsBaseStats" @@ -10418,6 +10433,9 @@ "significant": { 
"type": "boolean" }, + "stats_version": { + "type": "integer" + }, "variants": { "items": { "$ref": "#/definitions/ExperimentVariantFunnelsBaseStats" diff --git a/frontend/src/queries/schema.ts b/frontend/src/queries/schema.ts index 5360ae06d99f4..9e4252c989b38 100644 --- a/frontend/src/queries/schema.ts +++ b/frontend/src/queries/schema.ts @@ -2003,6 +2003,7 @@ export interface ExperimentFunnelsQueryResponse { significance_code: ExperimentSignificanceCode expected_loss: number credible_intervals: Record + stats_version?: integer } export type CachedExperimentFunnelsQueryResponse = CachedQueryResponse @@ -2012,6 +2013,7 @@ export interface ExperimentFunnelsQuery extends DataNode { diff --git a/frontend/src/scenes/experiments/ExperimentView/CumulativeExposuresChart.tsx b/frontend/src/scenes/experiments/ExperimentView/CumulativeExposuresChart.tsx index 7f4378a7ec5ef..6efdd992f5988 100644 --- a/frontend/src/scenes/experiments/ExperimentView/CumulativeExposuresChart.tsx +++ b/frontend/src/scenes/experiments/ExperimentView/CumulativeExposuresChart.tsx @@ -10,11 +10,11 @@ import { BaseMathType, ChartDisplayType, InsightType, PropertyFilterType, Proper import { experimentLogic } from '../experimentLogic' export function CumulativeExposuresChart(): JSX.Element { - const { experiment, experimentResults, getMetricType } = useValues(experimentLogic) + const { experiment, metricResults, getMetricType } = useValues(experimentLogic) const metricIdx = 0 const metricType = getMetricType(metricIdx) - + const result = metricResults?.[metricIdx] const variants = experiment.parameters?.feature_flag_variants?.map((variant) => variant.key) || [] if (experiment.holdout) { variants.push(`holdout-${experiment.holdout.id}`) @@ -25,7 +25,7 @@ export function CumulativeExposuresChart(): JSX.Element { if (metricType === InsightType.TRENDS) { query = { kind: NodeKind.InsightVizNode, - source: (experimentResults as CachedExperimentTrendsQueryResponse)?.exposure_query || { + source: (result as CachedExperimentTrendsQueryResponse)?.exposure_query || { kind: NodeKind.TrendsQuery, series: [], interval: 'day', diff --git a/frontend/src/scenes/experiments/ExperimentView/DataCollection.tsx b/frontend/src/scenes/experiments/ExperimentView/DataCollection.tsx index 2463e1dd791a5..b22eb57b35d4b 100644 --- a/frontend/src/scenes/experiments/ExperimentView/DataCollection.tsx +++ b/frontend/src/scenes/experiments/ExperimentView/DataCollection.tsx @@ -32,7 +32,7 @@ export function DataCollection(): JSX.Element { const experimentProgressPercent = metricType === InsightType.FUNNELS - ? (funnelResultsPersonsTotal / recommendedSampleSize) * 100 + ? 
(funnelResultsPersonsTotal(0) / recommendedSampleSize) * 100 : (actualRunningTime / recommendedRunningTime) * 100 const hasHighRunningTime = recommendedRunningTime > 62 @@ -109,7 +109,7 @@ export function DataCollection(): JSX.Element { Saw  - {humanFriendlyNumber(funnelResultsPersonsTotal)} of{' '} + {humanFriendlyNumber(funnelResultsPersonsTotal(0))} of{' '} {humanFriendlyNumber(recommendedSampleSize)}{' '} {' '} {formatUnitByQuantity(recommendedSampleSize, 'participant')} diff --git a/frontend/src/scenes/experiments/ExperimentView/DistributionTable.tsx b/frontend/src/scenes/experiments/ExperimentView/DistributionTable.tsx index b3c2962d95c55..caee718efb726 100644 --- a/frontend/src/scenes/experiments/ExperimentView/DistributionTable.tsx +++ b/frontend/src/scenes/experiments/ExperimentView/DistributionTable.tsx @@ -137,9 +137,11 @@ export function DistributionModal({ experimentId }: { experimentId: Experiment[' export function DistributionTable(): JSX.Element { const { openDistributionModal } = useActions(experimentLogic) - const { experimentId, experiment, experimentResults } = useValues(experimentLogic) + const { experimentId, experiment, metricResults } = useValues(experimentLogic) const { reportExperimentReleaseConditionsViewed } = useActions(experimentLogic) + const result = metricResults?.[0] + const onSelectElement = (variant: string): void => { LemonDialog.open({ title: 'Select a domain', @@ -166,7 +168,7 @@ export function DistributionTable(): JSX.Element { key: 'key', title: 'Variant', render: function Key(_, item): JSX.Element { - if (!experimentResults || !experimentResults.insight) { + if (!result || !result.insight) { return {item.key} } return diff --git a/frontend/src/scenes/experiments/ExperimentView/ExperimentView.tsx b/frontend/src/scenes/experiments/ExperimentView/ExperimentView.tsx index 4f2c109275189..c5dc5eb43cb4b 100644 --- a/frontend/src/scenes/experiments/ExperimentView/ExperimentView.tsx +++ b/frontend/src/scenes/experiments/ExperimentView/ExperimentView.tsx @@ -26,9 +26,9 @@ import { Results } from './Results' import { SecondaryMetricsTable } from './SecondaryMetricsTable' const ResultsTab = (): JSX.Element => { - const { experiment, experimentResults, featureFlags } = useValues(experimentLogic) - - const hasResultsInsight = experimentResults && experimentResults.insight + const { experiment, metricResults, featureFlags } = useValues(experimentLogic) + const result = metricResults?.[0] + const hasResultsInsight = result && result.insight return (
@@ -70,12 +70,12 @@ const VariantsTab = (): JSX.Element => { } export function ExperimentView(): JSX.Element { - const { experimentLoading, experimentResultsLoading, experimentId, experimentResults, tabKey, featureFlags } = + const { experimentLoading, metricResultsLoading, experimentId, metricResults, tabKey, featureFlags } = useValues(experimentLogic) const { setTabKey } = useActions(experimentLogic) - - const hasResultsInsight = experimentResults && experimentResults.insight + const result = metricResults?.[0] + const hasResultsInsight = result && result.insight return ( <> @@ -86,7 +86,7 @@ export function ExperimentView(): JSX.Element { ) : ( <> - {experimentResultsLoading ? ( + {metricResultsLoading ? ( ) : ( <> diff --git a/frontend/src/scenes/experiments/ExperimentView/Overview.tsx b/frontend/src/scenes/experiments/ExperimentView/Overview.tsx index 2095309364143..c8c44c8ea5c04 100644 --- a/frontend/src/scenes/experiments/ExperimentView/Overview.tsx +++ b/frontend/src/scenes/experiments/ExperimentView/Overview.tsx @@ -1,17 +1,29 @@ import { useValues } from 'kea' +import { CachedExperimentFunnelsQueryResponse, CachedExperimentTrendsQueryResponse } from '~/queries/schema' + import { experimentLogic } from '../experimentLogic' import { VariantTag } from './components' export function Overview(): JSX.Element { - const { experimentId, experimentResults, getIndexForVariant, getHighestProbabilityVariant, areResultsSignificant } = + const { experimentId, metricResults, getIndexForVariant, getHighestProbabilityVariant, areResultsSignificant } = useValues(experimentLogic) + const result = metricResults?.[0] + if (!result) { + return <> + } + function WinningVariantText(): JSX.Element { - const highestProbabilityVariant = getHighestProbabilityVariant(experimentResults) - const index = getIndexForVariant(experimentResults, highestProbabilityVariant || '') - if (highestProbabilityVariant && index !== null && experimentResults) { - const { probability } = experimentResults + const highestProbabilityVariant = getHighestProbabilityVariant( + result as CachedExperimentFunnelsQueryResponse | CachedExperimentTrendsQueryResponse + ) + const index = getIndexForVariant( + result as CachedExperimentFunnelsQueryResponse | CachedExperimentTrendsQueryResponse, + highestProbabilityVariant || '' + ) + if (highestProbabilityVariant && index !== null && result) { + const { probability } = result return (
@@ -32,7 +44,9 @@ export function Overview(): JSX.Element { return (
Your results are  - {`${areResultsSignificant ? 'significant' : 'not significant'}`}. + + {`${areResultsSignificant(0) ? 'significant' : 'not significant'}`}. +
) } diff --git a/frontend/src/scenes/experiments/ExperimentView/Results.tsx b/frontend/src/scenes/experiments/ExperimentView/Results.tsx index c4e7a4b05ed62..61574de1b3966 100644 --- a/frontend/src/scenes/experiments/ExperimentView/Results.tsx +++ b/frontend/src/scenes/experiments/ExperimentView/Results.tsx @@ -5,13 +5,17 @@ import { ResultsHeader, ResultsQuery } from './components' import { SummaryTable } from './SummaryTable' export function Results(): JSX.Element { - const { experimentResults } = useValues(experimentLogic) + const { metricResults } = useValues(experimentLogic) + const result = metricResults?.[0] + if (!result) { + return <> + } return (
- +
) } diff --git a/frontend/src/scenes/experiments/ExperimentView/SecondaryMetricsTable.tsx b/frontend/src/scenes/experiments/ExperimentView/SecondaryMetricsTable.tsx index 8369038f00cbb..7f5bcbabfa3a1 100644 --- a/frontend/src/scenes/experiments/ExperimentView/SecondaryMetricsTable.tsx +++ b/frontend/src/scenes/experiments/ExperimentView/SecondaryMetricsTable.tsx @@ -21,7 +21,7 @@ export function SecondaryMetricsTable({ experimentId }: { experimentId: Experime const [modalMetricIdx, setModalMetricIdx] = useState(null) const { - experimentResults, + metricResults, secondaryMetricResultsLoading, experiment, getSecondaryMetricType, @@ -65,7 +65,7 @@ export function SecondaryMetricsTable({ experimentId }: { experimentId: Experime { title:
Variant
, render: function Key(_, item: TabularSecondaryMetricResults): JSX.Element { - if (!experimentResults || !experimentResults.insight) { + if (!metricResults?.[0] || !metricResults?.[0].insight) { return {item.variant} } return ( diff --git a/frontend/src/scenes/experiments/ExperimentView/SummaryTable.tsx b/frontend/src/scenes/experiments/ExperimentView/SummaryTable.tsx index 6150d4e7b7826..adb1bc9d69616 100644 --- a/frontend/src/scenes/experiments/ExperimentView/SummaryTable.tsx +++ b/frontend/src/scenes/experiments/ExperimentView/SummaryTable.tsx @@ -10,7 +10,6 @@ import { urls } from 'scenes/urls' import { FilterLogicalOperator, - FunnelExperimentVariant, InsightType, PropertyFilterType, PropertyOperator, @@ -27,7 +26,7 @@ export function SummaryTable(): JSX.Element { const { experimentId, experiment, - experimentResults, + metricResults, tabularExperimentResults, getMetricType, exposureCountDataForVariant, @@ -38,14 +37,14 @@ export function SummaryTable(): JSX.Element { credibleIntervalForVariant, } = useValues(experimentLogic) const metricType = getMetricType(0) - - if (!experimentResults) { + const result = metricResults?.[0] + if (!result) { return <> } - const winningVariant = getHighestProbabilityVariant(experimentResults) + const winningVariant = getHighestProbabilityVariant(result) - const columns: LemonTableColumns = [ + const columns: LemonTableColumns = [ { key: 'variants', title: 'Variant', @@ -64,14 +63,14 @@ export function SummaryTable(): JSX.Element { key: 'counts', title: (
- {experimentResults.insight?.[0] && 'action' in experimentResults.insight[0] && ( - + {result.insight?.[0] && 'action' in result.insight[0] && ( + )} {experimentMathAggregationForTrends() ? 'metric' : 'count'}
), render: function Key(_, variant): JSX.Element { - const count = countDataForVariant(experimentResults, variant.key) + const count = countDataForVariant(result, variant.key) if (!count) { return <>— } @@ -83,7 +82,7 @@ export function SummaryTable(): JSX.Element { key: 'exposure', title: 'Exposure', render: function Key(_, variant): JSX.Element { - const exposure = exposureCountDataForVariant(experimentResults, variant.key) + const exposure = exposureCountDataForVariant(result, variant.key) if (!exposure) { return <>— } @@ -120,7 +119,7 @@ export function SummaryTable(): JSX.Element { return Baseline } - const controlVariant = (experimentResults.variants as TrendExperimentVariant[]).find( + const controlVariant = (result.variants as TrendExperimentVariant[]).find( ({ key }) => key === 'control' ) as TrendExperimentVariant @@ -161,7 +160,7 @@ export function SummaryTable(): JSX.Element { return Baseline } - const credibleInterval = credibleIntervalForVariant(experimentResults || null, variant.key, metricType) + const credibleInterval = credibleIntervalForVariant(result || null, variant.key, metricType) if (!credibleInterval) { return <>— } @@ -181,7 +180,7 @@ export function SummaryTable(): JSX.Element { key: 'conversionRate', title: 'Conversion rate', render: function Key(_, item): JSX.Element { - const conversionRate = conversionRateForVariant(experimentResults, item.key) + const conversionRate = conversionRateForVariant(result, item.key) if (!conversionRate) { return <>— } @@ -204,8 +203,8 @@ export function SummaryTable(): JSX.Element { return Baseline } - const controlConversionRate = conversionRateForVariant(experimentResults, 'control') - const variantConversionRate = conversionRateForVariant(experimentResults, item.key) + const controlConversionRate = conversionRateForVariant(result, 'control') + const variantConversionRate = conversionRateForVariant(result, item.key) if (!controlConversionRate || !variantConversionRate) { return <>— @@ -235,7 +234,7 @@ export function SummaryTable(): JSX.Element { return Baseline } - const credibleInterval = credibleIntervalForVariant(experimentResults || null, item.key, metricType) + const credibleInterval = credibleIntervalForVariant(result || null, item.key, metricType) if (!credibleInterval) { return <>— } @@ -254,15 +253,13 @@ export function SummaryTable(): JSX.Element { key: 'winProbability', title: 'Win probability', sorter: (a, b) => { - const aPercentage = (experimentResults?.probability?.[a.key] || 0) * 100 - const bPercentage = (experimentResults?.probability?.[b.key] || 0) * 100 + const aPercentage = (result?.probability?.[a.key] || 0) * 100 + const bPercentage = (result?.probability?.[b.key] || 0) * 100 return aPercentage - bPercentage }, render: function Key(_, item): JSX.Element { const variantKey = item.key - const percentage = - experimentResults?.probability?.[variantKey] != undefined && - experimentResults.probability?.[variantKey] * 100 + const percentage = result?.probability?.[variantKey] != undefined && result.probability?.[variantKey] * 100 const isWinning = variantKey === winningVariant return ( @@ -351,7 +348,7 @@ export function SummaryTable(): JSX.Element { return (
- +
) } diff --git a/frontend/src/scenes/experiments/ExperimentView/components.tsx b/frontend/src/scenes/experiments/ExperimentView/components.tsx index 9e5bfb7c2c4b0..87885a3761b79 100644 --- a/frontend/src/scenes/experiments/ExperimentView/components.tsx +++ b/frontend/src/scenes/experiments/ExperimentView/components.tsx @@ -62,7 +62,11 @@ export function VariantTag({ muted?: boolean fontSize?: number }): JSX.Element { - const { experiment, experimentResults, getIndexForVariant } = useValues(experimentLogic({ experimentId })) + const { experiment, getIndexForVariant, metricResults } = useValues(experimentLogic({ experimentId })) + + if (!metricResults) { + return <> + } if (experiment.holdout && variantKey === `holdout-${experiment.holdout_id}`) { return ( @@ -71,7 +75,7 @@ export function VariantTag({ className="w-2 h-2 rounded-full mr-0.5" // eslint-disable-next-line react/forbid-dom-props style={{ - backgroundColor: getExperimentInsightColour(getIndexForVariant(experimentResults, variantKey)), + backgroundColor: getExperimentInsightColour(getIndexForVariant(metricResults[0], variantKey)), }} /> {experiment.holdout.name} @@ -85,7 +89,7 @@ export function VariantTag({ className="w-2 h-2 rounded-full mr-0.5" // eslint-disable-next-line react/forbid-dom-props style={{ - backgroundColor: getExperimentInsightColour(getIndexForVariant(experimentResults, variantKey)), + backgroundColor: getExperimentInsightColour(getIndexForVariant(metricResults[0], variantKey)), }} /> + {result.label} @@ -205,8 +209,15 @@ export function ResultsQuery({ ) } -export function ExploreButton({ icon = }: { icon?: JSX.Element }): JSX.Element { - const { experimentResults, experiment, featureFlags } = useValues(experimentLogic) +export function ExploreButton({ + icon = , + metricIndex = 0, +}: { + icon?: JSX.Element + metricIndex?: number +}): JSX.Element { + const { metricResults, experiment, featureFlags } = useValues(experimentLogic) + const result = metricResults?.[metricIndex] // keep in sync with https://github.com/PostHog/posthog/blob/master/ee/clickhouse/queries/experiments/funnel_experiment_result.py#L71 // :TRICKY: In the case of no results, we still want users to explore the query, so they can debug further. @@ -223,7 +234,7 @@ export function ExploreButton({ icon = }: { icon?: JSX.Element let query: InsightVizNode if (featureFlags[FEATURE_FLAGS.EXPERIMENTS_HOGQL]) { - const newQueryResults = experimentResults as unknown as + const newQueryResults = result as unknown as | CachedExperimentTrendsQueryResponse | CachedExperimentFunnelsQueryResponse @@ -237,7 +248,7 @@ export function ExploreButton({ icon = }: { icon?: JSX.Element source: source as InsightQueryNode, } } else { - const oldQueryResults = experimentResults as ExperimentResults['result'] + const oldQueryResults = result as unknown as ExperimentResults['result'] if (!oldQueryResults?.filters) { return <> @@ -272,7 +283,9 @@ export function ExploreButton({ icon = }: { icon?: JSX.Element } export function ResultsHeader(): JSX.Element { - const { experimentResults } = useValues(experimentLogic) + const { metricResults } = useValues(experimentLogic) + + const result = metricResults?.[0] return (
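A pattern worth calling out across DistributionTable, Overview, Results, SummaryTable, VariantTag and ResultsHeader above: every consumer now derives the primary metric's result from the `metricResults` array and bails out early when it is missing. A minimal sketch of that guard, with simplified types rather than the real components:

```ts
type MetricResult = { insight?: unknown; significant?: boolean } | null

// Hypothetical helper illustrating the guard used throughout these components.
function firstMetricResult(metricResults: MetricResult[] | null): MetricResult {
    return metricResults?.[0] ?? null
}

function renderSummary(metricResults: MetricResult[] | null): string {
    const result = firstMetricResult(metricResults)
    if (!result || !result.insight) {
        return '' // render nothing, mirroring the `return <></>` early exits in the diff
    }
    return result.significant ? 'Significant result available' : 'Result available, not significant'
}
```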
@@ -284,16 +297,17 @@ export function ResultsHeader(): JSX.Element {
-
{experimentResults && }
+
{result && }
) } -export function NoResultsEmptyState(): JSX.Element { +export function NoResultsEmptyState({ metricIndex = 0 }: { metricIndex?: number }): JSX.Element { type ErrorCode = 'no-events' | 'no-flag-info' | 'no-control-variant' | 'no-test-variant' - const { experimentResultsLoading, experimentResultCalculationError } = useValues(experimentLogic) + const { metricResultsLoading, primaryMetricsResultErrors } = useValues(experimentLogic) + const metricError = primaryMetricsResultErrors?.[metricIndex] function ChecklistItem({ errorCode, value }: { errorCode: ErrorCode; value: boolean }): JSX.Element { const failureText = { @@ -327,28 +341,25 @@ export function NoResultsEmptyState(): JSX.Element { ) } - if (experimentResultsLoading) { + if (metricResultsLoading) { return <> } // Validation errors return 400 and are rendered as a checklist - if (experimentResultCalculationError?.statusCode === 400) { - let parsedDetail: Record - try { - parsedDetail = JSON.parse(experimentResultCalculationError.detail) - } catch (error) { + if (metricError?.statusCode === 400) { + if (!metricError.hasDiagnostics) { return (
Experiment results could not be calculated
-
{experimentResultCalculationError.detail}
+
{metricError.detail}
) } const checklistItems = [] - for (const [errorCode, value] of Object.entries(parsedDetail)) { + for (const [errorCode, value] of Object.entries(metricError.detail as Record)) { checklistItems.push() } @@ -377,14 +388,14 @@ export function NoResultsEmptyState(): JSX.Element { ) } - if (experimentResultCalculationError?.statusCode === 504) { + if (metricError?.statusCode === 504) { return (

Experiment results timed out

- {!!experimentResultCalculationError && ( + {!!metricError && (
This may occur when the experiment has a large amount of data or is particularly complex. We are actively working on fixing this. In the meantime, please try refreshing @@ -404,11 +415,7 @@ export function NoResultsEmptyState(): JSX.Element {

Experiment results could not be calculated

- {!!experimentResultCalculationError && ( -
- {experimentResultCalculationError.detail} -
- )} + {!!metricError &&
{metricError.detail}
}
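The empty state now branches on a per-metric error object (`primaryMetricsResultErrors[metricIndex]`) instead of trying to `JSON.parse` the error detail on the client. A sketch of that contract and the branching; the field names (`statusCode`, `hasDiagnostics`, `detail`) come from the diff, but the exact shape of the diagnostics map is assumed from how the checklist is rendered:

```ts
// Assumed error shape, based on the fields referenced in NoResultsEmptyState above.
interface MetricResultError {
    statusCode: number
    hasDiagnostics: boolean
    // When hasDiagnostics is true, detail is assumed to be a map of error codes to booleans;
    // otherwise it is a human-readable message.
    detail: string | Record<string, boolean>
}

function describeMetricError(error: MetricResultError | undefined): string[] {
    if (!error) {
        return []
    }
    if (error.statusCode === 504) {
        return ['Experiment results timed out']
    }
    if (error.statusCode === 400 && error.hasDiagnostics) {
        // One line per diagnostic, as the component does with its checklist items.
        return Object.entries(error.detail as Record<string, boolean>).map(
            ([errorCode, value]) => `${errorCode}: ${value}`
        )
    }
    return [`Experiment results could not be calculated: ${error.detail}`]
}
```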
@@ -464,7 +471,7 @@ export function PageHeaderCustom(): JSX.Element { launchExperiment, endExperiment, archiveExperiment, - loadExperimentResults, + loadMetricResults, loadSecondaryMetricResults, createExposureCohort, openShipVariantModal, @@ -505,7 +512,7 @@ export function PageHeaderCustom(): JSX.Element { {exposureCohortId ? 'View' : 'Create'} exposure cohort loadExperimentResults(true)} + onClick={() => loadMetricResults(true)} fullWidth data-attr="refresh-experiment" > @@ -588,7 +595,7 @@ export function PageHeaderCustom(): JSX.Element {
)} {featureFlags[FEATURE_FLAGS.EXPERIMENT_MAKE_DECISION] && - areResultsSignificant && + areResultsSignificant(0) && !isSingleVariantShipped && ( <> @@ -615,7 +622,7 @@ export function ShipVariantModal({ experimentId }: { experimentId: Experiment['i const { aggregationLabel } = useValues(groupsModel) const [selectedVariantKey, setSelectedVariantKey] = useState() - useEffect(() => setSelectedVariantKey(sortedWinProbabilities[0]?.key), [sortedWinProbabilities]) + useEffect(() => setSelectedVariantKey(sortedWinProbabilities(0)[0]?.key), [sortedWinProbabilities(0)]) const aggregationTargetName = experiment.filters.aggregation_group_type_index != null @@ -656,12 +663,12 @@ export function ShipVariantModal({ experimentId }: { experimentId: Experiment['i data-attr="metrics-selector" value={selectedVariantKey} onChange={(variantKey) => setSelectedVariantKey(variantKey)} - options={sortedWinProbabilities.map(({ key }) => ({ + options={sortedWinProbabilities(0).map(({ key }) => ({ value: key, label: (
- {key === sortedWinProbabilities[0]?.key && ( + {key === sortedWinProbabilities(0)[0]?.key && ( Winning @@ -693,9 +700,9 @@ export function ActionBanner(): JSX.Element { const { experiment, getMetricType, - experimentResults, + metricResults, experimentLoading, - experimentResultsLoading, + metricResultsLoading, isExperimentRunning, areResultsSignificant, isExperimentStopped, @@ -706,6 +713,7 @@ export function ActionBanner(): JSX.Element { featureFlags, } = useValues(experimentLogic) + const result = metricResults?.[0] const { archiveExperiment } = useActions(experimentLogic) const { aggregationLabel } = useValues(groupsModel) @@ -720,7 +728,7 @@ export function ActionBanner(): JSX.Element { const recommendedRunningTime = experiment?.parameters?.recommended_running_time || 1 const recommendedSampleSize = experiment?.parameters?.recommended_sample_size || 100 - if (!experiment || experimentLoading || experimentResultsLoading) { + if (!experiment || experimentLoading || metricResultsLoading) { return <> } @@ -766,12 +774,12 @@ export function ActionBanner(): JSX.Element { } // Running, results present, not significant - if (isExperimentRunning && experimentResults && !isExperimentStopped && !areResultsSignificant) { + if (isExperimentRunning && result && !isExperimentStopped && !areResultsSignificant(0)) { // Results insignificant, but a large enough sample/running time has been achieved // Further collection unlikely to change the result -> recommmend cutting the losses if ( metricType === InsightType.FUNNELS && - funnelResultsPersonsTotal > Math.max(recommendedSampleSize, 500) && + funnelResultsPersonsTotal(0) > Math.max(recommendedSampleSize, 500) && dayjs().diff(experiment.start_date, 'day') > 2 // at least 2 days running ) { return ( @@ -800,9 +808,9 @@ export function ActionBanner(): JSX.Element { } // Running, results significant - if (isExperimentRunning && !isExperimentStopped && areResultsSignificant && experimentResults) { - const { probability } = experimentResults - const winningVariant = getHighestProbabilityVariant(experimentResults) + if (isExperimentRunning && !isExperimentStopped && areResultsSignificant(0) && result) { + const { probability } = result + const winningVariant = getHighestProbabilityVariant(result) if (!winningVariant) { return <> } @@ -812,7 +820,7 @@ export function ActionBanner(): JSX.Element { // Win probability only slightly over 0.9 and the recommended sample/time just met -> proceed with caution if ( metricType === InsightType.FUNNELS && - funnelResultsPersonsTotal < recommendedSampleSize + 50 && + funnelResultsPersonsTotal(0) < recommendedSampleSize + 50 && winProbability < 0.93 ) { return ( @@ -848,7 +856,7 @@ export function ActionBanner(): JSX.Element { } // Stopped, results significant - if (isExperimentStopped && areResultsSignificant) { + if (isExperimentStopped && areResultsSignificant(0)) { return ( You have stopped this experiment, and it is no longer collecting data. With significant results in hand, @@ -866,7 +874,7 @@ export function ActionBanner(): JSX.Element { } // Stopped, results not significant - if (isExperimentStopped && experimentResults && !areResultsSignificant) { + if (isExperimentStopped && result && !areResultsSignificant(0)) { return ( You have stopped this experiment, and it is no longer collecting data. 
Because your results are not diff --git a/frontend/src/scenes/experiments/experimentLogic.tsx b/frontend/src/scenes/experiments/experimentLogic.tsx index 698c4182dc84d..42f6b2cd37652 100644 --- a/frontend/src/scenes/experiments/experimentLogic.tsx +++ b/frontend/src/scenes/experiments/experimentLogic.tsx @@ -1,4 +1,3 @@ -import { IconInfo } from '@posthog/icons' import { actions, connect, kea, key, listeners, path, props, reducers, selectors } from 'kea' import { forms } from 'kea-forms' import { loaders } from 'kea-loaders' @@ -8,11 +7,9 @@ import { EXPERIMENT_DEFAULT_DURATION, FunnelLayout } from 'lib/constants' import { FEATURE_FLAGS } from 'lib/constants' import { dayjs } from 'lib/dayjs' import { lemonToast } from 'lib/lemon-ui/LemonToast/LemonToast' -import { Tooltip } from 'lib/lemon-ui/Tooltip' import { featureFlagLogic } from 'lib/logic/featureFlagLogic' import { hasFormErrors, toParams } from 'lib/utils' import { eventUsageLogic } from 'lib/utils/eventUsageLogic' -import { ReactElement } from 'react' import { validateFeatureFlagKey } from 'scenes/feature-flags/featureFlagLogic' import { funnelDataLogic } from 'scenes/funnels/funnelDataLogic' import { insightDataLogic } from 'scenes/insights/insightDataLogic' @@ -31,6 +28,7 @@ import { CachedExperimentFunnelsQueryResponse, CachedExperimentTrendsQueryResponse, ExperimentFunnelsQuery, + ExperimentSignificanceCode, ExperimentTrendsQuery, NodeKind, } from '~/queries/schema' @@ -55,7 +53,6 @@ import { ProductKey, PropertyMathType, SecondaryMetricResults, - SignificanceCode, TrendExperimentVariant, TrendResult, TrendsFilterType, @@ -177,7 +174,6 @@ export const experimentLogic = kea([ setExperimentType: (type?: string) => ({ type }), removeExperimentGroup: (idx: number) => ({ idx }), setEditExperiment: (editing: boolean) => ({ editing }), - setExperimentResultCalculationError: (error: ExperimentResultCalculationError) => ({ error }), setFlagImplementationWarning: (warning: boolean) => ({ warning }), setExposureAndSampleSize: (exposure: number, sampleSize: number) => ({ exposure, sampleSize }), updateExperimentGoal: (filters: Partial) => ({ filters }), @@ -420,12 +416,6 @@ export const experimentLogic = kea([ setEditExperiment: (_, { editing }) => editing, }, ], - experimentResultCalculationError: [ - null as ExperimentResultCalculationError | null, - { - setExperimentResultCalculationError: (_, { error }) => error, - }, - ], flagImplementationWarning: [ false as boolean, { @@ -597,10 +587,7 @@ export const experimentLogic = kea([ experiment && actions.reportExperimentViewed(experiment) if (experiment?.start_date) { - actions.loadExperimentResults() - if (values.featureFlags[FEATURE_FLAGS.EXPERIMENTS_MULTIPLE_METRICS]) { - actions.loadMetricResults() - } + actions.loadMetricResults() actions.loadSecondaryMetricResults() } }, @@ -618,7 +605,7 @@ export const experimentLogic = kea([ actions.updateExperiment({ end_date: endDate.toISOString() }) const duration = endDate.diff(values.experiment?.start_date, 'second') values.experiment && - actions.reportExperimentCompleted(values.experiment, endDate, duration, values.areResultsSignificant) + actions.reportExperimentCompleted(values.experiment, endDate, duration, values.areResultsSignificant(0)) }, archiveExperiment: async () => { actions.updateExperiment({ archived: true }) @@ -683,13 +670,11 @@ export const experimentLogic = kea([ resetRunningExperiment: async () => { actions.updateExperiment({ start_date: null, end_date: null, archived: false }) values.experiment && 
actions.reportExperimentReset(values.experiment) - - actions.loadExperimentResultsSuccess(null) actions.loadSecondaryMetricResultsSuccess([]) }, updateExperimentSuccess: async ({ experiment }) => { actions.updateExperiments(experiment) - actions.loadExperimentResults() + actions.loadMetricResults() actions.loadSecondaryMetricResults() }, setExperiment: async ({ experiment }) => { @@ -838,69 +823,6 @@ export const experimentLogic = kea([ return response }, }, - experimentResults: [ - null as - | ExperimentResults['result'] - | CachedExperimentTrendsQueryResponse - | CachedExperimentFunnelsQueryResponse - | null, - { - loadExperimentResults: async ( - refresh?: boolean - ): Promise< - | ExperimentResults['result'] - | CachedExperimentTrendsQueryResponse - | CachedExperimentFunnelsQueryResponse - | null - > => { - try { - // :FLAG: CLEAN UP AFTER MIGRATION - if (values.featureFlags[FEATURE_FLAGS.EXPERIMENTS_HOGQL]) { - // Queries are shareable, so we need to set the experiment_id for the backend to correctly associate the query with the experiment - const queryWithExperimentId = { - ...values.experiment.metrics[0], - experiment_id: values.experimentId, - } - if ( - queryWithExperimentId.kind === NodeKind.ExperimentTrendsQuery && - values.featureFlags[FEATURE_FLAGS.EXPERIMENT_STATS_V2] - ) { - queryWithExperimentId.stats_version = 2 - } - - const response = await performQuery(queryWithExperimentId, undefined, refresh) - - return { - ...response, - fakeInsightId: Math.random().toString(36).substring(2, 15), - } as unknown as CachedExperimentTrendsQueryResponse | CachedExperimentFunnelsQueryResponse - } - - const refreshParam = refresh ? '?refresh=true' : '' - const response: ExperimentResults = await api.get( - `api/projects/${values.currentProjectId}/experiments/${values.experimentId}/results${refreshParam}` - ) - return { - ...response.result, - fakeInsightId: Math.random().toString(36).substring(2, 15), - last_refresh: response.last_refresh, - } - } catch (error: any) { - let errorDetail = error.detail - // :HANDLE FLAG: CLEAN UP AFTER MIGRATION - if (values.featureFlags[FEATURE_FLAGS.EXPERIMENTS_HOGQL]) { - const errorDetailMatch = error.detail.match(/\{.*\}/) - errorDetail = errorDetailMatch ? errorDetailMatch[0] : error.detail - } - actions.setExperimentResultCalculationError({ detail: errorDetail, statusCode: error.status }) - if (error.status === 504) { - actions.reportExperimentResultsLoadingTimeout(values.experimentId) - } - return null - } - }, - }, - ], metricResults: [ null as (CachedExperimentTrendsQueryResponse | CachedExperimentFunnelsQueryResponse | null)[] | null, { @@ -1187,83 +1109,40 @@ export const experimentLogic = kea([ }, ], areResultsSignificant: [ - (s) => [s.experimentResults], - (experimentResults): boolean => { - return experimentResults?.significant || false - }, - ], - // TODO: remove with the old UI - significanceBannerDetails: [ - (s) => [s.experimentResults], - (experimentResults): string | ReactElement => { - if (experimentResults?.significance_code === SignificanceCode.HighLoss) { - return ( - <> - This is because the expected loss in conversion is greater than 1% - Current value is {((experimentResults?.expected_loss || 0) * 100)?.toFixed(2)}% - } - > - - - . - - ) - } - - if (experimentResults?.significance_code === SignificanceCode.HighPValue) { - return ( - <> - This is because the p value is greater than 0.05 - Current value is {experimentResults?.p_value?.toFixed(3) || 1}.} - > - - - . 
- - ) - } - - if (experimentResults?.significance_code === SignificanceCode.LowWinProbability) { - return 'This is because the win probability of all test variants combined is less than 90%.' - } - - if (experimentResults?.significance_code === SignificanceCode.NotEnoughExposure) { - return 'This is because we need at least 100 people per variant to declare significance.' - } - - return '' - }, + (s) => [s.metricResults], + (metricResults: (CachedExperimentFunnelsQueryResponse | CachedExperimentTrendsQueryResponse | null)[]) => + (metricIndex: number = 0): boolean => { + return metricResults?.[metricIndex]?.significant || false + }, ], significanceDetails: [ - (s) => [s.experimentResults], - (experimentResults): string => { - if (experimentResults?.significance_code === SignificanceCode.HighLoss) { - return `This is because the expected loss in conversion is greater than 1% (current value is ${( - (experimentResults?.expected_loss || 0) * 100 - )?.toFixed(2)}%).` - } + (s) => [s.metricResults], + (metricResults: (CachedExperimentFunnelsQueryResponse | CachedExperimentTrendsQueryResponse | null)[]) => + (metricIndex: number = 0): string => { + const results = metricResults?.[metricIndex] + + if (results?.significance_code === ExperimentSignificanceCode.HighLoss) { + return `This is because the expected loss in conversion is greater than 1% (current value is ${( + (results as CachedExperimentFunnelsQueryResponse)?.expected_loss || 0 + )?.toFixed(2)}%).` + } - if (experimentResults?.significance_code === SignificanceCode.HighPValue) { - return `This is because the p value is greater than 0.05 (current value is ${ - experimentResults?.p_value?.toFixed(3) || 1 - }).` - } + if (results?.significance_code === ExperimentSignificanceCode.HighPValue) { + return `This is because the p value is greater than 0.05 (current value is ${ + (results as CachedExperimentTrendsQueryResponse)?.p_value?.toFixed(3) || 1 + }).` + } - if (experimentResults?.significance_code === SignificanceCode.LowWinProbability) { - return 'This is because the win probability of all test variants combined is less than 90%.' - } + if (results?.significance_code === ExperimentSignificanceCode.LowWinProbability) { + return 'This is because the win probability of all test variants combined is less than 90%.' + } - if (experimentResults?.significance_code === SignificanceCode.NotEnoughExposure) { - return 'This is because we need at least 100 people per variant to declare significance.' - } + if (results?.significance_code === ExperimentSignificanceCode.NotEnoughExposure) { + return 'This is because we need at least 100 people per variant to declare significance.' 
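These selectors are the core of the refactor: instead of exposing a single precomputed value derived from `experimentResults`, each selector now returns a function of a metric index, which is why callers elsewhere in the diff write `areResultsSignificant(0)`, `funnelResultsPersonsTotal(0)`, and so on. A stripped-down sketch of the pattern outside of Kea:

```ts
type MetricResult = { significant?: boolean }

// Selector-style factory: derived once from state, then invoked per metric index.
// Simplified sketch; the real logic builds this inside a Kea selector.
const makeAreResultsSignificant =
    (metricResults: (MetricResult | null)[] | null) =>
    (metricIndex: number = 0): boolean =>
        metricResults?.[metricIndex]?.significant || false

const areResultsSignificant = makeAreResultsSignificant([{ significant: true }, null])
console.log(areResultsSignificant(0)) // true
console.log(areResultsSignificant(1)) // false
```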
+ } - return '' - }, + return '' + }, ], recommendedSampleSize: [ (s) => [s.conversionMetrics, s.minimumSampleSizePerVariant, s.variants], @@ -1353,17 +1232,17 @@ export const experimentLogic = kea([ () => [], () => ( - experimentResults: + metricResult: | Partial | CachedExperimentFunnelsQueryResponse | CachedExperimentTrendsQueryResponse | null, variantKey: string ): number | null => { - if (!experimentResults || !experimentResults.insight) { + if (!metricResult || !metricResult.insight) { return null } - const variantResults = (experimentResults.insight as FunnelStep[][]).find( + const variantResults = (metricResult.insight as FunnelStep[][]).find( (variantFunnel: FunnelStep[]) => { const breakdownValue = variantFunnel[0]?.breakdown_value return Array.isArray(breakdownValue) && breakdownValue[0] === variantKey @@ -1380,7 +1259,7 @@ export const experimentLogic = kea([ () => [], () => ( - experimentResults: + metricResult: | Partial | CachedSecondaryMetricExperimentFunnelsQueryResponse | CachedSecondaryMetricExperimentTrendsQueryResponse @@ -1388,13 +1267,13 @@ export const experimentLogic = kea([ variantKey: string, metricType: InsightType ): [number, number] | null => { - const credibleInterval = experimentResults?.credible_intervals?.[variantKey] + const credibleInterval = metricResult?.credible_intervals?.[variantKey] if (!credibleInterval) { return null } if (metricType === InsightType.FUNNELS) { - const controlVariant = (experimentResults.variants as FunnelExperimentVariant[]).find( + const controlVariant = (metricResult.variants as FunnelExperimentVariant[]).find( ({ key }) => key === 'control' ) as FunnelExperimentVariant const controlConversionRate = @@ -1411,7 +1290,7 @@ export const experimentLogic = kea([ return [lowerBound, upperBound] } - const controlVariant = (experimentResults.variants as TrendExperimentVariant[]).find( + const controlVariant = (metricResult.variants as TrendExperimentVariant[]).find( ({ key }) => key === 'control' ) as TrendExperimentVariant @@ -1428,7 +1307,7 @@ export const experimentLogic = kea([ (s) => [s.getMetricType], (getMetricType) => ( - experimentResults: + metricResult: | Partial | CachedExperimentTrendsQueryResponse | CachedExperimentFunnelsQueryResponse @@ -1437,14 +1316,14 @@ export const experimentLogic = kea([ ): number | null => { // Ensures we get the right index from results, so the UI can // display the right colour for the variant - if (!experimentResults || !experimentResults.insight) { + if (!metricResult || !metricResult.insight) { return null } let index = -1 if (getMetricType(0) === InsightType.FUNNELS) { // Funnel Insight is displayed in order of decreasing count - index = (Array.isArray(experimentResults.insight) ? [...experimentResults.insight] : []) + index = (Array.isArray(metricResult.insight) ? 
[...metricResult.insight] : []) .sort((a, b) => { const aCount = (a && Array.isArray(a) && a[0]?.count) || 0 const bCount = (b && Array.isArray(b) && b[0]?.count) || 0 @@ -1458,7 +1337,7 @@ export const experimentLogic = kea([ return Array.isArray(breakdownValue) && breakdownValue[0] === variant }) } else { - index = (experimentResults.insight as TrendResult[]).findIndex( + index = (metricResult.insight as TrendResult[]).findIndex( (variantTrend: TrendResult) => variantTrend.breakdown_value === variant ) } @@ -1474,7 +1353,7 @@ export const experimentLogic = kea([ (s) => [s.experimentMathAggregationForTrends], (experimentMathAggregationForTrends) => ( - experimentResults: + metricResult: | Partial | CachedExperimentTrendsQueryResponse | CachedExperimentFunnelsQueryResponse @@ -1483,10 +1362,10 @@ export const experimentLogic = kea([ type: 'primary' | 'secondary' = 'primary' ): number | null => { const usingMathAggregationType = type === 'primary' ? experimentMathAggregationForTrends() : false - if (!experimentResults || !experimentResults.insight) { + if (!metricResult || !metricResult.insight) { return null } - const variantResults = (experimentResults.insight as TrendResult[]).find( + const variantResults = (metricResult.insight as TrendResult[]).find( (variantTrend: TrendResult) => variantTrend.breakdown_value === variant ) if (!variantResults) { @@ -1524,17 +1403,17 @@ export const experimentLogic = kea([ () => [], () => ( - experimentResults: + metricResult: | Partial | CachedExperimentTrendsQueryResponse | CachedExperimentFunnelsQueryResponse | null, variant: string ): number | null => { - if (!experimentResults || !experimentResults.variants) { + if (!metricResult || !metricResult.variants) { return null } - const variantResults = (experimentResults.variants as TrendExperimentVariant[]).find( + const variantResults = (metricResult.variants as TrendExperimentVariant[]).find( (variantTrend: TrendExperimentVariant) => variantTrend.key === variant ) if (!variantResults || !variantResults.absolute_exposure) { @@ -1564,58 +1443,50 @@ export const experimentLogic = kea([ } }, ], - sortedExperimentResultVariants: [ - (s) => [s.experimentResults, s.experiment], - (experimentResults, experiment): string[] => { - if (experimentResults) { - const sortedResults = Object.keys(experimentResults.probability).sort( - (a, b) => experimentResults.probability[b] - experimentResults.probability[a] - ) - - experiment?.parameters?.feature_flag_variants?.forEach((variant) => { - if (!sortedResults.includes(variant.key)) { - sortedResults.push(variant.key) - } - }) - return sortedResults - } - return [] - }, - ], tabularExperimentResults: [ - (s) => [s.experiment, s.experimentResults, s.getMetricType], - (experiment, experimentResults, getMetricType): any => { - const tabularResults = [] - const metricType = getMetricType(0) - - if (experimentResults) { - for (const variantObj of experimentResults.variants) { - if (metricType === InsightType.FUNNELS) { - const { key, success_count, failure_count } = variantObj as FunnelExperimentVariant - tabularResults.push({ key, success_count, failure_count }) - } else if (metricType === InsightType.TRENDS) { - const { key, count, exposure, absolute_exposure } = variantObj as TrendExperimentVariant - tabularResults.push({ key, count, exposure, absolute_exposure }) + (s) => [s.experiment, s.metricResults, s.getMetricType], + ( + experiment, + metricResults: ( + | CachedExperimentFunnelsQueryResponse + | CachedExperimentTrendsQueryResponse + | null + )[], + getMetricType + 
) => + (metricIndex: number = 0): any[] => { + const tabularResults = [] + const metricType = getMetricType(metricIndex) + const result = metricResults?.[metricIndex] + + if (result) { + for (const variantObj of result.variants) { + if (metricType === InsightType.FUNNELS) { + const { key, success_count, failure_count } = variantObj as FunnelExperimentVariant + tabularResults.push({ key, success_count, failure_count }) + } else if (metricType === InsightType.TRENDS) { + const { key, count, exposure, absolute_exposure } = variantObj as TrendExperimentVariant + tabularResults.push({ key, count, exposure, absolute_exposure }) + } } } - } - if (experiment.feature_flag?.filters.multivariate?.variants) { - for (const { key } of experiment.feature_flag.filters.multivariate.variants) { - if (tabularResults.find((variantObj) => variantObj.key === key)) { - continue - } + if (experiment.feature_flag?.filters.multivariate?.variants) { + for (const { key } of experiment.feature_flag.filters.multivariate.variants) { + if (tabularResults.find((variantObj) => variantObj.key === key)) { + continue + } - if (metricType === InsightType.FUNNELS) { - tabularResults.push({ key, success_count: null, failure_count: null }) - } else if (metricType === InsightType.TRENDS) { - tabularResults.push({ key, count: null, exposure: null, absolute_exposure: null }) + if (metricType === InsightType.FUNNELS) { + tabularResults.push({ key, success_count: null, failure_count: null }) + } else if (metricType === InsightType.TRENDS) { + tabularResults.push({ key, count: null, exposure: null, absolute_exposure: null }) + } } } - } - return tabularResults - }, + return tabularResults + }, ], tabularSecondaryMetricResults: [ (s) => [s.experiment, s.secondaryMetricResults, s.conversionRateForVariant, s.countDataForVariant], @@ -1655,39 +1526,56 @@ export const experimentLogic = kea([ }, ], sortedWinProbabilities: [ - (s) => [s.experimentResults, s.conversionRateForVariant], + (s) => [s.metricResults, s.conversionRateForVariant], ( - experimentResults, - conversionRateForVariant - ): { key: string; winProbability: number; conversionRate: number | null }[] => { - if (!experimentResults) { - return [] - } + metricResults: ( + | CachedExperimentFunnelsQueryResponse + | CachedExperimentTrendsQueryResponse + | null + )[], + conversionRateForVariant + ) => + (metricIndex: number = 0) => { + const result = metricResults?.[metricIndex] + + if (!result || !result.probability) { + return [] + } - return Object.keys(experimentResults.probability) - .map((key) => ({ - key, - winProbability: experimentResults.probability[key], - conversionRate: conversionRateForVariant(experimentResults, key), - })) - .sort((a, b) => b.winProbability - a.winProbability) - }, + return Object.keys(result.probability) + .map((key) => ({ + key, + winProbability: result.probability[key], + conversionRate: conversionRateForVariant(result, key), + })) + .sort((a, b) => b.winProbability - a.winProbability) + }, ], funnelResultsPersonsTotal: [ - (s) => [s.experimentResults, s.getMetricType], - (experimentResults, getMetricType): number => { - if (getMetricType(0) !== InsightType.FUNNELS || !experimentResults?.insight) { - return 0 - } + (s) => [s.metricResults, s.getMetricType], + ( + metricResults: ( + | CachedExperimentFunnelsQueryResponse + | CachedExperimentTrendsQueryResponse + | null + )[], + getMetricType + ) => + (metricIndex: number = 0): number => { + const result = metricResults?.[metricIndex] - let sum = 0 - experimentResults.insight.forEach((variantResult) => 
{ - if (variantResult[0]?.count) { - sum += variantResult[0].count + if (getMetricType(metricIndex) !== InsightType.FUNNELS || !result?.insight) { + return 0 } - }) - return sum - }, + + let sum = 0 + result.insight.forEach((variantResult) => { + if (variantResult[0]?.count) { + sum += variantResult[0].count + } + }) + return sum + }, ], actualRunningTime: [ (s) => [s.experiment], diff --git a/frontend/src/scenes/insights/insightDataLogic.tsx b/frontend/src/scenes/insights/insightDataLogic.tsx index 1de631587cf35..b5036267488b0 100644 --- a/frontend/src/scenes/insights/insightDataLogic.tsx +++ b/frontend/src/scenes/insights/insightDataLogic.tsx @@ -219,6 +219,7 @@ export const insightDataLogic = kea([ if (isQueryTooLarge(query)) { localStorage.removeItem(`draft-query-${values.currentTeamId}`) } + localStorage.setItem( `draft-query-${values.currentTeamId}`, crushDraftQueryForLocalStorage(query, Date.now()) diff --git a/frontend/src/scenes/insights/utils.tsx b/frontend/src/scenes/insights/utils.tsx index 5a1b4d56ec7d9..04e55519ab05b 100644 --- a/frontend/src/scenes/insights/utils.tsx +++ b/frontend/src/scenes/insights/utils.tsx @@ -1,4 +1,3 @@ -import JSONCrush from 'jsoncrush' import api from 'lib/api' import { dayjs } from 'lib/dayjs' import { CORE_FILTER_DEFINITIONS_BY_GROUP } from 'lib/taxonomy' @@ -445,40 +444,27 @@ export function isQueryTooLarge(query: Node>): boolean { export function parseDraftQueryFromLocalStorage( query: string ): { query: Node>; timestamp: number } | null { - // First try to uncrush the query if it's a JSONCrush query else fall back to parsing it as a JSON try { - const uncrushedQuery = JSONCrush.uncrush(query) - return JSON.parse(uncrushedQuery) + return JSON.parse(query) } catch (e) { - console.error('Error parsing uncrushed query', e) - try { - return JSON.parse(query) - } catch (e) { - console.error('Error parsing query', e) - return null - } + console.error('Error parsing query', e) + return null } } export function crushDraftQueryForLocalStorage(query: Node>, timestamp: number): string { - return JSONCrush.crush(JSON.stringify({ query, timestamp })) + return JSON.stringify({ query, timestamp }) } export function parseDraftQueryFromURL(query: string): Node> | null { try { - const uncrushedQuery = JSONCrush.uncrush(query) - return JSON.parse(uncrushedQuery) + return JSON.parse(query) } catch (e) { - console.error('Error parsing uncrushed query', e) - try { - return JSON.parse(query) - } catch (e) { - console.error('Error parsing query', e) - return null - } + console.error('Error parsing query', e) + return null } } export function crushDraftQueryForURL(query: Node>): string { - return JSONCrush.crush(JSON.stringify(query)) + return JSON.stringify(query) } diff --git a/frontend/src/scenes/notebooks/Nodes/NotebookNodeExperiment.tsx b/frontend/src/scenes/notebooks/Nodes/NotebookNodeExperiment.tsx index e7bc3a324202c..afde3f1836415 100644 --- a/frontend/src/scenes/notebooks/Nodes/NotebookNodeExperiment.tsx +++ b/frontend/src/scenes/notebooks/Nodes/NotebookNodeExperiment.tsx @@ -18,7 +18,7 @@ import { INTEGER_REGEX_MATCH_GROUPS } from './utils' const Component = ({ attributes }: NotebookNodeProps): JSX.Element => { const { id } = attributes - const { experiment, experimentLoading, experimentMissing, isExperimentRunning, experimentResults } = useValues( + const { experiment, experimentLoading, experimentMissing, isExperimentRunning, metricResults } = useValues( experimentLogic({ experimentId: id }) ) const { loadExperiment } = useActions(experimentLogic({ 
experimentId: id })) @@ -41,6 +41,10 @@ const Component = ({ attributes }: NotebookNodeProps } + if (!metricResults) { + return <> + } + return (
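Circling back to the `insights/utils.tsx` change a little above: with jsoncrush removed, draft queries are persisted and parsed as plain JSON, and the parse helpers simply return `null` on malformed input. A small round-trip sketch of how the new helpers behave (illustrative usage, not the exported functions themselves; the localStorage key here is hypothetical):

```ts
// Mirrors the simplified helpers above: plain JSON.stringify / JSON.parse, no compression.
function crushDraft(query: object, timestamp: number): string {
    return JSON.stringify({ query, timestamp })
}

function parseDraft(raw: string): { query: object; timestamp: number } | null {
    try {
        return JSON.parse(raw)
    } catch (e) {
        console.error('Error parsing query', e)
        return null
    }
}

const stored = crushDraft({ kind: 'TrendsQuery' }, Date.now())
localStorage.setItem('draft-query-2', stored) // hypothetical key; the real one is `draft-query-${currentTeamId}`
console.log(parseDraft(localStorage.getItem('draft-query-2') ?? '')) // { query: {...}, timestamp: ... }
console.log(parseDraft('not json')) // null
```

One side effect to keep in mind: drafts saved in the old crushed format will presumably fail `JSON.parse` and be treated as absent rather than migrated.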
@@ -78,7 +82,7 @@ const Component = ({ attributes }: NotebookNodeProps
- +
)} diff --git a/frontend/src/scenes/pipeline/hogfunctions/HogFunctionTest.tsx b/frontend/src/scenes/pipeline/hogfunctions/HogFunctionTest.tsx index 1861b06f369ed..787d46245c4cb 100644 --- a/frontend/src/scenes/pipeline/hogfunctions/HogFunctionTest.tsx +++ b/frontend/src/scenes/pipeline/hogfunctions/HogFunctionTest.tsx @@ -1,9 +1,19 @@ import { TZLabel } from '@posthog/apps-common' import { IconInfo, IconX } from '@posthog/icons' -import { LemonButton, LemonLabel, LemonSwitch, LemonTable, LemonTag, Tooltip } from '@posthog/lemon-ui' +import { + LemonButton, + LemonDivider, + LemonLabel, + LemonSwitch, + LemonTable, + LemonTag, + Spinner, + Tooltip, +} from '@posthog/lemon-ui' import clsx from 'clsx' import { useActions, useValues } from 'kea' import { Form } from 'kea-forms' +import { More } from 'lib/lemon-ui/LemonButton/More' import { LemonField } from 'lib/lemon-ui/LemonField' import { CodeEditorResizeable } from 'lib/monaco/CodeEditorResizable' @@ -62,11 +72,25 @@ export function HogFunctionTestPlaceholder({ } export function HogFunctionTest(props: HogFunctionTestLogicProps): JSX.Element { - const { isTestInvocationSubmitting, testResult, expanded, sampleGlobalsLoading, sampleGlobalsError, type } = - useValues(hogFunctionTestLogic(props)) - const { submitTestInvocation, setTestResult, toggleExpanded, loadSampleGlobals } = useActions( - hogFunctionTestLogic(props) - ) + const { + isTestInvocationSubmitting, + testResult, + expanded, + sampleGlobalsLoading, + sampleGlobalsError, + type, + savedGlobals, + testInvocation, + } = useValues(hogFunctionTestLogic(props)) + const { + submitTestInvocation, + setTestResult, + toggleExpanded, + loadSampleGlobals, + deleteSavedGlobals, + setSampleGlobals, + saveGlobals, + } = useActions(hogFunctionTestLogic(props)) return (
@@ -75,7 +99,10 @@ export function HogFunctionTest(props: HogFunctionTestLogicProps): JSX.Element { >
-

Testing

+

+ Testing + {sampleGlobalsLoading ? : null} +

{!expanded && (type === 'email' ? (

Click here to test the provider with a sample e-mail

@@ -87,7 +114,7 @@ export function HogFunctionTest(props: HogFunctionTestLogicProps): JSX.Element {
{!expanded ? ( - toggleExpanded()}> + toggleExpanded()}> Start testing ) : ( @@ -97,46 +124,100 @@ export function HogFunctionTest(props: HogFunctionTestLogicProps): JSX.Element { type="primary" onClick={() => setTestResult(null)} loading={isTestInvocationSubmitting} + data-attr="clear-hog-test-result" > Clear test result ) : ( <> - - Refresh globals - - - {({ value, onChange }) => ( - - When selected, async functions such as `fetch` will not - actually be called but instead will be mocked out with - the fetch content logged instead - - } + + + {({ value, onChange }) => ( + onChange(!v)} + checked={!value} + data-attr="toggle-hog-test-mocking" + className="px-2 py-1" + label={ + + When disabled, async functions such as + `fetch` will not be called. Instead they + will be mocked out and logged. + + } + > + + Make real HTTP requests + + + + } + /> + )} + + + + Fetch new event + + + {savedGlobals.map(({ name, globals }, index) => ( +
+ setSampleGlobals(globals)} + fullWidth + className="flex-1" + > + {name} + + } + onClick={() => deleteSavedGlobals(index)} + tooltip="Delete saved test data" + /> +
+ ))} + {testInvocation.globals && ( + { + const name = prompt('Name this test data') + if (name) { + saveGlobals(name, JSON.parse(testInvocation.globals)) + } + }} + disabledReason={(() => { + try { + JSON.parse(testInvocation.globals) + } catch (e) { + return 'Invalid globals JSON' + } + return undefined + })()} > - - Mock out HTTP requests - - - - } - /> - )} -
+ Save test data + + )} + + } + /> @@ -145,7 +226,12 @@ export function HogFunctionTest(props: HogFunctionTestLogicProps): JSX.Element { )} - } onClick={() => toggleExpanded()} tooltip="Hide testing" /> + } + onClick={() => toggleExpanded()} + tooltip="Hide testing" + /> )}
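The new "Save test data" flow above keeps named copies of the invocation globals, validating them with `JSON.parse` before saving; on the logic side (shown further below in `hogFunctionTestLogic`) they live in a persisted, team-prefixed Kea reducer. A standalone sketch of the same idea backed directly by localStorage, with the key format assumed:

```ts
interface SavedGlobals {
    name: string
    globals: Record<string, unknown>
}

// Illustrative store only; the real implementation is a Kea reducer with
// { persist: true, prefix: `${getCurrentTeamId()}__` }.
class SavedGlobalsStore {
    constructor(private teamId: number) {}

    private get key(): string {
        return `${this.teamId}__hog_function_saved_globals` // assumed key shape
    }

    load(): SavedGlobals[] {
        try {
            return JSON.parse(localStorage.getItem(this.key) ?? '[]')
        } catch {
            return []
        }
    }

    save(name: string, globalsJson: string): void {
        // Validate the JSON first, mirroring the disabledReason check in the component;
        // an invalid payload throws before anything is written.
        const globals = JSON.parse(globalsJson)
        localStorage.setItem(this.key, JSON.stringify([...this.load(), { name, globals }]))
    }

    delete(index: number): void {
        localStorage.setItem(this.key, JSON.stringify(this.load().filter((_, i) => i !== index)))
    }
}
```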
diff --git a/frontend/src/scenes/pipeline/hogfunctions/hogFunctionConfigurationLogic.tsx b/frontend/src/scenes/pipeline/hogfunctions/hogFunctionConfigurationLogic.tsx index d6f7c98884a7c..d38b39ce21c59 100644 --- a/frontend/src/scenes/pipeline/hogfunctions/hogFunctionConfigurationLogic.tsx +++ b/frontend/src/scenes/pipeline/hogfunctions/hogFunctionConfigurationLogic.tsx @@ -1,6 +1,6 @@ import { lemonToast } from '@posthog/lemon-ui' import equal from 'fast-deep-equal' -import { actions, afterMount, connect, kea, key, listeners, path, props, reducers, selectors } from 'kea' +import { actions, afterMount, connect, isBreakpoint, kea, key, listeners, path, props, reducers, selectors } from 'kea' import { forms } from 'kea-forms' import { loaders } from 'kea-loaders' import { beforeUnload, router } from 'kea-router' @@ -231,8 +231,15 @@ export const hogFunctionConfigurationLogic = kea ({ configuration }), persistForUnload: true, setSampleGlobalsError: (error) => ({ error }), + setSampleGlobals: (sampleGlobals: HogFunctionInvocationGlobals | null) => ({ sampleGlobals }), }), reducers(({ props }) => ({ + sampleGlobals: [ + null as HogFunctionInvocationGlobals | null, + { + setSampleGlobals: (_, { sampleGlobals }) => sampleGlobals, + }, + ], showSource: [ // Show source by default for blank templates when creating a new function !!(!props.id && props.templateId?.startsWith('template-blank-')), @@ -440,7 +447,9 @@ export const hogFunctionConfigurationLogic = kea([ ], actions: [ hogFunctionConfigurationLogic({ id: props.id }), - ['touchConfigurationField', 'loadSampleGlobalsSuccess', 'loadSampleGlobals'], + ['touchConfigurationField', 'loadSampleGlobalsSuccess', 'loadSampleGlobals', 'setSampleGlobals'], ], })), actions({ setTestResult: (result: HogFunctionTestInvocationResult | null) => ({ result }), toggleExpanded: (expanded?: boolean) => ({ expanded }), + saveGlobals: (name: string, globals: HogFunctionInvocationGlobals) => ({ name, globals }), + deleteSavedGlobals: (index: number) => ({ index }), }), reducers({ expanded: [ false as boolean, { - toggleExpanded: (_, { expanded }) => (expanded === undefined ? !_ : expanded), + toggleExpanded: (state, { expanded }) => (expanded === undefined ? 
!state : expanded), }, ], @@ -66,11 +69,23 @@ export const hogFunctionTestLogic = kea([ setTestResult: (_, { result }) => result, }, ], + + savedGlobals: [ + [] as { name: string; globals: HogFunctionInvocationGlobals }[], + { persist: true, prefix: `${getCurrentTeamId()}__` }, + { + saveGlobals: (state, { name, globals }) => [...state, { name, globals }], + deleteSavedGlobals: (state, { index }) => state.filter((_, i) => i !== index), + }, + ], }), listeners(({ values, actions }) => ({ loadSampleGlobalsSuccess: () => { actions.setTestInvocationValue('globals', JSON.stringify(values.sampleGlobals, null, 2)) }, + setSampleGlobals: ({ sampleGlobals }) => { + actions.setTestInvocationValue('globals', JSON.stringify(sampleGlobals, null, 2)) + }, })), forms(({ props, actions, values }) => ({ testInvocation: { diff --git a/frontend/src/scenes/session-recordings/player/PlayerMeta.tsx b/frontend/src/scenes/session-recordings/player/PlayerMeta.tsx index 5c811872d4134..6e4e69a038f79 100644 --- a/frontend/src/scenes/session-recordings/player/PlayerMeta.tsx +++ b/frontend/src/scenes/session-recordings/player/PlayerMeta.tsx @@ -7,7 +7,7 @@ import { CopyToClipboardInline } from 'lib/components/CopyToClipboard' import { useResizeBreakpoints } from 'lib/hooks/useResizeObserver' import { LemonSkeleton } from 'lib/lemon-ui/LemonSkeleton' import { Tooltip } from 'lib/lemon-ui/Tooltip' -import { percentage } from 'lib/utils' +import { isObject, percentage } from 'lib/utils' import { DraggableToNotebook } from 'scenes/notebooks/AddToNotebook/DraggableToNotebook' import { IconWindow } from 'scenes/session-recordings/player/icons' import { PlayerMetaLinks } from 'scenes/session-recordings/player/PlayerMetaLinks' @@ -20,6 +20,12 @@ import { Logo } from '~/toolbar/assets/Logo' import { sessionRecordingPlayerLogic, SessionRecordingPlayerMode } from './sessionRecordingPlayerLogic' function URLOrScreen({ lastUrl }: { lastUrl: string | undefined }): JSX.Element | null { + if (isObject(lastUrl) && 'href' in lastUrl) { + // regression protection, we saw a user whose site was sometimes sending the string-ified location object + // this is a best-effort attempt to show the href in that case + lastUrl = lastUrl['href'] as string | undefined + } + if (!lastUrl) { return null } diff --git a/frontend/src/scenes/urls.ts b/frontend/src/scenes/urls.ts index 293481f598e3b..477064de6497e 100644 --- a/frontend/src/scenes/urls.ts +++ b/frontend/src/scenes/urls.ts @@ -1,4 +1,3 @@ -import JSONCrush from 'jsoncrush' import { combineUrl } from 'kea-router' import { AlertType } from 'lib/components/Alerts/types' import { getCurrentTeamId } from 'lib/utils/getAppContext' @@ -75,8 +74,7 @@ export const urls = { insightNew: (type?: InsightType, dashboardId?: DashboardType['id'] | null, query?: Node): string => combineUrl('/insights/new', dashboardId ? { dashboard: dashboardId } : {}, { ...(type ? { insight: type } : {}), - // have to use JSONCrush directly rather than the util to avoid circular dep - ...(query ? { q: typeof query === 'string' ? query : JSONCrush.crush(JSON.stringify(query)) } : {}), + ...(query ? { q: typeof query === 'string' ? 
query : JSON.stringify(query) } : {}), }).url, insightNewHogQL: (query: string, filters?: HogQLFilters): string => combineUrl( diff --git a/jest.config.ts b/jest.config.ts index 39ab8232a5592..53e5df5943413 100644 --- a/jest.config.ts +++ b/jest.config.ts @@ -8,7 +8,7 @@ process.env.TZ = process.env.TZ || 'UTC' * https://jestjs.io/docs/en/configuration.html */ -const esmModules = ['query-selector-shadow-dom', 'react-syntax-highlighter', '@react-hook', '@medv', 'monaco-editor', 'jsoncrush'] +const esmModules = ['query-selector-shadow-dom', 'react-syntax-highlighter', '@react-hook', '@medv', 'monaco-editor'] const eeFolderExists = fs.existsSync('ee/frontend/exports.ts') function rootDirectories() { const rootDirectories = ['/frontend/src'] diff --git a/package.json b/package.json index 7d55acb0ca23e..58ce1231fde79 100644 --- a/package.json +++ b/package.json @@ -140,7 +140,6 @@ "hls.js": "^1.5.15", "husky": "^7.0.4", "image-blob-reduce": "^4.1.0", - "jsoncrush": "^1.1.8", "kea": "^3.1.5", "kea-forms": "^3.2.0", "kea-loaders": "^3.0.0", @@ -162,7 +161,7 @@ "pmtiles": "^2.11.0", "postcss": "^8.4.31", "postcss-preset-env": "^9.3.0", - "posthog-js": "1.202.4", + "posthog-js": "1.202.5", "posthog-js-lite": "3.0.0", "prettier": "^2.8.8", "prop-types": "^15.7.2", diff --git a/plugin-server/src/cdp/cdp-api.ts b/plugin-server/src/cdp/cdp-api.ts index ed4e60976b1e9..a5c3f84e4f276 100644 --- a/plugin-server/src/cdp/cdp-api.ts +++ b/plugin-server/src/cdp/cdp-api.ts @@ -136,6 +136,7 @@ export class CdpApi { id: team.id, name: team.name, url: `${this.hub.SITE_URL ?? 'http://localhost:8000'}/project/${team.id}`, + ...globals.project, }, }, compoundConfiguration, diff --git a/pnpm-lock.yaml b/pnpm-lock.yaml index 902f297bc841d..aeaf7828a60e2 100644 --- a/pnpm-lock.yaml +++ b/pnpm-lock.yaml @@ -241,9 +241,6 @@ dependencies: image-blob-reduce: specifier: ^4.1.0 version: 4.1.0 - jsoncrush: - specifier: ^1.1.8 - version: 1.1.8 kea: specifier: ^3.1.5 version: 3.1.5(react@18.2.0) @@ -308,8 +305,8 @@ dependencies: specifier: ^9.3.0 version: 9.3.0(postcss@8.4.31) posthog-js: - specifier: 1.202.4 - version: 1.202.4 + specifier: 1.202.5 + version: 1.202.5 posthog-js-lite: specifier: 3.0.0 version: 3.0.0 @@ -15415,10 +15412,6 @@ packages: resolution: {integrity: sha512-gfFQZrcTc8CnKXp6Y4/CBT3fTc0OVuDofpre4aEeEpSBPV5X5v4+Vmx+8snU7RLPrNHPKSgLxGo9YuQzz20o+w==} dev: true - /jsoncrush@1.1.8: - resolution: {integrity: sha512-lvIMGzMUA0fjuqwNcxlTNRq2bibPZ9auqT/LyGdlR5hvydJtA/BasSgkx4qclqTKVeTidrJvsS/oVjlTCPQ4Nw==} - dev: false - /jsonfile@6.1.0: resolution: {integrity: sha512-5dgndWOriYSm5cnYaJNhalLNDKOqFwyDB/rr1E9ZsGciGvKPs8R2xYGCacuf3z6K1YKDz182fd+fY3cn3pMqXQ==} dependencies: @@ -17909,8 +17902,8 @@ packages: resolution: {integrity: sha512-dyajjnfzZD1tht4N7p7iwf7nBnR1MjVaVu+MKr+7gBgA39bn28wizCIJZztZPtHy4PY0YwtSGgwfBCuG/hnHgA==} dev: false - /posthog-js@1.202.4: - resolution: {integrity: sha512-YLu6f1ibAkiopGivGQnLBaCKegT+0GHP3DfP72z3KVby2UXLBB7dj+GIa54zfjburE7ehasysRzeK5nW39QOfA==} + /posthog-js@1.202.5: + resolution: {integrity: sha512-ZkUPkdO35Di2croJByHvBVp/sACq5KJEIaYRchUrr5lRK/57B/s6d5Y5PV374pbWpKhJdXRkUXd3ZUKJ8bOmfw==} dependencies: core-js: 3.39.0 fflate: 0.4.8 diff --git a/posthog/api/hog_function.py b/posthog/api/hog_function.py index 3f50d710acc96..9f9d54e7db1af 100644 --- a/posthog/api/hog_function.py +++ b/posthog/api/hog_function.py @@ -360,13 +360,13 @@ def invocations(self, request: Request, *args, **kwargs): # Remove the team from the config configuration.pop("team") - globals = 
serializer.validated_data["globals"] + hog_globals = serializer.validated_data["globals"] mock_async_functions = serializer.validated_data["mock_async_functions"] res = create_hog_invocation_test( team_id=hog_function.team_id, hog_function_id=hog_function.id, - globals=globals, + globals=hog_globals, configuration=configuration, mock_async_functions=mock_async_functions, ) diff --git a/posthog/conftest.py b/posthog/conftest.py index c27dbec43955e..e9804d25eff42 100644 --- a/posthog/conftest.py +++ b/posthog/conftest.py @@ -14,6 +14,7 @@ def create_clickhouse_tables(num_tables: int): CREATE_DATA_QUERIES, CREATE_DICTIONARY_QUERIES, CREATE_DISTRIBUTED_TABLE_QUERIES, + CREATE_KAFKA_TABLE_QUERIES, CREATE_MERGETREE_TABLE_QUERIES, CREATE_MV_TABLE_QUERIES, CREATE_VIEW_QUERIES, @@ -28,10 +29,18 @@ def create_clickhouse_tables(num_tables: int): + len(CREATE_DICTIONARY_QUERIES) ) + # Evaluation tests use Kafka for faster data ingestion. + if settings.IN_EVAL_TESTING: + total_tables += len(CREATE_KAFKA_TABLE_QUERIES) + # Check if all the tables have already been created. Views, materialized views, and dictionaries also count if num_tables == total_tables: return + if settings.IN_EVAL_TESTING: + kafka_table_queries = list(map(build_query, CREATE_KAFKA_TABLE_QUERIES)) + run_clickhouse_statement_in_parallel(kafka_table_queries) + table_queries = list(map(build_query, CREATE_MERGETREE_TABLE_QUERIES + CREATE_DISTRIBUTED_TABLE_QUERIES)) run_clickhouse_statement_in_parallel(table_queries) @@ -62,7 +71,7 @@ def reset_clickhouse_tables(): from posthog.models.channel_type.sql import TRUNCATE_CHANNEL_DEFINITION_TABLE_SQL from posthog.models.cohort.sql import TRUNCATE_COHORTPEOPLE_TABLE_SQL from posthog.models.error_tracking.sql import TRUNCATE_ERROR_TRACKING_ISSUE_FINGERPRINT_OVERRIDES_TABLE_SQL - from posthog.models.event.sql import TRUNCATE_EVENTS_TABLE_SQL, TRUNCATE_EVENTS_RECENT_TABLE_SQL + from posthog.models.event.sql import TRUNCATE_EVENTS_RECENT_TABLE_SQL, TRUNCATE_EVENTS_TABLE_SQL from posthog.models.group.sql import TRUNCATE_GROUPS_TABLE_SQL from posthog.models.performance.sql import TRUNCATE_PERFORMANCE_EVENTS_TABLE_SQL from posthog.models.person.sql import ( @@ -100,6 +109,18 @@ def reset_clickhouse_tables(): TRUNCATE_HEATMAPS_TABLE_SQL(), ] + # Drop created Kafka tables because some tests don't expect it. + if settings.IN_EVAL_TESTING: + kafka_tables = sync_execute( + f""" + SELECT name + FROM system.tables + WHERE database = '{settings.CLICKHOUSE_DATABASE}' AND name LIKE 'kafka_%' + """, + ) + # Using `ON CLUSTER` takes x20 more time to drop the tables: https://github.com/ClickHouse/ClickHouse/issues/15473. 
+ TABLES_TO_CREATE_DROP += [f"DROP TABLE {table[0]}" for table in kafka_tables] + run_clickhouse_statement_in_parallel(TABLES_TO_CREATE_DROP) from posthog.clickhouse.schema import ( diff --git a/posthog/hogql_queries/experiments/experiment_funnels_query_runner.py b/posthog/hogql_queries/experiments/experiment_funnels_query_runner.py index d9d874d7b6e6b..363ee06bc78be 100644 --- a/posthog/hogql_queries/experiments/experiment_funnels_query_runner.py +++ b/posthog/hogql_queries/experiments/experiment_funnels_query_runner.py @@ -7,6 +7,11 @@ calculate_credible_intervals, calculate_probabilities, ) +from posthog.hogql_queries.experiments.funnels_statistics_v2 import ( + are_results_significant_v2, + calculate_credible_intervals_v2, + calculate_probabilities_v2, +) from posthog.hogql_queries.query_runner import QueryRunner from posthog.models.experiment import Experiment from ..insights.funnels.funnels_query_runner import FunnelsQueryRunner @@ -45,6 +50,8 @@ def __init__(self, *args, **kwargs): if self.experiment.holdout: self.variants.append(f"holdout-{self.experiment.holdout.id}") + self.stats_version = self.query.stats_version or 1 + self.prepared_funnels_query = self._prepare_funnel_query() self.funnels_query_runner = FunnelsQueryRunner( query=self.prepared_funnels_query, team=self.team, timings=self.timings, limit_context=self.limit_context @@ -63,9 +70,14 @@ def calculate(self) -> ExperimentFunnelsQueryResponse: # Statistical analysis control_variant, test_variants = self._get_variants_with_base_stats(funnels_result) - probabilities = calculate_probabilities(control_variant, test_variants) - significance_code, loss = are_results_significant(control_variant, test_variants, probabilities) - credible_intervals = calculate_credible_intervals([control_variant, *test_variants]) + if self.stats_version == 2: + probabilities = calculate_probabilities_v2(control_variant, test_variants) + significance_code, loss = are_results_significant_v2(control_variant, test_variants, probabilities) + credible_intervals = calculate_credible_intervals_v2([control_variant, *test_variants]) + else: + probabilities = calculate_probabilities(control_variant, test_variants) + significance_code, loss = are_results_significant(control_variant, test_variants, probabilities) + credible_intervals = calculate_credible_intervals([control_variant, *test_variants]) except Exception as e: raise ValueError(f"Error calculating experiment funnel results: {str(e)}") from e @@ -80,6 +92,7 @@ def calculate(self) -> ExperimentFunnelsQueryResponse: }, significant=significance_code == ExperimentSignificanceCode.SIGNIFICANT, significance_code=significance_code, + stats_version=self.stats_version, expected_loss=loss, credible_intervals=credible_intervals, ) diff --git a/posthog/hogql_queries/experiments/funnels_statistics_v2.py b/posthog/hogql_queries/experiments/funnels_statistics_v2.py new file mode 100644 index 0000000000000..02f18d2f70740 --- /dev/null +++ b/posthog/hogql_queries/experiments/funnels_statistics_v2.py @@ -0,0 +1,216 @@ +import numpy as np +from scipy import stats +from posthog.schema import ExperimentVariantFunnelsBaseStats, ExperimentSignificanceCode +from posthog.hogql_queries.experiments import ( + FF_DISTRIBUTION_THRESHOLD, + MIN_PROBABILITY_FOR_SIGNIFICANCE, + EXPECTED_LOSS_SIGNIFICANCE_LEVEL, +) +from scipy.stats import betabinom + +ALPHA_PRIOR = 1 +BETA_PRIOR = 1 +SAMPLE_SIZE = 10000 + + +def calculate_probabilities_v2( + control: ExperimentVariantFunnelsBaseStats, variants: 
list[ExperimentVariantFunnelsBaseStats] +) -> list[float]: + """ + Calculate the win probabilities for each variant in an experiment using Bayesian analysis + for funnel conversion rates. + + This function computes the probability that each variant is the best (i.e., has the highest + conversion rate) compared to all other variants, including the control. It uses samples + drawn from the posterior Beta distributions of each variant's conversion rate. + + Parameters: + ----------- + control : ExperimentVariantFunnelsBaseStats + Statistics for the control group, including success and failure counts + variants : list[ExperimentVariantFunnelsBaseStats] + List of statistics for test variants to compare against the control + + Returns: + -------- + list[float] + A list of probabilities where: + - The first element is the probability that the control variant is the best + - Subsequent elements are the probabilities that each test variant is the best + + Notes: + ------ + - Uses a Bayesian approach with Beta distributions as the posterior + - Uses Beta(1,1) as the prior, which is uniform over [0,1] + - Draws 10,000 samples from each variant's posterior distribution + """ + all_variants = [control, *variants] + + # Use Beta distribution for conversion rates + samples: list[np.ndarray] = [] + for variant in all_variants: + # Add prior to both successes and failures for Bayesian prior + alpha = ALPHA_PRIOR + variant.success_count + beta = BETA_PRIOR + variant.failure_count + # Generate samples from Beta distribution + variant_samples = np.random.beta(alpha, beta, SAMPLE_SIZE) + samples.append(variant_samples) + + samples_array = np.array(samples) + # Calculate probability of each variant being the best + probabilities = [] + for i in range(len(all_variants)): + probability = (samples_array[i] == np.max(samples_array, axis=0)).mean() + probabilities.append(float(probability)) + + return probabilities + + +def calculate_expected_loss_v2( + target_variant: ExperimentVariantFunnelsBaseStats, variants: list[ExperimentVariantFunnelsBaseStats] +) -> float: + """ + Calculates expected loss in conversion rate using Beta-Binomial conjugate prior. + + This implementation uses a Bayesian approach with Beta-Binomial model + to estimate the expected loss when choosing the target variant over others. 
+ + Parameters: + ----------- + target_variant : ExperimentVariantFunnelsBaseStats + The variant being evaluated for loss + variants : list[ExperimentVariantFunnelsBaseStats] + List of other variants to compare against + + Returns: + -------- + float + Expected loss in conversion rate if choosing the target variant + """ + # Calculate posterior parameters for target variant + target_alpha = int(ALPHA_PRIOR + target_variant.success_count) + target_beta = int(BETA_PRIOR + target_variant.failure_count) + target_n = int(target_variant.success_count + target_variant.failure_count) + + # Get samples from target variant's Beta-Binomial + target_samples = betabinom.rvs(target_n, target_alpha, target_beta, size=SAMPLE_SIZE) / target_n + + # Get samples from each comparison variant + variant_samples = [] + for variant in variants: + n = int(variant.success_count + variant.failure_count) + alpha = int(ALPHA_PRIOR + variant.success_count) + beta = int(BETA_PRIOR + variant.failure_count) + samples = betabinom.rvs(n, alpha, beta, size=SAMPLE_SIZE) / n + variant_samples.append(samples) + + # Calculate loss + variant_max = np.maximum.reduce(variant_samples) + losses = np.maximum(0, variant_max - target_samples) + expected_loss = float(np.mean(losses)) + + return expected_loss + + +def are_results_significant_v2( + control: ExperimentVariantFunnelsBaseStats, + variants: list[ExperimentVariantFunnelsBaseStats], + probabilities: list[float], +) -> tuple[ExperimentSignificanceCode, float]: + """ + Determine if the experiment results are statistically significant using Bayesian analysis + for funnel conversion rates. + + This function evaluates whether there is strong evidence that any variant is better + than the others by considering both winning probabilities and expected loss. It checks + if the sample size is sufficient and evaluates the risk of choosing the winning variant. + + Parameters: + ----------- + control : ExperimentVariantFunnelsBaseStats + Statistics for the control group, including success and failure counts + variants : list[ExperimentVariantFunnelsBaseStats] + List of statistics for test variants to compare against the control + probabilities : list[float] + List of probabilities from calculate_probabilities_v2 + + Returns: + -------- + tuple[ExperimentSignificanceCode, float] + A tuple containing: + - Significance code indicating the result (significant, not enough exposure, high loss, etc.) 
+ - Expected loss value for significant results, 1.0 for non-significant results + + Notes: + ------ + - Requires minimum exposure threshold per variant for reliable results + - Uses probability threshold from MIN_PROBABILITY_FOR_SIGNIFICANCE + - Calculates expected loss for the best-performing variant + - Returns HIGH_LOSS if expected loss exceeds significance threshold + - Returns NOT_ENOUGH_EXPOSURE if sample size requirements not met + """ + # Check minimum exposure + if control.success_count + control.failure_count < FF_DISTRIBUTION_THRESHOLD or any( + v.success_count + v.failure_count < FF_DISTRIBUTION_THRESHOLD for v in variants + ): + return ExperimentSignificanceCode.NOT_ENOUGH_EXPOSURE, 1.0 + + # Check if any variant has high enough probability + max_probability = max(probabilities) + if max_probability >= MIN_PROBABILITY_FOR_SIGNIFICANCE: + # Find best performing variant + all_variants = [control, *variants] + conversion_rates = [v.success_count / (v.success_count + v.failure_count) for v in all_variants] + best_idx = np.argmax(conversion_rates) + best_variant = all_variants[best_idx] + other_variants = all_variants[:best_idx] + all_variants[best_idx + 1 :] + expected_loss = calculate_expected_loss_v2(best_variant, other_variants) + + if expected_loss >= EXPECTED_LOSS_SIGNIFICANCE_LEVEL: + return ExperimentSignificanceCode.HIGH_LOSS, expected_loss + + return ExperimentSignificanceCode.SIGNIFICANT, expected_loss + + return ExperimentSignificanceCode.LOW_WIN_PROBABILITY, 1.0 + + +def calculate_credible_intervals_v2(variants: list[ExperimentVariantFunnelsBaseStats]) -> dict[str, list[float]]: + """ + Calculate Bayesian credible intervals for conversion rates of each variant. + + This function computes the 95% credible intervals for the true conversion rate + of each variant, representing the range where we believe the true rate lies + with 95% probability. 
+ + Parameters: + ----------- + variants : list[ExperimentVariantFunnelsBaseStats] + List of all variants including control, containing success and failure counts + + Returns: + -------- + dict[str, list[float]] + Dictionary mapping variant keys to [lower, upper] credible intervals, where: + - lower is the 2.5th percentile of the posterior distribution + - upper is the 97.5th percentile of the posterior distribution + + Notes: + ------ + - Uses Beta distribution as the posterior + - Uses Beta(1,1) as the prior, which is uniform over [0,1] + - Returns 95% credible intervals + - Intervals become narrower with larger sample sizes + """ + intervals = {} + + for variant in variants: + # Add 1 to both successes and failures for Bayesian prior + alpha = ALPHA_PRIOR + variant.success_count + beta = BETA_PRIOR + variant.failure_count + + # Calculate 95% credible interval + lower, upper = stats.beta.ppf([0.025, 0.975], alpha, beta) + + intervals[variant.key] = [float(lower), float(upper)] + + return intervals diff --git a/posthog/hogql_queries/experiments/test/test_experiment_funnels_query_runner.py b/posthog/hogql_queries/experiments/test/test_experiment_funnels_query_runner.py index 3f3127fcb1b55..3dc11499b1393 100644 --- a/posthog/hogql_queries/experiments/test/test_experiment_funnels_query_runner.py +++ b/posthog/hogql_queries/experiments/test/test_experiment_funnels_query_runner.py @@ -113,6 +113,8 @@ def test_query_runner(self): ) result = query_runner.calculate() + self.assertEqual(result.stats_version, 1) + self.assertEqual(len(result.variants), 2) control_variant = next(variant for variant in result.variants if variant.key == "control") @@ -123,11 +125,88 @@ def test_query_runner(self): self.assertEqual(test_variant.success_count, 8) self.assertEqual(test_variant.failure_count, 2) - self.assertIn("control", result.probability) - self.assertIn("test", result.probability) + self.assertAlmostEqual(result.probability["control"], 0.2, delta=0.1) + self.assertAlmostEqual(result.probability["test"], 0.8, delta=0.1) - self.assertIn("control", result.credible_intervals) - self.assertIn("test", result.credible_intervals) + self.assertEqual(result.significant, False) + self.assertEqual(result.significance_code, ExperimentSignificanceCode.NOT_ENOUGH_EXPOSURE) + self.assertEqual(result.expected_loss, 1.0) + + self.assertAlmostEqual(result.credible_intervals["control"][0], 0.3, delta=0.1) + self.assertAlmostEqual(result.credible_intervals["control"][1], 0.8, delta=0.1) + self.assertAlmostEqual(result.credible_intervals["test"][0], 0.5, delta=0.1) + self.assertAlmostEqual(result.credible_intervals["test"][1], 0.9, delta=0.1) + + @freeze_time("2020-01-01T12:00:00Z") + def test_query_runner_v2(self): + feature_flag = self.create_feature_flag() + experiment = self.create_experiment(feature_flag=feature_flag) + + feature_flag_property = f"$feature/{feature_flag.key}" + + funnels_query = FunnelsQuery( + series=[EventsNode(event="$pageview"), EventsNode(event="purchase")], + dateRange={"date_from": "2020-01-01", "date_to": "2020-01-14"}, + ) + experiment_query = ExperimentFunnelsQuery( + experiment_id=experiment.id, + kind="ExperimentFunnelsQuery", + funnels_query=funnels_query, + stats_version=2, + ) + + experiment.metrics = [{"type": "primary", "query": experiment_query.model_dump()}] + experiment.save() + + for variant, purchase_count in [("control", 6), ("test", 8)]: + for i in range(10): + _create_person(distinct_ids=[f"user_{variant}_{i}"], team_id=self.team.pk) + _create_event( + team=self.team, + 
event="$pageview", + distinct_id=f"user_{variant}_{i}", + timestamp="2020-01-02T12:00:00Z", + properties={feature_flag_property: variant}, + ) + if i < purchase_count: + _create_event( + team=self.team, + event="purchase", + distinct_id=f"user_{variant}_{i}", + timestamp="2020-01-02T12:01:00Z", + properties={feature_flag_property: variant}, + ) + + flush_persons_and_events() + + query_runner = ExperimentFunnelsQueryRunner( + query=ExperimentFunnelsQuery(**experiment.metrics[0]["query"]), team=self.team + ) + result = query_runner.calculate() + + self.assertEqual(result.stats_version, 2) + + self.assertEqual(len(result.variants), 2) + + control_variant = next(variant for variant in result.variants if variant.key == "control") + test_variant = next(variant for variant in result.variants if variant.key == "test") + + self.assertEqual(control_variant.success_count, 6) + self.assertEqual(control_variant.failure_count, 4) + self.assertEqual(test_variant.success_count, 8) + self.assertEqual(test_variant.failure_count, 2) + + self.assertAlmostEqual(result.probability["control"], 0.2, delta=0.1) + self.assertAlmostEqual(result.probability["test"], 0.8, delta=0.1) + + self.assertEqual(result.significant, False) + self.assertEqual(result.significance_code, ExperimentSignificanceCode.NOT_ENOUGH_EXPOSURE) + self.assertEqual(result.expected_loss, 1.0) + + self.assertAlmostEqual(result.credible_intervals["control"][0], 0.3, delta=0.1) + self.assertAlmostEqual(result.credible_intervals["control"][1], 0.8, delta=0.1) + self.assertAlmostEqual(result.credible_intervals["test"][0], 0.5, delta=0.1) + self.assertAlmostEqual(result.credible_intervals["test"][1], 0.9, delta=0.1) @flaky(max_runs=10, min_passes=1) @freeze_time("2020-01-01T12:00:00Z") diff --git a/posthog/hogql_queries/experiments/test/test_funnels_statistics.py b/posthog/hogql_queries/experiments/test/test_funnels_statistics.py new file mode 100644 index 0000000000000..2206ff92b9305 --- /dev/null +++ b/posthog/hogql_queries/experiments/test/test_funnels_statistics.py @@ -0,0 +1,243 @@ +from posthog.hogql_queries.experiments import MIN_PROBABILITY_FOR_SIGNIFICANCE +from posthog.schema import ExperimentVariantFunnelsBaseStats, ExperimentSignificanceCode +from posthog.hogql_queries.experiments.funnels_statistics_v2 import ( + calculate_probabilities_v2, + are_results_significant_v2, + calculate_credible_intervals_v2, +) +from posthog.hogql_queries.experiments.funnels_statistics import ( + calculate_probabilities, + are_results_significant, + calculate_credible_intervals, +) +from posthog.test.base import APIBaseTest + + +def create_variant( + key: str, + success_count: int, + failure_count: int, +) -> ExperimentVariantFunnelsBaseStats: + return ExperimentVariantFunnelsBaseStats( + key=key, + success_count=success_count, + failure_count=failure_count, + ) + + +class TestExperimentFunnelStatistics(APIBaseTest): + def run_test_for_both_implementations(self, test_fn): + """Run the same test for both implementations""" + self.stats_version = 1 + # Run for original implementation + test_fn( + stats_version=1, + calculate_probabilities=calculate_probabilities, + are_results_significant=are_results_significant, + calculate_credible_intervals=calculate_credible_intervals, + ) + self.stats_version = 2 + # Run for v2 implementation + test_fn( + stats_version=2, + calculate_probabilities=calculate_probabilities_v2, + are_results_significant=are_results_significant_v2, + calculate_credible_intervals=calculate_credible_intervals_v2, + ) + + def 
test_small_sample_two_variants_not_significant(self): + """Test with small sample size, two variants, no clear winner""" + + def run_test(stats_version, calculate_probabilities, are_results_significant, calculate_credible_intervals): + control = create_variant("control", success_count=10, failure_count=90) + test = create_variant("test", success_count=15, failure_count=85) + + probabilities = calculate_probabilities(control, [test]) + significance, p_value = are_results_significant(control, [test], probabilities) + intervals = calculate_credible_intervals([control, test]) + + self.assertEqual(len(probabilities), 2) + if stats_version == 2: + self.assertAlmostEqual(probabilities[0], 0.15, delta=0.1) + self.assertAlmostEqual(probabilities[1], 0.85, delta=0.1) + self.assertEqual(significance, ExperimentSignificanceCode.LOW_WIN_PROBABILITY) + self.assertEqual(p_value, 1) + + # Check credible intervals + self.assertAlmostEqual(intervals["control"][0], 0.05, delta=0.05) + self.assertAlmostEqual(intervals["control"][1], 0.20, delta=0.05) + self.assertAlmostEqual(intervals["test"][0], 0.08, delta=0.05) + self.assertAlmostEqual(intervals["test"][1], 0.25, delta=0.05) + else: + # Original implementation behavior + self.assertTrue(0.1 < probabilities[0] < 0.5) + self.assertTrue(0.5 < probabilities[1] < 0.9) + self.assertEqual(significance, ExperimentSignificanceCode.LOW_WIN_PROBABILITY) + self.assertEqual(p_value, 1) + + # Original implementation intervals + self.assertAlmostEqual(intervals["control"][0], 0.05, delta=0.05) + self.assertAlmostEqual(intervals["control"][1], 0.20, delta=0.05) + self.assertAlmostEqual(intervals["test"][0], 0.08, delta=0.05) + self.assertAlmostEqual(intervals["test"][1], 0.25, delta=0.05) + + self.run_test_for_both_implementations(run_test) + + def test_large_sample_two_variants_significant(self): + """Test with large sample size, two variants, clear winner""" + + def run_test(stats_version, calculate_probabilities, are_results_significant, calculate_credible_intervals): + control = create_variant("control", success_count=1000, failure_count=9000) + test = create_variant("test", success_count=1500, failure_count=8500) + + probabilities = calculate_probabilities(control, [test]) + significance, p_value = are_results_significant(control, [test], probabilities) + intervals = calculate_credible_intervals([control, test]) + + self.assertEqual(len(probabilities), 2) + if stats_version == 2: + self.assertAlmostEqual(probabilities[1], 1.0, delta=0.05) + self.assertAlmostEqual(probabilities[0], 0.0, delta=0.05) + self.assertEqual(significance, ExperimentSignificanceCode.SIGNIFICANT) + self.assertEqual(p_value, 0) + + # Check credible intervals + self.assertAlmostEqual(intervals["control"][0], 0.095, delta=0.01) + self.assertAlmostEqual(intervals["control"][1], 0.105, delta=0.01) + self.assertAlmostEqual(intervals["test"][0], 0.145, delta=0.01) + self.assertAlmostEqual(intervals["test"][1], 0.155, delta=0.01) + else: + # Original implementation behavior + self.assertTrue(probabilities[1] > 0.5) # Test variant winning + self.assertTrue(probabilities[0] < 0.5) # Control variant losing + self.assertEqual(significance, ExperimentSignificanceCode.SIGNIFICANT) + self.assertLess(p_value, 0.05) + + # Original implementation intervals + self.assertAlmostEqual(intervals["control"][0], 0.095, delta=0.01) + self.assertAlmostEqual(intervals["control"][1], 0.105, delta=0.01) + self.assertAlmostEqual(intervals["test"][0], 0.145, delta=0.01) + self.assertAlmostEqual(intervals["test"][1], 0.155, 
delta=0.01) + + self.run_test_for_both_implementations(run_test) + + def test_many_variants_not_significant(self): + """Test with multiple variants, no clear winner""" + + def run_test(stats_version, calculate_probabilities, are_results_significant, calculate_credible_intervals): + control = create_variant("control", success_count=100, failure_count=900) + test_a = create_variant("test_a", success_count=98, failure_count=902) + test_b = create_variant("test_b", success_count=102, failure_count=898) + test_c = create_variant("test_c", success_count=101, failure_count=899) + + probabilities = calculate_probabilities(control, [test_a, test_b, test_c]) + significance, p_value = are_results_significant(control, [test_a, test_b, test_c], probabilities) + intervals = calculate_credible_intervals([control, test_a, test_b, test_c]) + + self.assertEqual(len(probabilities), 4) + if stats_version == 2: + self.assertTrue(all(p < MIN_PROBABILITY_FOR_SIGNIFICANCE for p in probabilities)) + self.assertEqual(significance, ExperimentSignificanceCode.LOW_WIN_PROBABILITY) + self.assertEqual(p_value, 1) + + # Check credible intervals overlap + # Check credible intervals for control and all test variants + self.assertAlmostEqual(intervals["control"][0], 0.09, delta=0.02) + self.assertAlmostEqual(intervals["control"][1], 0.12, delta=0.02) + self.assertAlmostEqual(intervals["test_a"][0], 0.09, delta=0.02) + self.assertAlmostEqual(intervals["test_a"][1], 0.12, delta=0.02) + self.assertAlmostEqual(intervals["test_b"][0], 0.09, delta=0.02) + self.assertAlmostEqual(intervals["test_b"][1], 0.12, delta=0.02) + self.assertAlmostEqual(intervals["test_c"][0], 0.09, delta=0.02) + self.assertAlmostEqual(intervals["test_c"][1], 0.12, delta=0.02) + else: + # Original implementation behavior + self.assertTrue(all(0.1 < p < 0.9 for p in probabilities)) + self.assertEqual(significance, ExperimentSignificanceCode.LOW_WIN_PROBABILITY) + self.assertEqual(p_value, 1) + + # Check credible intervals overlap + # Check credible intervals for control and all test variants + self.assertAlmostEqual(intervals["control"][0], 0.09, delta=0.02) + self.assertAlmostEqual(intervals["control"][1], 0.12, delta=0.02) + self.assertAlmostEqual(intervals["test_a"][0], 0.09, delta=0.02) + self.assertAlmostEqual(intervals["test_a"][1], 0.12, delta=0.02) + self.assertAlmostEqual(intervals["test_b"][0], 0.09, delta=0.02) + self.assertAlmostEqual(intervals["test_b"][1], 0.12, delta=0.02) + self.assertAlmostEqual(intervals["test_c"][0], 0.09, delta=0.02) + self.assertAlmostEqual(intervals["test_c"][1], 0.12, delta=0.02) + + self.run_test_for_both_implementations(run_test) + + def test_insufficient_sample_size(self): + """Test with sample size below threshold""" + + def run_test(stats_version, calculate_probabilities, are_results_significant, calculate_credible_intervals): + control = create_variant("control", success_count=5, failure_count=45) + test = create_variant("test", success_count=8, failure_count=42) + + probabilities = calculate_probabilities(control, [test]) + significance, p_value = are_results_significant(control, [test], probabilities) + intervals = calculate_credible_intervals([control, test]) + + self.assertEqual(len(probabilities), 2) + if stats_version == 2: + self.assertEqual(significance, ExperimentSignificanceCode.NOT_ENOUGH_EXPOSURE) + self.assertEqual(p_value, 1.0) + + # Check wide credible intervals due to small sample + self.assertTrue(intervals["control"][1] - intervals["control"][0] > 0.15) + self.assertTrue(intervals["test"][1] - 
intervals["test"][0] > 0.15)
+            else:
+                # Original implementation behavior
+                self.assertEqual(significance, ExperimentSignificanceCode.NOT_ENOUGH_EXPOSURE)
+                self.assertEqual(p_value, 1.0)
+
+                # Check wide credible intervals
+                self.assertTrue(intervals["control"][1] - intervals["control"][0] > 0.15)
+                self.assertTrue(intervals["test"][1] - intervals["test"][0] > 0.15)
+
+        self.run_test_for_both_implementations(run_test)
+
+    def test_expected_loss_minimal_difference(self):
+        """Test expected loss when variants have very similar performance"""
+
+        def run_test(stats_version, calculate_probabilities, are_results_significant, calculate_credible_intervals):
+            control = create_variant("control", success_count=1000, failure_count=9000)  # 10% conversion
+            test = create_variant("test", success_count=1050, failure_count=8800)  # ~10.7% conversion
+
+            probabilities = calculate_probabilities(control, [test])
+            significance, expected_loss = are_results_significant(control, [test], probabilities)
+
+            if stats_version == 2:
+                self.assertEqual(significance, ExperimentSignificanceCode.SIGNIFICANT)
+                # Expected loss should still be relatively small
+                self.assertLess(expected_loss, 0.03)  # Less than 3% expected loss
+                self.assertGreater(expected_loss, 0)
+            else:
+                # Original implementation behavior
+                self.assertEqual(significance, ExperimentSignificanceCode.SIGNIFICANT)
+                self.assertLess(expected_loss, 0.03)  # Less than 3% expected loss
+                self.assertGreater(expected_loss, 0)
+
+        self.run_test_for_both_implementations(run_test)
+
+    def test_expected_loss_test_variant_clear_winner(self):
+        """Test expected loss when one variant is clearly better"""
+
+        def run_test(stats_version, calculate_probabilities, are_results_significant, calculate_credible_intervals):
+            control = create_variant("control", success_count=1000, failure_count=9000)  # 10% conversion
+            test = create_variant("test", success_count=2000, failure_count=8000)  # 20% conversion
+
+            probabilities = calculate_probabilities(control, [test])
+            significance, expected_loss = are_results_significant(control, [test], probabilities)
+
+            if stats_version == 2:
+                self.assertEqual(significance, ExperimentSignificanceCode.SIGNIFICANT)
+                self.assertEqual(expected_loss, 0.0)
+            else:
+                # Original implementation behavior
+                self.assertEqual(significance, ExperimentSignificanceCode.SIGNIFICANT)
+                self.assertEqual(expected_loss, 0.0)
+
+        self.run_test_for_both_implementations(run_test)
diff --git a/posthog/schema.py b/posthog/schema.py
index 564dcc321fa60..a46596350f23f 100644
--- a/posthog/schema.py
+++ b/posthog/schema.py
@@ -6292,6 +6292,7 @@ class QueryResponseAlternative15(BaseModel):
     probability: dict[str, float]
     significance_code: ExperimentSignificanceCode
     significant: bool
+    stats_version: Optional[int] = None
     variants: list[ExperimentVariantFunnelsBaseStats]
@@ -6324,6 +6325,7 @@ class QueryResponseAlternative26(BaseModel):
     probability: dict[str, float]
     significance_code: ExperimentSignificanceCode
     significant: bool
+    stats_version: Optional[int] = None
     variants: list[ExperimentVariantFunnelsBaseStats]
@@ -6461,6 +6463,7 @@ class CachedExperimentFunnelsQueryResponse(BaseModel):
     )
     significance_code: ExperimentSignificanceCode
     significant: bool
+    stats_version: Optional[int] = None
     timezone: str
     variants: list[ExperimentVariantFunnelsBaseStats]
@@ -6477,6 +6480,7 @@ class Response9(BaseModel):
     probability: dict[str, float]
     significance_code: ExperimentSignificanceCode
     significant: bool
+    stats_version: Optional[int] = None
     variants: 
list[ExperimentVariantFunnelsBaseStats] @@ -6508,6 +6512,7 @@ class ExperimentFunnelsQueryResponse(BaseModel): probability: dict[str, float] significance_code: ExperimentSignificanceCode significant: bool + stats_version: Optional[int] = None variants: list[ExperimentVariantFunnelsBaseStats] @@ -6640,6 +6645,7 @@ class ExperimentFunnelsQuery(BaseModel): ) name: Optional[str] = None response: Optional[ExperimentFunnelsQueryResponse] = None + stats_version: Optional[int] = None class FunnelCorrelationQuery(BaseModel): diff --git a/posthog/settings/__init__.py b/posthog/settings/__init__.py index c6067fd19c1f7..3e7ebc0b7c984 100644 --- a/posthog/settings/__init__.py +++ b/posthog/settings/__init__.py @@ -108,6 +108,7 @@ PROM_PUSHGATEWAY_ADDRESS: str | None = os.getenv("PROM_PUSHGATEWAY_ADDRESS", None) IN_UNIT_TESTING: bool = get_from_env("IN_UNIT_TESTING", False, type_cast=str_to_bool) +IN_EVAL_TESTING: bool = get_from_env("DEEPEVAL", False, type_cast=str_to_bool) HOGQL_INCREASED_MAX_EXECUTION_TIME: int = get_from_env("HOGQL_INCREASED_MAX_EXECUTION_TIME", 600, type_cast=int) diff --git a/posthog/temporal/data_imports/external_data_job.py b/posthog/temporal/data_imports/external_data_job.py index 62a1e1bc834ed..59508a2ee6f25 100644 --- a/posthog/temporal/data_imports/external_data_job.py +++ b/posthog/temporal/data_imports/external_data_job.py @@ -3,12 +3,17 @@ import datetime as dt import json import re +import threading +import time from django.conf import settings from django.db import close_old_connections import posthoganalytics +import psutil from temporalio import activity, exceptions, workflow from temporalio.common import RetryPolicy +from temporalio.exceptions import WorkflowAlreadyStartedError + from posthog.constants import DATA_WAREHOUSE_TASK_QUEUE_V2 @@ -144,20 +149,22 @@ def trigger_pipeline_v2(inputs: ExternalDataWorkflowInputs): logger.debug("Triggering V2 pipeline") temporal = sync_connect() - - asyncio.run( - temporal.start_workflow( - workflow="external-data-job", - arg=dataclasses.asdict(inputs), - id=f"{inputs.external_data_schema_id}-V2", - task_queue=str(DATA_WAREHOUSE_TASK_QUEUE_V2), - retry_policy=RetryPolicy( - maximum_interval=dt.timedelta(seconds=60), - maximum_attempts=1, - non_retryable_error_types=["NondeterminismError"], - ), + try: + asyncio.run( + temporal.start_workflow( + workflow="external-data-job", + arg=dataclasses.asdict(inputs), + id=f"{inputs.external_data_schema_id}-V2", + task_queue=str(DATA_WAREHOUSE_TASK_QUEUE_V2), + retry_policy=RetryPolicy( + maximum_interval=dt.timedelta(seconds=60), + maximum_attempts=1, + non_retryable_error_types=["NondeterminismError"], + ), + ) ) - ) + except WorkflowAlreadyStartedError: + pass logger.debug("V2 pipeline triggered") @@ -173,6 +180,22 @@ def create_source_templates(inputs: CreateSourceTemplateInputs) -> None: create_warehouse_templates_for_source(team_id=inputs.team_id, run_id=inputs.run_id) +def log_memory_usage(): + process = psutil.Process() + logger = bind_temporal_worker_logger_sync(team_id=0) + + while True: + memory_info = process.memory_info() + logger.info(f"Memory Usage: RSS = {memory_info.rss / (1024 * 1024):.2f} MB") + + time.sleep(10) # Log every 10 seconds + + +if settings.TEMPORAL_TASK_QUEUE == DATA_WAREHOUSE_TASK_QUEUE_V2: + thread = threading.Thread(target=log_memory_usage, daemon=True) + thread.start() + + # TODO: update retry policies @workflow.defn(name="external-data-job") class ExternalDataJobWorkflow(PostHogWorkflow): diff --git 
a/posthog/temporal/data_imports/pipelines/sql_database/helpers.py b/posthog/temporal/data_imports/pipelines/sql_database/helpers.py index 0400a60b32fd5..9bf72a26f3c1e 100644 --- a/posthog/temporal/data_imports/pipelines/sql_database/helpers.py +++ b/posthog/temporal/data_imports/pipelines/sql_database/helpers.py @@ -24,7 +24,7 @@ def __init__( self, engine: Engine, table: Table, - chunk_size: int = 1000, + chunk_size: int = DEFAULT_CHUNK_SIZE, incremental: Optional[dlt.sources.incremental[Any]] = None, connect_args: Optional[list[str]] = None, db_incremental_field_last_value: Optional[Any] = None, diff --git a/posthog/temporal/data_imports/pipelines/sql_database_v2/__init__.py b/posthog/temporal/data_imports/pipelines/sql_database_v2/__init__.py index a3fc1c6b2838b..0c49b04ba1d1c 100644 --- a/posthog/temporal/data_imports/pipelines/sql_database_v2/__init__.py +++ b/posthog/temporal/data_imports/pipelines/sql_database_v2/__init__.py @@ -15,6 +15,7 @@ from dlt.sources.credentials import ConnectionStringCredentials from posthog.settings.utils import get_from_env +from posthog.temporal.data_imports.pipelines.sql_database_v2.settings import DEFAULT_CHUNK_SIZE from posthog.temporal.data_imports.pipelines.sql_database_v2._json import BigQueryJSON from posthog.utils import str_to_bool from posthog.warehouse.models import ExternalDataSource @@ -252,7 +253,7 @@ def sql_database( schema: Optional[str] = dlt.config.value, metadata: Optional[MetaData] = None, table_names: Optional[list[str]] = dlt.config.value, - chunk_size: int = 50000, + chunk_size: int = DEFAULT_CHUNK_SIZE, backend: TableBackend = "pyarrow", detect_precision_hints: Optional[bool] = False, reflection_level: Optional[ReflectionLevel] = "full", @@ -365,7 +366,7 @@ def sql_table( schema: Optional[str] = dlt.config.value, metadata: Optional[MetaData] = None, incremental: Optional[dlt.sources.incremental[Any]] = None, - chunk_size: int = 50000, + chunk_size: int = DEFAULT_CHUNK_SIZE, backend: TableBackend = "sqlalchemy", detect_precision_hints: Optional[bool] = None, reflection_level: Optional[ReflectionLevel] = "full", diff --git a/posthog/temporal/data_imports/pipelines/sql_database_v2/helpers.py b/posthog/temporal/data_imports/pipelines/sql_database_v2/helpers.py index acd64c97aae99..74a79650caa15 100644 --- a/posthog/temporal/data_imports/pipelines/sql_database_v2/helpers.py +++ b/posthog/temporal/data_imports/pipelines/sql_database_v2/helpers.py @@ -18,6 +18,8 @@ from dlt.sources.credentials import ConnectionStringCredentials +from posthog.temporal.data_imports.pipelines.sql_database_v2.settings import DEFAULT_CHUNK_SIZE + from .arrow_helpers import row_tuples_to_arrow from .schema_types import ( default_table_adapter, @@ -44,7 +46,7 @@ def __init__( backend: TableBackend, table: Table, columns: TTableSchemaColumns, - chunk_size: int = 1000, + chunk_size: int = DEFAULT_CHUNK_SIZE, incremental: Optional[dlt.sources.incremental[Any]] = None, db_incremental_field_last_value: Optional[Any] = None, query_adapter_callback: Optional[TQueryAdapter] = None, @@ -302,7 +304,7 @@ class SqlTableResourceConfiguration(BaseConfiguration): table: Optional[str] = None schema: Optional[str] = None incremental: Optional[dlt.sources.incremental] = None # type: ignore[type-arg] - chunk_size: int = 50000 + chunk_size: int = DEFAULT_CHUNK_SIZE backend: TableBackend = "sqlalchemy" detect_precision_hints: Optional[bool] = None defer_table_reflect: Optional[bool] = False diff --git a/posthog/temporal/data_imports/pipelines/sql_database_v2/settings.py 
b/posthog/temporal/data_imports/pipelines/sql_database_v2/settings.py new file mode 100644 index 0000000000000..d730961c096e8 --- /dev/null +++ b/posthog/temporal/data_imports/pipelines/sql_database_v2/settings.py @@ -0,0 +1 @@ +DEFAULT_CHUNK_SIZE = 10_000 diff --git a/requirements.in b/requirements.in index faefd16d9294d..8c0b98d16587b 100644 --- a/requirements.in +++ b/requirements.in @@ -16,6 +16,7 @@ clickhouse-driver==0.2.7 clickhouse-pool==0.5.3 conditional-cache==1.2 cryptography==39.0.2 +deltalake==0.22.3 dj-database-url==0.5.0 Django~=4.2.15 django-axes==5.9.0 @@ -34,7 +35,6 @@ djangorestframework==3.15.1 djangorestframework-csv==2.1.1 djangorestframework-dataclasses==1.2.0 dlt==1.3.0 -dlt[deltalake]==1.3.0 dnspython==2.2.1 drf-exceptions-hog==0.4.0 drf-extensions==0.7.0 diff --git a/requirements.txt b/requirements.txt index 639c98066ccd4..e4a1521f8dd7a 100644 --- a/requirements.txt +++ b/requirements.txt @@ -138,8 +138,8 @@ defusedxml==0.6.0 # via # python3-openid # social-auth-core -deltalake==0.19.1 - # via dlt +deltalake==0.22.3 + # via -r requirements.in distro==1.9.0 # via openai dj-database-url==0.5.0 @@ -273,8 +273,6 @@ googleapis-common-protos==1.60.0 # via # google-api-core # grpcio-status -greenlet==3.1.1 - # via sqlalchemy grpcio==1.63.2 # via # -r requirements.in @@ -505,7 +503,6 @@ pyarrow==17.0.0 # via # -r requirements.in # deltalake - # dlt # sqlalchemy-bigquery pyasn1==0.5.0 # via
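To make the new stats_version=2 path concrete, here is a minimal usage sketch of the three helpers added in posthog/hogql_queries/experiments/funnels_statistics_v2.py above. The variant counts are made-up toy numbers; whether the result counts as significant depends on the FF_DISTRIBUTION_THRESHOLD, MIN_PROBABILITY_FOR_SIGNIFICANCE and EXPECTED_LOSS_SIGNIFICANCE_LEVEL constants imported from posthog.hogql_queries.experiments.

from posthog.schema import ExperimentVariantFunnelsBaseStats
from posthog.hogql_queries.experiments.funnels_statistics_v2 import (
    calculate_probabilities_v2,
    are_results_significant_v2,
    calculate_credible_intervals_v2,
)

# Toy funnel counts: 10% vs. 15% conversion over 10,000 exposed users per variant
control = ExperimentVariantFunnelsBaseStats(key="control", success_count=1000, failure_count=9000)
test = ExperimentVariantFunnelsBaseStats(key="test", success_count=1500, failure_count=8500)

# Posterior win probabilities, sampled from Beta(1, 1) priors updated with the observed counts
probabilities = calculate_probabilities_v2(control, [test])

# Exposure check, then win-probability threshold, then expected loss of picking the leader
significance_code, expected_loss = are_results_significant_v2(control, [test], probabilities)

# 95% credible intervals on each variant's conversion rate, keyed by variant key
credible_intervals = calculate_credible_intervals_v2([control, test])

The v2 helpers keep the same signatures as the v1 functions in funnels_statistics.py, which is what lets ExperimentFunnelsQueryRunner above switch on stats_version without changing anything for its callers.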