diff --git a/bin/mprocs.yaml b/bin/mprocs.yaml index cb3761250d035..82409edab38f9 100644 --- a/bin/mprocs.yaml +++ b/bin/mprocs.yaml @@ -19,8 +19,7 @@ procs: shell: 'bin/check_kafka_clickhouse_up && bin/check_temporal_up && python manage.py start_temporal_worker' docker-compose: - shell: 'docker compose -f docker-compose.dev.yml up' - stop: - send-keys: [''] + # docker-compose makes sure the stack is up, and then follows its logs - but doesn't tear down on exit for speed + shell: 'docker compose -f docker-compose.dev.yml up -d && docker compose -f docker-compose.dev.yml logs --tail=0 -f' mouse_scroll_speed: 1 diff --git a/bin/start b/bin/start index ceaddede2140f..7ec881e57b81c 100755 --- a/bin/start +++ b/bin/start @@ -9,4 +9,14 @@ export HOG_HOOK_URL=${HOG_HOOK_URL:-http://localhost:3300/hoghook} [ ! -f ./share/GeoLite2-City.mmdb ] && ( curl -L "https://mmdbcdn.posthog.net/" --http1.1 | brotli --decompress --output=./share/GeoLite2-City.mmdb ) +if ! command -v mprocs &> /dev/null; then + if command -v brew &> /dev/null; then + echo "🔁 Installing mprocs via Homebrew..." + brew install mprocs + else + echo "👉 To run bin/start, install mprocs: https://github.com/pvolok/mprocs#installation" + exit 1 + fi +fi + exec mprocs --config bin/mprocs.yaml diff --git a/cypress/e2e/insights-reload-query.ts b/cypress/e2e/insights-reload-query.ts index 2f944f8993da7..55685ae309fb8 100644 --- a/cypress/e2e/insights-reload-query.ts +++ b/cypress/e2e/insights-reload-query.ts @@ -1,5 +1,3 @@ -import JSONCrush from 'jsoncrush' - describe('ReloadInsight component', () => { beforeEach(() => { // Clear local storage before each test to ensure a clean state @@ -21,8 +19,7 @@ describe('ReloadInsight component', () => { const draftQuery = window.localStorage.getItem(`draft-query-${currentTeamId}`) expect(draftQuery).to.not.be.null - const draftQueryObjUncrushed = JSONCrush.uncrush(draftQuery) - const draftQueryObj = JSON.parse(draftQueryObjUncrushed) + const draftQueryObj = JSON.parse(draftQuery) expect(draftQueryObj).to.have.property('query') diff --git a/ee/hogai/eval/conftest.py b/ee/hogai/eval/conftest.py index d0bc75348eeac..1a88ebffa2e33 100644 --- a/ee/hogai/eval/conftest.py +++ b/ee/hogai/eval/conftest.py @@ -1,28 +1,107 @@ +import functools +from collections.abc import Generator +from pathlib import Path + import pytest +from django.conf import settings +from django.test import override_settings +from langchain_core.runnables import RunnableConfig + +from ee.models import Conversation +from posthog.demo.matrix.manager import MatrixManager +from posthog.models import Organization, Project, Team, User +from posthog.tasks.demo_create_data import HedgeboxMatrix +from posthog.test.base import BaseTest + + +# Flaky is a handy tool, but it always runs setup fixtures for retries. +# This decorator will just retry without re-running setup. +def retry_test_only(max_retries=3): + def decorator(func): + @functools.wraps(func) + def wrapper(*args, **kwargs): + last_error: Exception | None = None + for attempt in range(max_retries): + try: + return func(*args, **kwargs) + except Exception as e: + last_error = e + print(f"\nRetrying test (attempt {attempt + 1}/{max_retries})...") # noqa + if last_error: + raise last_error + + return wrapper + + return decorator + + +# Apply decorators to all tests in the package. 
+def pytest_collection_modifyitems(items): + current_dir = Path(__file__).parent + for item in items: + if Path(item.fspath).is_relative_to(current_dir): + item.add_marker( + pytest.mark.skipif(not settings.IN_EVAL_TESTING, reason="Only runs for the assistant evaluation") + ) + # Apply our custom retry decorator to the test function + item.obj = retry_test_only(max_retries=3)(item.obj) + + +@pytest.fixture(scope="package") +def team(django_db_blocker) -> Generator[Team, None, None]: + with django_db_blocker.unblock(): + organization = Organization.objects.create(name=BaseTest.CONFIG_ORGANIZATION_NAME) + project = Project.objects.create(id=Team.objects.increment_id_sequence(), organization=organization) + team = Team.objects.create( + id=project.id, + project=project, + organization=organization, + test_account_filters=[ + { + "key": "email", + "value": "@posthog.com", + "operator": "not_icontains", + "type": "person", + } + ], + has_completed_onboarding_for={"product_analytics": True}, + ) + yield team + organization.delete() -from posthog.test.base import run_clickhouse_statement_in_parallel +@pytest.fixture(scope="package") +def user(team, django_db_blocker) -> Generator[User, None, None]: + with django_db_blocker.unblock(): + user = User.objects.create_and_join(team.organization, "eval@posthog.com", "password1234") + yield user + user.delete() -@pytest.fixture(scope="module", autouse=True) -def setup_kafka_tables(django_db_setup): - from posthog.clickhouse.client import sync_execute - from posthog.clickhouse.schema import ( - CREATE_KAFKA_TABLE_QUERIES, - build_query, - ) - from posthog.settings import CLICKHOUSE_CLUSTER, CLICKHOUSE_DATABASE - kafka_queries = list(map(build_query, CREATE_KAFKA_TABLE_QUERIES)) - run_clickhouse_statement_in_parallel(kafka_queries) +@pytest.mark.django_db(transaction=True) +@pytest.fixture +def runnable_config(team, user) -> Generator[RunnableConfig, None, None]: + conversation = Conversation.objects.create(team=team, user=user) + yield { + "configurable": { + "thread_id": conversation.id, + } + } + conversation.delete() - yield - kafka_tables = sync_execute( - f""" - SELECT name - FROM system.tables - WHERE database = '{CLICKHOUSE_DATABASE}' AND name LIKE 'kafka_%' - """, - ) - kafka_truncate_queries = [f"DROP TABLE {table[0]} ON CLUSTER '{CLICKHOUSE_CLUSTER}'" for table in kafka_tables] - run_clickhouse_statement_in_parallel(kafka_truncate_queries) +@pytest.fixture(scope="package", autouse=True) +def setup_test_data(django_db_setup, team, user, django_db_blocker): + with django_db_blocker.unblock(): + matrix = HedgeboxMatrix( + seed="b1ef3c66-5f43-488a-98be-6b46d92fbcef", # this seed generates all events + days_past=120, + days_future=30, + n_clusters=500, + group_type_index_offset=0, + ) + matrix_manager = MatrixManager(matrix, print_steps=True) + with override_settings(TEST=False): + # Simulation saving should occur in non-test mode, so that Kafka isn't mocked. 
Normally in tests we don't + # want to ingest via Kafka, but simulation saving is specifically designed to use that route for speed + matrix_manager.run_on_team(team, user) diff --git a/ee/hogai/eval/tests/test_eval_funnel_generator.py b/ee/hogai/eval/tests/test_eval_funnel_generator.py index 4d7876ca6f73c..5f0f29243296a 100644 --- a/ee/hogai/eval/tests/test_eval_funnel_generator.py +++ b/ee/hogai/eval/tests/test_eval_funnel_generator.py @@ -1,40 +1,46 @@ +from collections.abc import Callable from typing import cast +import pytest from langgraph.graph.state import CompiledStateGraph from ee.hogai.assistant import AssistantGraph -from ee.hogai.eval.utils import EvalBaseTest from ee.hogai.utils.types import AssistantNodeName, AssistantState from posthog.schema import AssistantFunnelsQuery, HumanMessage, VisualizationMessage -class TestEvalFunnelGenerator(EvalBaseTest): - def _call_node(self, query: str, plan: str) -> AssistantFunnelsQuery: - graph: CompiledStateGraph = ( - AssistantGraph(self.team) - .add_edge(AssistantNodeName.START, AssistantNodeName.FUNNEL_GENERATOR) - .add_funnel_generator(AssistantNodeName.END) - .compile() - ) +@pytest.fixture +def call_node(team, runnable_config) -> Callable[[str, str], AssistantFunnelsQuery]: + graph: CompiledStateGraph = ( + AssistantGraph(team) + .add_edge(AssistantNodeName.START, AssistantNodeName.FUNNEL_GENERATOR) + .add_funnel_generator(AssistantNodeName.END) + .compile() + ) + + def callable(query: str, plan: str) -> AssistantFunnelsQuery: state = graph.invoke( AssistantState(messages=[HumanMessage(content=query)], plan=plan), - self._get_config(), + runnable_config, ) return cast(VisualizationMessage, AssistantState.model_validate(state).messages[-1]).answer - def test_node_replaces_equals_with_contains(self): - query = "what is the conversion rate from a page view to sign up for users with name John?" - plan = """Sequence: - 1. $pageview - - property filter 1 - - person - - name - - equals - - John - 2. signed_up - """ - actual_output = self._call_node(query, plan).model_dump_json(exclude_none=True) - assert "exact" not in actual_output - assert "icontains" in actual_output - assert "John" not in actual_output - assert "john" in actual_output + return callable + + +def test_node_replaces_equals_with_contains(call_node): + query = "what is the conversion rate from a page view to sign up for users with name John?" + plan = """Sequence: + 1. $pageview + - property filter 1 + - person + - name + - equals + - John + 2. 
signed_up + """ + actual_output = call_node(query, plan).model_dump_json(exclude_none=True) + assert "exact" not in actual_output + assert "icontains" in actual_output + assert "John" not in actual_output + assert "john" in actual_output diff --git a/ee/hogai/eval/tests/test_eval_funnel_planner.py b/ee/hogai/eval/tests/test_eval_funnel_planner.py index 9adbd75e77c6c..c8bc25bc0b5dc 100644 --- a/ee/hogai/eval/tests/test_eval_funnel_planner.py +++ b/ee/hogai/eval/tests/test_eval_funnel_planner.py @@ -1,208 +1,224 @@ +from collections.abc import Callable + +import pytest from deepeval import assert_test from deepeval.metrics import GEval from deepeval.test_case import LLMTestCase, LLMTestCaseParams +from langchain_core.runnables.config import RunnableConfig from langgraph.graph.state import CompiledStateGraph from ee.hogai.assistant import AssistantGraph -from ee.hogai.eval.utils import EvalBaseTest from ee.hogai.utils.types import AssistantNodeName, AssistantState from posthog.schema import HumanMessage -class TestEvalFunnelPlanner(EvalBaseTest): - def _get_plan_correctness_metric(self): - return GEval( - name="Funnel Plan Correctness", - criteria="You will be given expected and actual generated plans to provide a taxonomy to answer a user's question with a funnel insight. Compare the plans to determine whether the taxonomy of the actual plan matches the expected plan. Do not apply general knowledge about funnel insights.", - evaluation_steps=[ - "A plan must define at least two series in the sequence, but it is not required to define any filters, exclusion steps, or a breakdown.", - "Compare events, properties, math types, and property values of 'expected output' and 'actual output'. Do not penalize if the actual output does not include a timeframe.", - "Check if the combination of events, properties, and property values in 'actual output' can answer the user's question according to the 'expected output'.", - # The criteria for aggregations must be more specific because there isn't a way to bypass them. - "Check if the math types in 'actual output' match those in 'expected output.' If the aggregation type is specified by a property, user, or group in 'expected output', the same property, user, or group must be used in 'actual output'.", - "If 'expected output' contains exclusion steps, check if 'actual output' contains those, and heavily penalize if the exclusion steps are not present or different.", - "If 'expected output' contains a breakdown, check if 'actual output' contains a similar breakdown, and heavily penalize if the breakdown is not present or different. Plans may only have one breakdown.", - # We don't want to see in the output unnecessary property filters. The assistant tries to use them all the time. - "Heavily penalize if the 'actual output' contains any excessive output not present in the 'expected output'. For example, the `is set` operator in filters should not be used unless the user explicitly asks for it.", - ], - evaluation_params=[ - LLMTestCaseParams.INPUT, - LLMTestCaseParams.EXPECTED_OUTPUT, - LLMTestCaseParams.ACTUAL_OUTPUT, - ], - threshold=0.7, - ) +@pytest.fixture(scope="module") +def metric(): + return GEval( + name="Funnel Plan Correctness", + criteria="You will be given expected and actual generated plans to provide a taxonomy to answer a user's question with a funnel insight. Compare the plans to determine whether the taxonomy of the actual plan matches the expected plan. 
Do not apply general knowledge about funnel insights.", + evaluation_steps=[ + "A plan must define at least two series in the sequence, but it is not required to define any filters, exclusion steps, or a breakdown.", + "Compare events, properties, math types, and property values of 'expected output' and 'actual output'. Do not penalize if the actual output does not include a timeframe.", + "Check if the combination of events, properties, and property values in 'actual output' can answer the user's question according to the 'expected output'.", + # The criteria for aggregations must be more specific because there isn't a way to bypass them. + "Check if the math types in 'actual output' match those in 'expected output.' If the aggregation type is specified by a property, user, or group in 'expected output', the same property, user, or group must be used in 'actual output'.", + "If 'expected output' contains exclusion steps, check if 'actual output' contains those, and heavily penalize if the exclusion steps are not present or different.", + "If 'expected output' contains a breakdown, check if 'actual output' contains a similar breakdown, and heavily penalize if the breakdown is not present or different. Plans may only have one breakdown.", + # We don't want to see in the output unnecessary property filters. The assistant tries to use them all the time. + "Heavily penalize if the 'actual output' contains any excessive output not present in the 'expected output'. For example, the `is set` operator in filters should not be used unless the user explicitly asks for it.", + ], + evaluation_params=[ + LLMTestCaseParams.INPUT, + LLMTestCaseParams.EXPECTED_OUTPUT, + LLMTestCaseParams.ACTUAL_OUTPUT, + ], + threshold=0.7, + ) - def _call_node(self, query): - graph: CompiledStateGraph = ( - AssistantGraph(self.team) - .add_edge(AssistantNodeName.START, AssistantNodeName.FUNNEL_PLANNER) - .add_funnel_planner(AssistantNodeName.END) - .compile() - ) + +@pytest.fixture +def call_node(team, runnable_config: RunnableConfig) -> Callable[[str], str]: + graph: CompiledStateGraph = ( + AssistantGraph(team) + .add_edge(AssistantNodeName.START, AssistantNodeName.FUNNEL_PLANNER) + .add_funnel_planner(AssistantNodeName.END) + .compile() + ) + + def callable(query: str) -> str: state = graph.invoke( AssistantState(messages=[HumanMessage(content=query)]), - self._get_config(), + runnable_config, ) return AssistantState.model_validate(state).plan or "" - def test_basic_funnel(self): - query = "what was the conversion from a page view to sign up?" - test_case = LLMTestCase( - input=query, - expected_output=""" - Sequence: - 1. $pageview - 2. signed_up - """, - actual_output=self._call_node(query), - ) - assert_test(test_case, [self._get_plan_correctness_metric()]) - - def test_outputs_at_least_two_events(self): - """ - Ambigious query. The funnel must return at least two events. - """ - query = "how many users paid a bill?" - test_case = LLMTestCase( - input=query, - expected_output=""" - Sequence: - 1. any event - 2. upgrade_plan - """, - actual_output=self._call_node(query), - ) - assert_test(test_case, [self._get_plan_correctness_metric()]) - - def test_no_excessive_property_filters(self): - query = "Show the user conversion from a sign up to a file download" - test_case = LLMTestCase( - input=query, - expected_output=""" - Sequence: - 1. signed_up - 2. 
downloaded_file - """, - actual_output=self._call_node(query), - ) - assert_test(test_case, [self._get_plan_correctness_metric()]) + return callable - def test_basic_filtering(self): - query = ( - "What was the conversion from uploading a file to downloading it from Chrome and Safari in the last 30d?" - ) - test_case = LLMTestCase( - input=query, - expected_output=""" - Sequence: - 1. uploaded_file - - property filter 1: - - entity: event - - property name: $browser - - property type: String - - operator: equals - - property value: Chrome - - property filter 2: - - entity: event - - property name: $browser - - property type: String - - operator: equals - - property value: Safari - 2. downloaded_file - - property filter 1: - - entity: event - - property name: $browser - - property type: String - - operator: equals - - property value: Chrome - - property filter 2: - - entity: event - - property name: $browser - - property type: String - - operator: equals - - property value: Safari - """, - actual_output=self._call_node(query), - ) - assert_test(test_case, [self._get_plan_correctness_metric()]) - - def test_exclusion_steps(self): - query = "What was the conversion from uploading a file to downloading it in the last 30d excluding users that deleted a file?" - test_case = LLMTestCase( - input=query, - expected_output=""" - Sequence: - 1. uploaded_file - 2. downloaded_file - - Exclusions: - - deleted_file - - start index: 0 - - end index: 1 - """, - actual_output=self._call_node(query), - ) - assert_test(test_case, [self._get_plan_correctness_metric()]) - - def test_breakdown(self): - query = "Show a conversion from uploading a file to downloading it segmented by a browser" - test_case = LLMTestCase( - input=query, - expected_output=""" - Sequence: - 1. uploaded_file - 2. downloaded_file - - Breakdown by: - - entity: event - - property name: $browser - """, - actual_output=self._call_node(query), - ) - assert_test(test_case, [self._get_plan_correctness_metric()]) - - def test_needle_in_a_haystack(self): - query = "What was the conversion from a sign up to a paying customer on the personal-pro plan?" - test_case = LLMTestCase( - input=query, - expected_output=""" - Sequence: - 1. signed_up - 2. paid_bill - - property filter 1: - - entity: event - - property name: plan - - property type: String - - operator: equals - - property value: personal/pro - """, - actual_output=self._call_node(query), - ) - assert_test(test_case, [self._get_plan_correctness_metric()]) - - def test_planner_outputs_multiple_series_from_a_single_series_question(self): - query = "What's our sign-up funnel?" - test_case = LLMTestCase( - input=query, - expected_output=""" - Sequence: - 1. $pageview - 2. signed_up - """, - actual_output=self._call_node(query), - ) - assert_test(test_case, [self._get_plan_correctness_metric()]) - - def test_funnel_does_not_include_timeframe(self): - query = "what was the conversion from a page view to sign up for event time before 2024-01-01?" - test_case = LLMTestCase( - input=query, - expected_output=""" - Sequence: - 1. $pageview - 2. signed_up - """, - actual_output=self._call_node(query), - ) - assert_test(test_case, [self._get_plan_correctness_metric()]) + +def test_basic_funnel(metric, call_node): + query = "what was the conversion from a page view to sign up?" + test_case = LLMTestCase( + input=query, + expected_output=""" + Sequence: + 1. $pageview + 2. 
signed_up + """, + actual_output=call_node(query), + ) + assert_test(test_case, [metric]) + + +def test_outputs_at_least_two_events(metric, call_node): + """ + Ambigious query. The funnel must return at least two events. + """ + query = "how many users paid a bill?" + test_case = LLMTestCase( + input=query, + expected_output=""" + Sequence: + 1. any event + 2. upgrade_plan + """, + actual_output=call_node(query), + ) + assert_test(test_case, [metric]) + + +def test_no_excessive_property_filters(metric, call_node): + query = "Show the user conversion from a sign up to a file download" + test_case = LLMTestCase( + input=query, + expected_output=""" + Sequence: + 1. signed_up + 2. downloaded_file + """, + actual_output=call_node(query), + ) + assert_test(test_case, [metric]) + + +def test_basic_filtering(metric, call_node): + query = "What was the conversion from uploading a file to downloading it from Chrome and Safari in the last 30d?" + test_case = LLMTestCase( + input=query, + expected_output=""" + Sequence: + 1. uploaded_file + - property filter 1: + - entity: event + - property name: $browser + - property type: String + - operator: equals + - property value: Chrome + - property filter 2: + - entity: event + - property name: $browser + - property type: String + - operator: equals + - property value: Safari + 2. downloaded_file + - property filter 1: + - entity: event + - property name: $browser + - property type: String + - operator: equals + - property value: Chrome + - property filter 2: + - entity: event + - property name: $browser + - property type: String + - operator: equals + - property value: Safari + """, + actual_output=call_node(query), + ) + assert_test(test_case, [metric]) + + +def test_exclusion_steps(metric, call_node): + query = "What was the conversion from uploading a file to downloading it in the last 30d excluding users that deleted a file?" + test_case = LLMTestCase( + input=query, + expected_output=""" + Sequence: + 1. uploaded_file + 2. downloaded_file + + Exclusions: + - deleted_file + - start index: 0 + - end index: 1 + """, + actual_output=call_node(query), + ) + assert_test(test_case, [metric]) + + +def test_breakdown(metric, call_node): + query = "Show a conversion from uploading a file to downloading it segmented by a browser" + test_case = LLMTestCase( + input=query, + expected_output=""" + Sequence: + 1. uploaded_file + 2. downloaded_file + + Breakdown by: + - entity: event + - property name: $browser + """, + actual_output=call_node(query), + ) + assert_test(test_case, [metric]) + + +def test_needle_in_a_haystack(metric, call_node): + query = "What was the conversion from a sign up to a paying customer on the personal-pro plan?" + test_case = LLMTestCase( + input=query, + expected_output=""" + Sequence: + 1. signed_up + 2. paid_bill + - property filter 1: + - entity: event + - property name: plan + - property type: String + - operator: equals + - property value: personal/pro + """, + actual_output=call_node(query), + ) + assert_test(test_case, [metric]) + + +def test_planner_outputs_multiple_series_from_a_single_series_question(metric, call_node): + query = "What's our sign-up funnel?" + test_case = LLMTestCase( + input=query, + expected_output=""" + Sequence: + 1. $pageview + 2. signed_up + """, + actual_output=call_node(query), + ) + assert_test(test_case, [metric]) + + +def test_funnel_does_not_include_timeframe(metric, call_node): + query = "what was the conversion from a page view to sign up for event time before 2024-01-01?" 
+ test_case = LLMTestCase( + input=query, + expected_output=""" + Sequence: + 1. $pageview + 2. signed_up + """, + actual_output=call_node(query), + ) + assert_test(test_case, [metric]) diff --git a/ee/hogai/eval/tests/test_eval_router.py b/ee/hogai/eval/tests/test_eval_router.py index c1307e9d40f00..84e5c4c809972 100644 --- a/ee/hogai/eval/tests/test_eval_router.py +++ b/ee/hogai/eval/tests/test_eval_router.py @@ -1,69 +1,80 @@ +from collections.abc import Callable from typing import cast +import pytest from langgraph.graph.state import CompiledStateGraph from ee.hogai.assistant import AssistantGraph -from ee.hogai.eval.utils import EvalBaseTest from ee.hogai.utils.types import AssistantNodeName, AssistantState from posthog.schema import HumanMessage, RouterMessage -class TestEvalRouter(EvalBaseTest): - def _call_node(self, query: str | list): - graph: CompiledStateGraph = ( - AssistantGraph(self.team) - .add_start() - .add_router(path_map={"trends": AssistantNodeName.END, "funnel": AssistantNodeName.END}) - .compile() - ) +@pytest.fixture +def call_node(team, runnable_config) -> Callable[[str | list], str]: + graph: CompiledStateGraph = ( + AssistantGraph(team) + .add_start() + .add_router(path_map={"trends": AssistantNodeName.END, "funnel": AssistantNodeName.END}) + .compile() + ) + + def callable(query: str | list) -> str: messages = [HumanMessage(content=query)] if isinstance(query, str) else query state = graph.invoke( AssistantState(messages=messages), - self._get_config(), + runnable_config, ) return cast(RouterMessage, AssistantState.model_validate(state).messages[-1]).content - def test_outputs_basic_trends_insight(self): - query = "Show the $pageview trend" - res = self._call_node(query) - self.assertEqual(res, "trends") - - def test_outputs_basic_funnel_insight(self): - query = "What is the conversion rate of users who uploaded a file to users who paid for a plan?" - res = self._call_node(query) - self.assertEqual(res, "funnel") - - def test_converts_trends_to_funnel(self): - conversation = [ - HumanMessage(content="Show trends of $pageview and $identify"), - RouterMessage(content="trends"), - HumanMessage(content="Convert this insight to a funnel"), - ] - res = self._call_node(conversation[:1]) - self.assertEqual(res, "trends") - res = self._call_node(conversation) - self.assertEqual(res, "funnel") - - def test_converts_funnel_to_trends(self): - conversation = [ - HumanMessage(content="What is the conversion from a page view to a sign up?"), - RouterMessage(content="funnel"), - HumanMessage(content="Convert this insight to a trends"), - ] - res = self._call_node(conversation[:1]) - self.assertEqual(res, "funnel") - res = self._call_node(conversation) - self.assertEqual(res, "trends") - - def test_outputs_single_trends_insight(self): - """ - Must display a trends insight because it's not possible to build a funnel with a single series. - """ - query = "how many users upgraded their plan to personal pro?" - res = self._call_node(query) - self.assertEqual(res, "trends") - - def test_classifies_funnel_with_single_series(self): - query = "What's our sign-up funnel?" - res = self._call_node(query) - self.assertEqual(res, "funnel") + return callable + + +def test_outputs_basic_trends_insight(call_node): + query = "Show the $pageview trend" + res = call_node(query) + assert res == "trends" + + +def test_outputs_basic_funnel_insight(call_node): + query = "What is the conversion rate of users who uploaded a file to users who paid for a plan?" 
+ res = call_node(query) + assert res == "funnel" + + +def test_converts_trends_to_funnel(call_node): + conversation = [ + HumanMessage(content="Show trends of $pageview and $identify"), + RouterMessage(content="trends"), + HumanMessage(content="Convert this insight to a funnel"), + ] + res = call_node(conversation[:1]) + assert res == "trends" + res = call_node(conversation) + assert res == "funnel" + + +def test_converts_funnel_to_trends(call_node): + conversation = [ + HumanMessage(content="What is the conversion from a page view to a sign up?"), + RouterMessage(content="funnel"), + HumanMessage(content="Convert this insight to a trends"), + ] + res = call_node(conversation[:1]) + assert res == "funnel" + res = call_node(conversation) + assert res == "trends" + + +def test_outputs_single_trends_insight(call_node): + """ + Must display a trends insight because it's not possible to build a funnel with a single series. + """ + query = "how many users upgraded their plan to personal pro?" + res = call_node(query) + assert res == "trends" + + +def test_classifies_funnel_with_single_series(call_node): + query = "What's our sign-up funnel?" + res = call_node(query) + assert res == "funnel" diff --git a/ee/hogai/eval/tests/test_eval_trends_generator.py b/ee/hogai/eval/tests/test_eval_trends_generator.py index 496bbf0100b51..c8491957c868f 100644 --- a/ee/hogai/eval/tests/test_eval_trends_generator.py +++ b/ee/hogai/eval/tests/test_eval_trends_generator.py @@ -1,58 +1,65 @@ +from collections.abc import Callable from typing import cast +import pytest from langgraph.graph.state import CompiledStateGraph from ee.hogai.assistant import AssistantGraph -from ee.hogai.eval.utils import EvalBaseTest from ee.hogai.utils.types import AssistantNodeName, AssistantState from posthog.schema import AssistantTrendsQuery, HumanMessage, VisualizationMessage -class TestEvalTrendsGenerator(EvalBaseTest): - def _call_node(self, query: str, plan: str) -> AssistantTrendsQuery: - graph: CompiledStateGraph = ( - AssistantGraph(self.team) - .add_edge(AssistantNodeName.START, AssistantNodeName.TRENDS_GENERATOR) - .add_trends_generator(AssistantNodeName.END) - .compile() - ) +@pytest.fixture +def call_node(team, runnable_config) -> Callable[[str, str], AssistantTrendsQuery]: + graph: CompiledStateGraph = ( + AssistantGraph(team) + .add_edge(AssistantNodeName.START, AssistantNodeName.TRENDS_GENERATOR) + .add_trends_generator(AssistantNodeName.END) + .compile() + ) + + def callable(query: str, plan: str) -> AssistantTrendsQuery: state = graph.invoke( AssistantState(messages=[HumanMessage(content=query)], plan=plan), - self._get_config(), + runnable_config, ) return cast(VisualizationMessage, AssistantState.model_validate(state).messages[-1]).answer - def test_node_replaces_equals_with_contains(self): - query = "what is pageview trend for users with name John?" - plan = """Events: - - $pageview - - math operation: total count - - property filter 1 - - person - - name - - equals - - John - """ - actual_output = self._call_node(query, plan).model_dump_json(exclude_none=True) - assert "exact" not in actual_output - assert "icontains" in actual_output - assert "John" not in actual_output - assert "john" in actual_output - - def test_node_leans_towards_line_graph(self): - query = "How often do users download files?" 
- # We ideally want to consider both total count of downloads per period, as well as how often a median user downloads - plan = """Events: - - downloaded_file - - math operation: total count - - downloaded_file - - math operation: median count per user - """ - actual_output = self._call_node(query, plan) - assert actual_output.trendsFilter.display == "ActionsLineGraph" - assert actual_output.series[0].kind == "EventsNode" - assert actual_output.series[0].event == "downloaded_file" - assert actual_output.series[0].math == "total" - assert actual_output.series[1].kind == "EventsNode" - assert actual_output.series[1].event == "downloaded_file" - assert actual_output.series[1].math == "median_count_per_actor" + return callable + + +def test_node_replaces_equals_with_contains(call_node): + query = "what is pageview trend for users with name John?" + plan = """Events: + - $pageview + - math operation: total count + - property filter 1 + - person + - name + - equals + - John + """ + actual_output = call_node(query, plan).model_dump_json(exclude_none=True) + assert "exact" not in actual_output + assert "icontains" in actual_output + assert "John" not in actual_output + assert "john" in actual_output + + +def test_node_leans_towards_line_graph(call_node): + query = "How often do users download files?" + # We ideally want to consider both total count of downloads per period, as well as how often a median user downloads + plan = """Events: + - downloaded_file + - math operation: total count + - downloaded_file + - math operation: median count per user + """ + actual_output = call_node(query, plan) + assert actual_output.trendsFilter.display == "ActionsLineGraph" + assert actual_output.series[0].kind == "EventsNode" + assert actual_output.series[0].event == "downloaded_file" + assert actual_output.series[0].math == "total" + assert actual_output.series[1].kind == "EventsNode" + assert actual_output.series[1].event == "downloaded_file" + assert actual_output.series[1].math == "median_count_per_actor" diff --git a/ee/hogai/eval/tests/test_eval_trends_planner.py b/ee/hogai/eval/tests/test_eval_trends_planner.py index d4fbff456a91c..4d4ea4c41dfbf 100644 --- a/ee/hogai/eval/tests/test_eval_trends_planner.py +++ b/ee/hogai/eval/tests/test_eval_trends_planner.py @@ -1,179 +1,196 @@ +from collections.abc import Callable + +import pytest from deepeval import assert_test from deepeval.metrics import GEval from deepeval.test_case import LLMTestCase, LLMTestCaseParams +from langchain_core.runnables.config import RunnableConfig from langgraph.graph.state import CompiledStateGraph from ee.hogai.assistant import AssistantGraph -from ee.hogai.eval.utils import EvalBaseTest from ee.hogai.utils.types import AssistantNodeName, AssistantState from posthog.schema import HumanMessage -class TestEvalTrendsPlanner(EvalBaseTest): - def _get_plan_correctness_metric(self): - return GEval( - name="Trends Plan Correctness", - criteria="You will be given expected and actual generated plans to provide a taxonomy to answer a user's question with a trends insight. Compare the plans to determine whether the taxonomy of the actual plan matches the expected plan. Do not apply general knowledge about trends insights.", - evaluation_steps=[ - "A plan must define at least one event and a math type, but it is not required to define any filters, breakdowns, or formulas.", - "Compare events, properties, math types, and property values of 'expected output' and 'actual output'. 
Do not penalize if the actual output does not include a timeframe.", - "Check if the combination of events, properties, and property values in 'actual output' can answer the user's question according to the 'expected output'.", - # The criteria for aggregations must be more specific because there isn't a way to bypass them. - "Check if the math types in 'actual output' match those in 'expected output'. Math types sometimes are interchangeable, so use your judgement. If the aggregation type is specified by a property, user, or group in 'expected output', the same property, user, or group must be used in 'actual output'.", - "If 'expected output' contains a breakdown, check if 'actual output' contains a similar breakdown, and heavily penalize if the breakdown is not present or different.", - "If 'expected output' contains a formula, check if 'actual output' contains a similar formula, and heavily penalize if the formula is not present or different.", - # We don't want to see in the output unnecessary property filters. The assistant tries to use them all the time. - "Heavily penalize if the 'actual output' contains any excessive output not present in the 'expected output'. For example, the `is set` operator in filters should not be used unless the user explicitly asks for it.", - ], - evaluation_params=[ - LLMTestCaseParams.INPUT, - LLMTestCaseParams.EXPECTED_OUTPUT, - LLMTestCaseParams.ACTUAL_OUTPUT, - ], - threshold=0.7, - ) +@pytest.fixture(scope="module") +def metric(): + return GEval( + name="Trends Plan Correctness", + criteria="You will be given expected and actual generated plans to provide a taxonomy to answer a user's question with a trends insight. Compare the plans to determine whether the taxonomy of the actual plan matches the expected plan. Do not apply general knowledge about trends insights.", + evaluation_steps=[ + "A plan must define at least one event and a math type, but it is not required to define any filters, breakdowns, or formulas.", + "Compare events, properties, math types, and property values of 'expected output' and 'actual output'. Do not penalize if the actual output does not include a timeframe.", + "Check if the combination of events, properties, and property values in 'actual output' can answer the user's question according to the 'expected output'.", + # The criteria for aggregations must be more specific because there isn't a way to bypass them. + "Check if the math types in 'actual output' match those in 'expected output'. Math types sometimes are interchangeable, so use your judgement. If the aggregation type is specified by a property, user, or group in 'expected output', the same property, user, or group must be used in 'actual output'.", + "If 'expected output' contains a breakdown, check if 'actual output' contains a similar breakdown, and heavily penalize if the breakdown is not present or different.", + "If 'expected output' contains a formula, check if 'actual output' contains a similar formula, and heavily penalize if the formula is not present or different.", + # We don't want to see in the output unnecessary property filters. The assistant tries to use them all the time. + "Heavily penalize if the 'actual output' contains any excessive output not present in the 'expected output'. 
For example, the `is set` operator in filters should not be used unless the user explicitly asks for it.", + ], + evaluation_params=[ + LLMTestCaseParams.INPUT, + LLMTestCaseParams.EXPECTED_OUTPUT, + LLMTestCaseParams.ACTUAL_OUTPUT, + ], + threshold=0.7, + ) - def _call_node(self, query): - graph: CompiledStateGraph = ( - AssistantGraph(self.team) - .add_edge(AssistantNodeName.START, AssistantNodeName.TRENDS_PLANNER) - .add_trends_planner(AssistantNodeName.END) - .compile() - ) + +@pytest.fixture +def call_node(team, runnable_config: RunnableConfig) -> Callable[[str], str]: + graph: CompiledStateGraph = ( + AssistantGraph(team) + .add_edge(AssistantNodeName.START, AssistantNodeName.TRENDS_PLANNER) + .add_trends_planner(AssistantNodeName.END) + .compile() + ) + + def callable(query: str) -> str: state = graph.invoke( AssistantState(messages=[HumanMessage(content=query)]), - self._get_config(), + runnable_config, ) return AssistantState.model_validate(state).plan or "" - def test_no_excessive_property_filters(self): - query = "Show the $pageview trend" - test_case = LLMTestCase( - input=query, - expected_output=""" - Events: - - $pageview - - math operation: total count - """, - actual_output=self._call_node(query), - ) - assert_test(test_case, [self._get_plan_correctness_metric()]) - - def test_no_excessive_property_filters_for_a_defined_math_type(self): - query = "What is the MAU?" - test_case = LLMTestCase( - input=query, - expected_output=""" - Events: - - $pageview - - math operation: unique users - """, - actual_output=self._call_node(query), - ) - assert_test(test_case, [self._get_plan_correctness_metric()]) - - def test_basic_filtering(self): - query = "can you compare how many Chrome vs Safari users uploaded a file in the last 30d?" - test_case = LLMTestCase( - input=query, - expected_output=""" - Events: - - uploaded_file - - math operation: total count - - property filter 1: - - entity: event - - property name: $browser - - property type: String - - operator: equals - - property value: Chrome - - property filter 2: - - entity: event - - property name: $browser - - property type: String - - operator: equals - - property value: Safari - - Breakdown by: - - breakdown 1: + return callable + + +def test_no_excessive_property_filters(metric, call_node): + query = "Show the $pageview trend" + test_case = LLMTestCase( + input=query, + expected_output=""" + Events: + - $pageview + - math operation: total count + """, + actual_output=call_node(query), + ) + assert_test(test_case, [metric]) + + +def test_no_excessive_property_filters_for_a_defined_math_type(metric, call_node): + query = "What is the MAU?" + test_case = LLMTestCase( + input=query, + expected_output=""" + Events: + - $pageview + - math operation: unique users + """, + actual_output=call_node(query), + ) + assert_test(test_case, [metric]) + + +def test_basic_filtering(metric, call_node): + query = "can you compare how many Chrome vs Safari users uploaded a file in the last 30d?" 
+ test_case = LLMTestCase( + input=query, + expected_output=""" + Events: + - uploaded_file + - math operation: total count + - property filter 1: - entity: event - property name: $browser - """, - actual_output=self._call_node(query), - ) - assert_test(test_case, [self._get_plan_correctness_metric()]) - - def test_formula_mode(self): - query = "i want to see a ratio of identify divided by page views" - test_case = LLMTestCase( - input=query, - expected_output=""" - Events: - - $identify - - math operation: total count - - $pageview - - math operation: total count - - Formula: - `A/B`, where `A` is the total count of `$identify` and `B` is the total count of `$pageview` - """, - actual_output=self._call_node(query), - ) - assert_test(test_case, [self._get_plan_correctness_metric()]) - - def test_math_type_by_a_property(self): - query = "what is the average session duration?" - test_case = LLMTestCase( - input=query, - expected_output=""" - Events: - - All Events - - math operation: average by `$session_duration` - """, - actual_output=self._call_node(query), - ) - assert_test(test_case, [self._get_plan_correctness_metric()]) - - def test_math_type_by_a_user(self): - query = "What is the median page view count for a user?" - test_case = LLMTestCase( - input=query, - expected_output=""" - Events: - - $pageview - - math operation: median by users - """, - actual_output=self._call_node(query), - ) - assert_test(test_case, [self._get_plan_correctness_metric()]) - - def test_needle_in_a_haystack(self): - query = "How frequently do people pay for a personal-pro plan?" - test_case = LLMTestCase( - input=query, - expected_output=""" - Events: - - paid_bill - - math operation: total count - - property filter 1: - - entity: event - - property name: plan - - property type: String - - operator: contains - - property value: personal/pro - """, - actual_output=self._call_node(query), - ) - assert_test(test_case, [self._get_plan_correctness_metric()]) - - def test_funnel_does_not_include_timeframe(self): - query = "what is the pageview trend for event time before 2024-01-01?" - test_case = LLMTestCase( - input=query, - expected_output=""" - Events: - - $pageview - - math operation: total count - """, - actual_output=self._call_node(query), - ) - assert_test(test_case, [self._get_plan_correctness_metric()]) + - property type: String + - operator: equals + - property value: Chrome + - property filter 2: + - entity: event + - property name: $browser + - property type: String + - operator: equals + - property value: Safari + + Breakdown by: + - breakdown 1: + - entity: event + - property name: $browser + """, + actual_output=call_node(query), + ) + assert_test(test_case, [metric]) + + +def test_formula_mode(metric, call_node): + query = "i want to see a ratio of identify divided by page views" + test_case = LLMTestCase( + input=query, + expected_output=""" + Events: + - $identify + - math operation: total count + - $pageview + - math operation: total count + + Formula: + `A/B`, where `A` is the total count of `$identify` and `B` is the total count of `$pageview` + """, + actual_output=call_node(query), + ) + assert_test(test_case, [metric]) + + +def test_math_type_by_a_property(metric, call_node): + query = "what is the average session duration?" 
+ test_case = LLMTestCase( + input=query, + expected_output=""" + Events: + - All Events + - math operation: average by `$session_duration` + """, + actual_output=call_node(query), + ) + assert_test(test_case, [metric]) + + +def test_math_type_by_a_user(metric, call_node): + query = "What is the median page view count for a user?" + test_case = LLMTestCase( + input=query, + expected_output=""" + Events: + - $pageview + - math operation: median by users + """, + actual_output=call_node(query), + ) + assert_test(test_case, [metric]) + + +def test_needle_in_a_haystack(metric, call_node): + query = "How frequently do people pay for a personal-pro plan?" + test_case = LLMTestCase( + input=query, + expected_output=""" + Events: + - paid_bill + - math operation: total count + - property filter 1: + - entity: event + - property name: plan + - property type: String + - operator: contains + - property value: personal/pro + """, + actual_output=call_node(query), + ) + assert_test(test_case, [metric]) + + +def test_trends_does_not_include_timeframe(metric, call_node): + query = "what is the pageview trend for event time before 2024-01-01?" + test_case = LLMTestCase( + input=query, + expected_output=""" + Events: + - $pageview + - math operation: total count + """, + actual_output=call_node(query), + ) + assert_test(test_case, [metric]) diff --git a/ee/hogai/eval/utils.py b/ee/hogai/eval/utils.py deleted file mode 100644 index 6e03c4cfafa9f..0000000000000 --- a/ee/hogai/eval/utils.py +++ /dev/null @@ -1,40 +0,0 @@ -import os - -import pytest -from django.test import override_settings -from flaky import flaky -from langchain_core.runnables import RunnableConfig - -from ee.models.assistant import Conversation -from posthog.demo.matrix.manager import MatrixManager -from posthog.tasks.demo_create_data import HedgeboxMatrix -from posthog.test.base import NonAtomicBaseTest - - -@pytest.mark.skipif(os.environ.get("DEEPEVAL") != "YES", reason="Only runs for the assistant evaluation") -@flaky(max_runs=3, min_passes=1) -class EvalBaseTest(NonAtomicBaseTest): - def _get_config(self) -> RunnableConfig: - conversation = Conversation.objects.create(team=self.team, user=self.user) - return { - "configurable": { - "thread_id": conversation.id, - } - } - - @classmethod - def setUpTestData(cls): - super().setUpTestData() - matrix = HedgeboxMatrix( - seed="b1ef3c66-5f43-488a-98be-6b46d92fbcef", # this seed generates all events - days_past=120, - days_future=30, - n_clusters=500, - group_type_index_offset=0, - ) - matrix_manager = MatrixManager(matrix, print_steps=True) - existing_user = cls.team.organization.members.first() - with override_settings(TEST=False): - # Simulation saving should occur in non-test mode, so that Kafka isn't mocked. 
Normally in tests we don't - # want to ingest via Kafka, but simulation saving is specifically designed to use that route for speed - matrix_manager.run_on_team(cls.team, existing_user) diff --git a/frontend/__snapshots__/components-command-bar--search--dark.png b/frontend/__snapshots__/components-command-bar--search--dark.png index 599e3c20f7aea..0692091cc3e32 100644 Binary files a/frontend/__snapshots__/components-command-bar--search--dark.png and b/frontend/__snapshots__/components-command-bar--search--dark.png differ diff --git a/frontend/__snapshots__/components-command-bar--search--light.png b/frontend/__snapshots__/components-command-bar--search--light.png index 75bd57cddaff1..8231897e6233f 100644 Binary files a/frontend/__snapshots__/components-command-bar--search--light.png and b/frontend/__snapshots__/components-command-bar--search--light.png differ diff --git a/frontend/__snapshots__/lemon-ui-lemon-input--search--dark.png b/frontend/__snapshots__/lemon-ui-lemon-input--search--dark.png index 2618d982587b5..8dd6e08c842af 100644 Binary files a/frontend/__snapshots__/lemon-ui-lemon-input--search--dark.png and b/frontend/__snapshots__/lemon-ui-lemon-input--search--dark.png differ diff --git a/frontend/__snapshots__/lemon-ui-lemon-input--search--light.png b/frontend/__snapshots__/lemon-ui-lemon-input--search--light.png index 171b0fb4ae3bc..42f6a48d5f2a9 100644 Binary files a/frontend/__snapshots__/lemon-ui-lemon-input--search--light.png and b/frontend/__snapshots__/lemon-ui-lemon-input--search--light.png differ diff --git a/frontend/__snapshots__/replay-player-success--second-recording-in-list--dark.png b/frontend/__snapshots__/replay-player-success--second-recording-in-list--dark.png index 93ad7064fd68b..deadbaa964285 100644 Binary files a/frontend/__snapshots__/replay-player-success--second-recording-in-list--dark.png and b/frontend/__snapshots__/replay-player-success--second-recording-in-list--dark.png differ diff --git a/frontend/__snapshots__/replay-player-success--second-recording-in-list--light.png b/frontend/__snapshots__/replay-player-success--second-recording-in-list--light.png index 537566ccb05ab..f25c112bb904b 100644 Binary files a/frontend/__snapshots__/replay-player-success--second-recording-in-list--light.png and b/frontend/__snapshots__/replay-player-success--second-recording-in-list--light.png differ diff --git a/frontend/__snapshots__/scenes-app-insights--funnel-top-to-bottom-breakdown-edit--light.png b/frontend/__snapshots__/scenes-app-insights--funnel-top-to-bottom-breakdown-edit--light.png index 02bc921745ecd..00916265cd38b 100644 Binary files a/frontend/__snapshots__/scenes-app-insights--funnel-top-to-bottom-breakdown-edit--light.png and b/frontend/__snapshots__/scenes-app-insights--funnel-top-to-bottom-breakdown-edit--light.png differ diff --git a/frontend/__snapshots__/scenes-app-insights--funnel-top-to-bottom-edit--light.png b/frontend/__snapshots__/scenes-app-insights--funnel-top-to-bottom-edit--light.png index 3a86dc325987a..8de14b30417ac 100644 Binary files a/frontend/__snapshots__/scenes-app-insights--funnel-top-to-bottom-edit--light.png and b/frontend/__snapshots__/scenes-app-insights--funnel-top-to-bottom-edit--light.png differ diff --git a/frontend/src/lib/components/CommandBar/CommandBar.tsx b/frontend/src/lib/components/CommandBar/CommandBar.tsx index fe4b9c2e4555e..5d25486df6862 100644 --- a/frontend/src/lib/components/CommandBar/CommandBar.tsx +++ b/frontend/src/lib/components/CommandBar/CommandBar.tsx @@ -26,7 +26,7 @@ const CommandBarOverlay = 
forwardRef(fun data-attr="command-bar" className={`w-full ${ barStatus === BarStatus.SHOW_SEARCH && 'h-full' - } bg-bg-3000 rounded overflow-hidden border border-border-bold`} + } w-full bg-bg-3000 rounded overflow-hidden border border-border-bold`} ref={ref} > {children} diff --git a/frontend/src/lib/components/CommandBar/SearchBar.tsx b/frontend/src/lib/components/CommandBar/SearchBar.tsx index 3eba8e50e2aad..d9b27c7e806db 100644 --- a/frontend/src/lib/components/CommandBar/SearchBar.tsx +++ b/frontend/src/lib/components/CommandBar/SearchBar.tsx @@ -13,9 +13,10 @@ export const SearchBar = (): JSX.Element => { const inputRef = useRef(null) return ( -
+
-
+ {/* 49px = height of search input, 40rem = height of search results */}
+
diff --git a/frontend/src/lib/components/CommandBar/SearchResult.tsx b/frontend/src/lib/components/CommandBar/SearchResult.tsx index 354470759518e..6bb04fce0693c 100644 --- a/frontend/src/lib/components/CommandBar/SearchResult.tsx +++ b/frontend/src/lib/components/CommandBar/SearchResult.tsx @@ -1,6 +1,8 @@ import { LemonSkeleton } from '@posthog/lemon-ui' import clsx from 'clsx' import { useActions, useValues } from 'kea' +import { TAILWIND_BREAKPOINTS } from 'lib/constants' +import { useWindowSize } from 'lib/hooks/useWindowSize' import { capitalizeFirstLetter } from 'lib/utils' import { useLayoutEffect, useRef } from 'react' import { useSummarizeInsight } from 'scenes/insights/summarizeInsight' @@ -22,10 +24,12 @@ type SearchResultProps = { export const SearchResult = ({ result, resultIndex, focused }: SearchResultProps): JSX.Element => { const { aggregationLabel } = useValues(searchBarLogic) - const { openResult } = useActions(searchBarLogic) + const { setActiveResultIndex, openResult } = useActions(searchBarLogic) const ref = useRef(null) + const { width } = useWindowSize() + useLayoutEffect(() => { if (focused) { // :HACKY: This uses the non-standard scrollIntoViewIfNeeded api @@ -40,27 +44,33 @@ export const SearchResult = ({ result, resultIndex, focused }: SearchResultProps }, [focused]) return ( -
{
- openResult(resultIndex)
- }}
- ref={ref}
- >
-
- - {result.type !== 'group' - ? tabToName[result.type] - : `${capitalizeFirstLetter(aggregationLabel(result.extra_fields.group_type_index).plural)}`} - - - - + <> +
{
+ if (width && width <= TAILWIND_BREAKPOINTS.md) {
+ openResult(resultIndex)
+ } else {
+ setActiveResultIndex(resultIndex)
+ }
+ }}
+ ref={ref}
+ >
+
+ + {result.type !== 'group' + ? tabToName[result.type] + : `${capitalizeFirstLetter(aggregationLabel(result.extra_fields.group_type_index).plural)}`} + + + + +
-
+ ) } diff --git a/frontend/src/lib/components/CommandBar/SearchResultPreview.tsx b/frontend/src/lib/components/CommandBar/SearchResultPreview.tsx index 498150ebada3a..af4e70df0a08a 100644 --- a/frontend/src/lib/components/CommandBar/SearchResultPreview.tsx +++ b/frontend/src/lib/components/CommandBar/SearchResultPreview.tsx @@ -1,11 +1,15 @@ -import { useValues } from 'kea' +import { useActions, useValues } from 'kea' import { ResultDescription, ResultName } from 'lib/components/CommandBar/SearchResult' +import { LemonButton } from 'lib/lemon-ui/LemonButton' + +import { KeyboardShortcut } from '~/layout/navigation-3000/components/KeyboardShortcut' import { tabToName } from './constants' import { searchBarLogic, urlForResult } from './searchBarLogic' export const SearchResultPreview = (): JSX.Element | null => { const { activeResultIndex, combinedSearchResults } = useValues(searchBarLogic) + const { openResult } = useActions(searchBarLogic) if (!combinedSearchResults || combinedSearchResults.length === 0) { return null @@ -14,17 +18,41 @@ export const SearchResultPreview = (): JSX.Element | null => { const result = combinedSearchResults[activeResultIndex] return ( -
-
{tabToName[result.type]}
-
- -
- - {location.host} - {urlForResult(result)} - -
- +
+
+
+
{tabToName[result.type as keyof typeof tabToName]}
+
+ +
+ + {location.host} + {urlForResult(result)} + +
+ +
+
+
+ { + openResult(activeResultIndex) + }} + tooltip={ + <> + Open + + } + aria-label="Open search result" + > + Open + +
+ Open +
+
) diff --git a/frontend/src/lib/components/CommandBar/SearchResults.tsx b/frontend/src/lib/components/CommandBar/SearchResults.tsx index 2dde6f78cbead..3e6abbc35a27d 100644 --- a/frontend/src/lib/components/CommandBar/SearchResults.tsx +++ b/frontend/src/lib/components/CommandBar/SearchResults.tsx @@ -1,6 +1,4 @@ -import clsx from 'clsx' import { useValues } from 'kea' -import { useResizeBreakpoints } from 'lib/hooks/useResizeObserver' import { DetectiveHog } from '../hedgehogs' import { searchBarLogic } from './searchBarLogic' @@ -10,27 +8,17 @@ import { SearchResultPreview } from './SearchResultPreview' export const SearchResults = (): JSX.Element => { const { combinedSearchResults, combinedSearchLoading, activeResultIndex } = useValues(searchBarLogic) - const { ref, size } = useResizeBreakpoints({ - 0: 'small', - 550: 'normal', - }) - return ( -
+ <>
 {!combinedSearchLoading && combinedSearchResults?.length === 0 ? (
-
+

No results

This doesn't happen often, but we're stumped!

) : ( -
-
+
+
{combinedSearchLoading && ( <>
@@ -48,13 +36,11 @@ export const SearchResults = (): JSX.Element => {
 />
 ))}
- {size !== 'small' ? ( -
- -
- ) : null} +
+ +
)} -
+ ) }
diff --git a/frontend/src/lib/components/CommandBar/SearchTabs.tsx b/frontend/src/lib/components/CommandBar/SearchTabs.tsx
index 37ff41ff30a53..aa3ddb67e8496 100644
--- a/frontend/src/lib/components/CommandBar/SearchTabs.tsx
+++ b/frontend/src/lib/components/CommandBar/SearchTabs.tsx
@@ -12,11 +12,13 @@ type SearchTabsProps = {
 export const SearchTabs = ({ inputRef }: SearchTabsProps): JSX.Element | null => {
     const { tabsGrouped } = useValues(searchBarLogic)
     return (
-
+
{Object.entries(tabsGrouped).map(([group, tabs]) => (
{group !== 'all' && ( - {groupToName[group]} + + {groupToName[group as keyof typeof groupToName]} + )} {tabs.map((tab) => ( diff --git a/frontend/src/lib/components/CommandBar/index.scss b/frontend/src/lib/components/CommandBar/index.scss index 02aa24cb7a11d..3150d46ed5ada 100644 --- a/frontend/src/lib/components/CommandBar/index.scss +++ b/frontend/src/lib/components/CommandBar/index.scss @@ -16,11 +16,6 @@ } } -.SearchResults { - // offset container height by input - height: calc(100% - 2.875rem); -} - .CommandBar__overlay { position: fixed; top: 0; diff --git a/frontend/src/lib/components/CommandBar/searchBarLogic.ts b/frontend/src/lib/components/CommandBar/searchBarLogic.ts index b3576ac482d10..1b96c64c34b81 100644 --- a/frontend/src/lib/components/CommandBar/searchBarLogic.ts +++ b/frontend/src/lib/components/CommandBar/searchBarLogic.ts @@ -61,6 +61,7 @@ export const searchBarLogic = kea([ onArrowUp: (activeIndex: number, maxIndex: number) => ({ activeIndex, maxIndex }), onArrowDown: (activeIndex: number, maxIndex: number) => ({ activeIndex, maxIndex }), openResult: (index: number) => ({ index }), + setActiveResultIndex: (index: number) => ({ index }), }), loaders(({ values, actions }) => ({ rawSearchResponse: [ @@ -208,6 +209,7 @@ export const searchBarLogic = kea([ openResult: () => 0, onArrowUp: (_, { activeIndex, maxIndex }) => (activeIndex > 0 ? activeIndex - 1 : maxIndex), onArrowDown: (_, { activeIndex, maxIndex }) => (activeIndex < maxIndex ? activeIndex + 1 : 0), + setActiveResultIndex: (_, { index }) => index, }, ], activeTab: [ diff --git a/frontend/src/lib/constants.tsx b/frontend/src/lib/constants.tsx index d41a232518b18..ae93860525488 100644 --- a/frontend/src/lib/constants.tsx +++ b/frontend/src/lib/constants.tsx @@ -314,3 +314,11 @@ export const SESSION_REPLAY_MINIMUM_DURATION_OPTIONS: LemonSelectOptions { _debugTag?: string _runFn: () => Promise @@ -8,7 +11,7 @@ class ConcurrencyControllerItem { constructor( concurrencyController: ConcurrencyController, userFn: () => Promise, - abortController: AbortController, + abortController: AbortController | undefined, priority: number = Infinity, debugTag: string | undefined ) { @@ -17,7 +20,7 @@ class ConcurrencyControllerItem { const { promise, resolve, reject } = promiseResolveReject() this._promise = promise this._runFn = async () => { - if (abortController.signal.aborted) { + if (abortController?.signal.aborted) { reject(new FakeAbortError(abortController.signal.reason || 'AbortError')) return } @@ -32,7 +35,7 @@ class ConcurrencyControllerItem { reject(error) } } - abortController.signal.addEventListener('abort', () => { + abortController?.signal.addEventListener('abort', () => { reject(new FakeAbortError(abortController.signal.reason || 'AbortError')) }) promise @@ -76,7 +79,7 @@ export class ConcurrencyController { }: { fn: () => Promise priority?: number - abortController: AbortController + abortController?: AbortController debugTag?: string }): Promise => { const item = new ConcurrencyControllerItem(this, fn, abortController, priority, debugTag) diff --git a/frontend/src/queries/schema.json b/frontend/src/queries/schema.json index 1d2a4d94012aa..005fb78c94497 100644 --- a/frontend/src/queries/schema.json +++ b/frontend/src/queries/schema.json @@ -2132,6 +2132,9 @@ "significant": { "type": "boolean" }, + "stats_version": { + "type": "integer" + }, "timezone": { "type": "string" }, @@ -4511,6 +4514,9 @@ "significant": { "type": "boolean" }, + "stats_version": { + "type": "integer" + }, "variants": { "items": { "$ref": 
"#/definitions/ExperimentVariantFunnelsBaseStats" @@ -5991,6 +5997,9 @@ }, "response": { "$ref": "#/definitions/ExperimentFunnelsQueryResponse" + }, + "stats_version": { + "type": "integer" } }, "required": ["funnels_query", "kind"], @@ -6041,6 +6050,9 @@ "significant": { "type": "boolean" }, + "stats_version": { + "type": "integer" + }, "variants": { "items": { "$ref": "#/definitions/ExperimentVariantFunnelsBaseStats" @@ -9787,6 +9799,9 @@ "significant": { "type": "boolean" }, + "stats_version": { + "type": "integer" + }, "variants": { "items": { "$ref": "#/definitions/ExperimentVariantFunnelsBaseStats" @@ -10418,6 +10433,9 @@ "significant": { "type": "boolean" }, + "stats_version": { + "type": "integer" + }, "variants": { "items": { "$ref": "#/definitions/ExperimentVariantFunnelsBaseStats" diff --git a/frontend/src/queries/schema.ts b/frontend/src/queries/schema.ts index 5360ae06d99f4..9e4252c989b38 100644 --- a/frontend/src/queries/schema.ts +++ b/frontend/src/queries/schema.ts @@ -2003,6 +2003,7 @@ export interface ExperimentFunnelsQueryResponse { significance_code: ExperimentSignificanceCode expected_loss: number credible_intervals: Record + stats_version?: integer } export type CachedExperimentFunnelsQueryResponse = CachedQueryResponse @@ -2012,6 +2013,7 @@ export interface ExperimentFunnelsQuery extends DataNode { diff --git a/frontend/src/scenes/experiments/experimentLogic.tsx b/frontend/src/scenes/experiments/experimentLogic.tsx index 698c4182dc84d..6094de7721879 100644 --- a/frontend/src/scenes/experiments/experimentLogic.tsx +++ b/frontend/src/scenes/experiments/experimentLogic.tsx @@ -861,10 +861,7 @@ export const experimentLogic = kea([ ...values.experiment.metrics[0], experiment_id: values.experimentId, } - if ( - queryWithExperimentId.kind === NodeKind.ExperimentTrendsQuery && - values.featureFlags[FEATURE_FLAGS.EXPERIMENT_STATS_V2] - ) { + if (values.featureFlags[FEATURE_FLAGS.EXPERIMENT_STATS_V2]) { queryWithExperimentId.stats_version = 2 } diff --git a/frontend/src/scenes/insights/insightDataLogic.tsx b/frontend/src/scenes/insights/insightDataLogic.tsx index 1de631587cf35..b5036267488b0 100644 --- a/frontend/src/scenes/insights/insightDataLogic.tsx +++ b/frontend/src/scenes/insights/insightDataLogic.tsx @@ -219,6 +219,7 @@ export const insightDataLogic = kea([ if (isQueryTooLarge(query)) { localStorage.removeItem(`draft-query-${values.currentTeamId}`) } + localStorage.setItem( `draft-query-${values.currentTeamId}`, crushDraftQueryForLocalStorage(query, Date.now()) diff --git a/frontend/src/scenes/insights/utils.tsx b/frontend/src/scenes/insights/utils.tsx index 5a1b4d56ec7d9..04e55519ab05b 100644 --- a/frontend/src/scenes/insights/utils.tsx +++ b/frontend/src/scenes/insights/utils.tsx @@ -1,4 +1,3 @@ -import JSONCrush from 'jsoncrush' import api from 'lib/api' import { dayjs } from 'lib/dayjs' import { CORE_FILTER_DEFINITIONS_BY_GROUP } from 'lib/taxonomy' @@ -445,40 +444,27 @@ export function isQueryTooLarge(query: Node>): boolean { export function parseDraftQueryFromLocalStorage( query: string ): { query: Node>; timestamp: number } | null { - // First try to uncrush the query if it's a JSONCrush query else fall back to parsing it as a JSON try { - const uncrushedQuery = JSONCrush.uncrush(query) - return JSON.parse(uncrushedQuery) + return JSON.parse(query) } catch (e) { - console.error('Error parsing uncrushed query', e) - try { - return JSON.parse(query) - } catch (e) { - console.error('Error parsing query', e) - return null - } + console.error('Error parsing query', 
e) + return null } } export function crushDraftQueryForLocalStorage(query: Node>, timestamp: number): string { - return JSONCrush.crush(JSON.stringify({ query, timestamp })) + return JSON.stringify({ query, timestamp }) } export function parseDraftQueryFromURL(query: string): Node> | null { try { - const uncrushedQuery = JSONCrush.uncrush(query) - return JSON.parse(uncrushedQuery) + return JSON.parse(query) } catch (e) { - console.error('Error parsing uncrushed query', e) - try { - return JSON.parse(query) - } catch (e) { - console.error('Error parsing query', e) - return null - } + console.error('Error parsing query', e) + return null } } export function crushDraftQueryForURL(query: Node>): string { - return JSONCrush.crush(JSON.stringify(query)) + return JSON.stringify(query) } diff --git a/frontend/src/scenes/surveys/surveyViewViz.tsx b/frontend/src/scenes/surveys/surveyViewViz.tsx index 8e19c575fef10..7384fec294c94 100644 --- a/frontend/src/scenes/surveys/surveyViewViz.tsx +++ b/frontend/src/scenes/surveys/surveyViewViz.tsx @@ -207,6 +207,7 @@ export function RatingQuestionBarChart({ } useEffect(() => { loadSurveyRatingResults({ questionIndex, iteration }) + // eslint-disable-next-line react-hooks/exhaustive-deps }, [questionIndex]) return ( @@ -301,6 +302,7 @@ export function NPSSurveyResultsBarChart({ useEffect(() => { loadSurveyRecurringNPSResults({ questionIndex }) + // eslint-disable-next-line react-hooks/exhaustive-deps }, [questionIndex]) return ( @@ -397,6 +399,7 @@ export function SingleChoiceQuestionPieChart({ useEffect(() => { loadSurveySingleChoiceResults({ questionIndex }) + // eslint-disable-next-line react-hooks/exhaustive-deps }, [questionIndex]) return ( @@ -499,12 +502,15 @@ export function MultipleChoiceQuestionBarChart({ useEffect(() => { loadSurveyMultipleChoiceResults({ questionIndex }) + // eslint-disable-next-line react-hooks/exhaustive-deps }, [questionIndex]) useEffect(() => { if (surveyMultipleChoiceResults?.[questionIndex]?.data?.length) { setChartHeight(100 + 20 * surveyMultipleChoiceResults[questionIndex].data.length) } + // TODO this one maybe should have questionIndex as a dependency + // eslint-disable-next-line react-hooks/exhaustive-deps }, [surveyMultipleChoiceResults]) return ( @@ -581,6 +587,7 @@ export function OpenTextViz({ useEffect(() => { loadSurveyOpenTextResults({ questionIndex }) + // eslint-disable-next-line react-hooks/exhaustive-deps }, [questionIndex]) return ( @@ -736,9 +743,9 @@ function ResponseSummaryFeedback({ surveyId }: { surveyId: string }): JSX.Elemen return // Already rated } setRating(newRating) - posthog.capture('survey_resonse_rated', { + posthog.capture('ai_survey_summary_rated', { survey_id: surveyId, - answer_rating: rating, + answer_rating: newRating, }) } diff --git a/frontend/src/scenes/urls.ts b/frontend/src/scenes/urls.ts index 293481f598e3b..477064de6497e 100644 --- a/frontend/src/scenes/urls.ts +++ b/frontend/src/scenes/urls.ts @@ -1,4 +1,3 @@ -import JSONCrush from 'jsoncrush' import { combineUrl } from 'kea-router' import { AlertType } from 'lib/components/Alerts/types' import { getCurrentTeamId } from 'lib/utils/getAppContext' @@ -75,8 +74,7 @@ export const urls = { insightNew: (type?: InsightType, dashboardId?: DashboardType['id'] | null, query?: Node): string => combineUrl('/insights/new', dashboardId ? { dashboard: dashboardId } : {}, { ...(type ? { insight: type } : {}), - // have to use JSONCrush directly rather than the util to avoid circular dep - ...(query ? { q: typeof query === 'string' ? 
query : JSONCrush.crush(JSON.stringify(query)) } : {}), + ...(query ? { q: typeof query === 'string' ? query : JSON.stringify(query) } : {}), }).url, insightNewHogQL: (query: string, filters?: HogQLFilters): string => combineUrl( diff --git a/jest.config.ts b/jest.config.ts index 39ab8232a5592..53e5df5943413 100644 --- a/jest.config.ts +++ b/jest.config.ts @@ -8,7 +8,7 @@ process.env.TZ = process.env.TZ || 'UTC' * https://jestjs.io/docs/en/configuration.html */ -const esmModules = ['query-selector-shadow-dom', 'react-syntax-highlighter', '@react-hook', '@medv', 'monaco-editor', 'jsoncrush'] +const esmModules = ['query-selector-shadow-dom', 'react-syntax-highlighter', '@react-hook', '@medv', 'monaco-editor'] const eeFolderExists = fs.existsSync('ee/frontend/exports.ts') function rootDirectories() { const rootDirectories = ['/frontend/src'] diff --git a/package.json b/package.json index 6944c5b335e23..06dc6653b7713 100644 --- a/package.json +++ b/package.json @@ -140,7 +140,6 @@ "hls.js": "^1.5.15", "husky": "^7.0.4", "image-blob-reduce": "^4.1.0", - "jsoncrush": "^1.1.8", "kea": "^3.1.5", "kea-forms": "^3.2.0", "kea-loaders": "^3.0.0", @@ -162,7 +161,7 @@ "pmtiles": "^2.11.0", "postcss": "^8.4.31", "postcss-preset-env": "^9.3.0", - "posthog-js": "1.202.2", + "posthog-js": "1.202.4", "posthog-js-lite": "3.0.0", "prettier": "^2.8.8", "prop-types": "^15.7.2", diff --git a/plugin-server/package.json b/plugin-server/package.json index 11df155e0757c..9014d19be548b 100644 --- a/plugin-server/package.json +++ b/plugin-server/package.json @@ -69,16 +69,17 @@ "express": "^4.18.2", "faker": "^5.5.3", "fast-deep-equal": "^3.1.3", + "fastpriorityqueue": "^0.7.5", "fernet-nodejs": "^1.0.6", "generic-pool": "^3.7.1", "graphile-worker": "0.13.0", "ioredis": "^4.27.6", "ipaddr.js": "^2.1.0", "kafkajs": "^2.2.0", - "lz4-kafkajs": "1.0.0", "kafkajs-snappy": "^1.1.0", "lru-cache": "^6.0.0", "luxon": "^3.4.4", + "lz4-kafkajs": "1.0.0", "node-fetch": "^2.6.1", "node-rdkafka": "^2.17.0", "node-schedule": "^2.1.0", diff --git a/plugin-server/pnpm-lock.yaml b/plugin-server/pnpm-lock.yaml index c297462845d8e..f187191553102 100644 --- a/plugin-server/pnpm-lock.yaml +++ b/plugin-server/pnpm-lock.yaml @@ -91,6 +91,9 @@ dependencies: fast-deep-equal: specifier: ^3.1.3 version: 3.1.3 + fastpriorityqueue: + specifier: ^0.7.5 + version: 0.7.5 fernet-nodejs: specifier: ^1.0.6 version: 1.0.6 @@ -6276,6 +6279,10 @@ packages: strnum: 1.0.5 dev: false + /fastpriorityqueue@0.7.5: + resolution: {integrity: sha512-3Pa0n9gwy8yIbEsT3m2j/E9DXgWvvjfiZjjqcJ+AdNKTAlVMIuFYrYG5Y3RHEM8O6cwv9hOpOWY/NaMfywoQVA==} + dev: false + /fastq@1.15.0: resolution: {integrity: sha512-wBrocU2LCXXa+lWBt8RoIRD89Fi8OdABODa/kEnyeyjS5aZO5/GNvI5sEINADqP/h8M29UHTHUb53sUu5Ihqdw==} dependencies: diff --git a/plugin-server/src/utils/concurrencyController.ts b/plugin-server/src/utils/concurrencyController.ts new file mode 100644 index 0000000000000..ac84d439fd507 --- /dev/null +++ b/plugin-server/src/utils/concurrencyController.ts @@ -0,0 +1,133 @@ +import FastPriorityQueue from 'fastpriorityqueue' + +export function promiseResolveReject(): { + resolve: (value: T) => void + reject: (reason?: any) => void + promise: Promise +} { + let resolve: (value: T) => void + let reject: (reason?: any) => void + const promise = new Promise((innerResolve, innerReject) => { + resolve = innerResolve + reject = innerReject + }) + return { resolve: resolve!, reject: reject!, promise } +} + +// Note that this file also exists in the frontend code, please keep them in sync as the 
tests only exist in the other version +class ConcurrencyControllerItem { + _debugTag?: string + _runFn: () => Promise + _priority: number = Infinity + _promise: Promise + constructor( + concurrencyController: ConcurrencyController, + userFn: () => Promise, + abortController: AbortController | undefined, + priority: number = Infinity, + debugTag: string | undefined + ) { + this._debugTag = debugTag + this._priority = priority + const { promise, resolve, reject } = promiseResolveReject() + this._promise = promise + this._runFn = async () => { + if (abortController?.signal.aborted) { + reject(new FakeAbortError(abortController.signal.reason || 'AbortError')) + return + } + if (concurrencyController._current.length >= concurrencyController._concurrencyLimit) { + throw new Error('Developer Error: ConcurrencyControllerItem: _runFn called while already running') + } + try { + concurrencyController._current.push(this) + const result = await userFn() + resolve(result) + } catch (error) { + reject(error) + } + } + abortController?.signal.addEventListener('abort', () => { + reject(new FakeAbortError(abortController.signal.reason || 'AbortError')) + }) + promise + .catch(() => { + // ignore + }) + .finally(() => { + if (concurrencyController._current.includes(this)) { + concurrencyController._current = concurrencyController._current.filter((item) => item !== this) + concurrencyController._runNext() + } + }) + } +} + +export class ConcurrencyController { + _concurrencyLimit: number + + _current: ConcurrencyControllerItem[] = [] + private _queue: FastPriorityQueue> = new FastPriorityQueue( + (a, b) => a._priority < b._priority + ) + + constructor(concurrencyLimit: number) { + this._concurrencyLimit = concurrencyLimit + } + + /** + * Run a function with a mutex. If the mutex is already running, the function will be queued and run when the mutex + * is available. + * @param fn The function to run + * @param priority The priority of the function. Lower numbers will be run first. Defaults to Infinity. + * @param abortController An AbortController that, if aborted, will reject the promise and immediately start the next item in the queue. 
+ * @param debugTag + */ + run = ({ + fn, + priority, + abortController, + debugTag, + }: { + fn: () => Promise + priority?: number + abortController?: AbortController + debugTag?: string + }): Promise => { + const item = new ConcurrencyControllerItem(this, fn, abortController, priority, debugTag) + + this._queue.add(item) + + this._tryRunNext() + + return item._promise + } + + _runNext(): void { + const next = this._queue.poll() + if (next) { + next._runFn() + .catch(() => { + // ignore + }) + .finally(() => { + this._tryRunNext() + }) + } + } + + _tryRunNext(): void { + if (this._current.length < this._concurrencyLimit) { + this._runNext() + } + } + + setConcurrencyLimit = (limit: number): void => { + this._concurrencyLimit = limit + } +} + +// Create a fake AbortError that allows us to use e.name === 'AbortError' to check if an error is an AbortError +class FakeAbortError extends Error { + name = 'AbortError' +} diff --git a/pnpm-lock.yaml b/pnpm-lock.yaml index f383926dd1807..765be82fa6826 100644 --- a/pnpm-lock.yaml +++ b/pnpm-lock.yaml @@ -241,9 +241,6 @@ dependencies: image-blob-reduce: specifier: ^4.1.0 version: 4.1.0 - jsoncrush: - specifier: ^1.1.8 - version: 1.1.8 kea: specifier: ^3.1.5 version: 3.1.5(react@18.2.0) @@ -308,8 +305,8 @@ dependencies: specifier: ^9.3.0 version: 9.3.0(postcss@8.4.31) posthog-js: - specifier: 1.202.2 - version: 1.202.2 + specifier: 1.202.4 + version: 1.202.4 posthog-js-lite: specifier: 3.0.0 version: 3.0.0 @@ -13319,7 +13316,7 @@ packages: gopd: 1.2.0 has-symbols: 1.1.0 hasown: 2.0.2 - math-intrinsics: 1.0.0 + math-intrinsics: 1.1.0 dev: true /get-nonce@1.0.1: @@ -14076,7 +14073,7 @@ packages: hogan.js: 3.0.2 htm: 3.1.1 instantsearch-ui-components: 0.3.0 - preact: 10.25.2 + preact: 10.25.3 qs: 6.9.7 search-insights: 2.13.0 dev: false @@ -15415,10 +15412,6 @@ packages: resolution: {integrity: sha512-gfFQZrcTc8CnKXp6Y4/CBT3fTc0OVuDofpre4aEeEpSBPV5X5v4+Vmx+8snU7RLPrNHPKSgLxGo9YuQzz20o+w==} dev: true - /jsoncrush@1.1.8: - resolution: {integrity: sha512-lvIMGzMUA0fjuqwNcxlTNRq2bibPZ9auqT/LyGdlR5hvydJtA/BasSgkx4qclqTKVeTidrJvsS/oVjlTCPQ4Nw==} - dev: false - /jsonfile@6.1.0: resolution: {integrity: sha512-5dgndWOriYSm5cnYaJNhalLNDKOqFwyDB/rr1E9ZsGciGvKPs8R2xYGCacuf3z6K1YKDz182fd+fY3cn3pMqXQ==} dependencies: @@ -16003,8 +15996,8 @@ packages: resolution: {integrity: sha512-6qE4B9deFBIa9YSpOc9O0Sgc43zTeVYbgDT5veRKSlB2+ZuHNoVVxA1L/ckMUayV9Ay9y7Z/SZCLcGteW9i7bg==} dev: false - /math-intrinsics@1.0.0: - resolution: {integrity: sha512-4MqMiKP90ybymYvsut0CH2g4XWbfLtmlCkXmtmdcDCxNB+mQcu1w/1+L/VD7vi/PSv7X2JYV7SCcR+jiPXnQtA==} + /math-intrinsics@1.1.0: + resolution: {integrity: sha512-/IXtbwEk5HTPyEwyKX6hGkYXxM9nbj64B+ilVJnC/R6B0pH5G4V3b0pVbL7DBj4tkhBAppbQUlf6F6Xl9LHu1g==} engines: {node: '>= 0.4'} dev: true @@ -17909,12 +17902,12 @@ packages: resolution: {integrity: sha512-dyajjnfzZD1tht4N7p7iwf7nBnR1MjVaVu+MKr+7gBgA39bn28wizCIJZztZPtHy4PY0YwtSGgwfBCuG/hnHgA==} dev: false - /posthog-js@1.202.2: - resolution: {integrity: sha512-9p7dAWuCfoM0WrasubGwtC8i38HU3iMqK3gd0mhyAoTrEVMVozTQq64Toc2VEv8H69NGNn6ikk5t2LclHT9XFA==} + /posthog-js@1.202.4: + resolution: {integrity: sha512-YLu6f1ibAkiopGivGQnLBaCKegT+0GHP3DfP72z3KVby2UXLBB7dj+GIa54zfjburE7ehasysRzeK5nW39QOfA==} dependencies: core-js: 3.39.0 fflate: 0.4.8 - preact: 10.25.2 + preact: 10.25.3 web-vitals: 4.2.4 dev: false @@ -17922,8 +17915,8 @@ packages: resolution: {integrity: sha512-Q+/tYsFU9r7xoOJ+y/ZTtdVQwTWfzjbiXBDMM/JKUux3+QPP02iUuIoeBQ+Ot6oEDlC+/PGjB/5A3K7KKb7hcw==} dev: false - /preact@10.25.2: - resolution: 
{integrity: sha512-GEts1EH3oMnqdOIeXhlbBSddZ9nrINd070WBOiPO2ous1orrKGUM4SMDbwyjSWD1iMS2dBvaDjAa5qUhz3TXqw==} + /preact@10.25.3: + resolution: {integrity: sha512-dzQmIFtM970z+fP9ziQ3yG4e3ULIbwZzJ734vaMVUTaKQ2+Ru1Ou/gjshOYVHCcd1rpAelC6ngjvjDXph98unQ==} dev: false /prelude-ls@1.2.1: diff --git a/posthog/conftest.py b/posthog/conftest.py index c27dbec43955e..e9804d25eff42 100644 --- a/posthog/conftest.py +++ b/posthog/conftest.py @@ -14,6 +14,7 @@ def create_clickhouse_tables(num_tables: int): CREATE_DATA_QUERIES, CREATE_DICTIONARY_QUERIES, CREATE_DISTRIBUTED_TABLE_QUERIES, + CREATE_KAFKA_TABLE_QUERIES, CREATE_MERGETREE_TABLE_QUERIES, CREATE_MV_TABLE_QUERIES, CREATE_VIEW_QUERIES, @@ -28,10 +29,18 @@ def create_clickhouse_tables(num_tables: int): + len(CREATE_DICTIONARY_QUERIES) ) + # Evaluation tests use Kafka for faster data ingestion. + if settings.IN_EVAL_TESTING: + total_tables += len(CREATE_KAFKA_TABLE_QUERIES) + # Check if all the tables have already been created. Views, materialized views, and dictionaries also count if num_tables == total_tables: return + if settings.IN_EVAL_TESTING: + kafka_table_queries = list(map(build_query, CREATE_KAFKA_TABLE_QUERIES)) + run_clickhouse_statement_in_parallel(kafka_table_queries) + table_queries = list(map(build_query, CREATE_MERGETREE_TABLE_QUERIES + CREATE_DISTRIBUTED_TABLE_QUERIES)) run_clickhouse_statement_in_parallel(table_queries) @@ -62,7 +71,7 @@ def reset_clickhouse_tables(): from posthog.models.channel_type.sql import TRUNCATE_CHANNEL_DEFINITION_TABLE_SQL from posthog.models.cohort.sql import TRUNCATE_COHORTPEOPLE_TABLE_SQL from posthog.models.error_tracking.sql import TRUNCATE_ERROR_TRACKING_ISSUE_FINGERPRINT_OVERRIDES_TABLE_SQL - from posthog.models.event.sql import TRUNCATE_EVENTS_TABLE_SQL, TRUNCATE_EVENTS_RECENT_TABLE_SQL + from posthog.models.event.sql import TRUNCATE_EVENTS_RECENT_TABLE_SQL, TRUNCATE_EVENTS_TABLE_SQL from posthog.models.group.sql import TRUNCATE_GROUPS_TABLE_SQL from posthog.models.performance.sql import TRUNCATE_PERFORMANCE_EVENTS_TABLE_SQL from posthog.models.person.sql import ( @@ -100,6 +109,18 @@ def reset_clickhouse_tables(): TRUNCATE_HEATMAPS_TABLE_SQL(), ] + # Drop created Kafka tables because some tests don't expect it. + if settings.IN_EVAL_TESTING: + kafka_tables = sync_execute( + f""" + SELECT name + FROM system.tables + WHERE database = '{settings.CLICKHOUSE_DATABASE}' AND name LIKE 'kafka_%' + """, + ) + # Using `ON CLUSTER` takes x20 more time to drop the tables: https://github.com/ClickHouse/ClickHouse/issues/15473. 
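Both conftest hunks hinge on the same idea: evaluation runs need the kafka_* ClickHouse tables for ingestion, while ordinary test runs must not see them afterwards. A minimal sketch of that lifecycle, assuming the helpers already used in posthog/conftest.py (sync_execute, build_query, CREATE_KAFKA_TABLE_QUERIES, run_clickhouse_statement_in_parallel); the two wrapper function names are made up purely for illustration:

```python
# Minimal sketch of the eval-only Kafka table lifecycle, assuming the helpers
# behave as they do in posthog/conftest.py. The wrapper names are hypothetical.
from django.conf import settings

from posthog.clickhouse.client import sync_execute
from posthog.clickhouse.schema import CREATE_KAFKA_TABLE_QUERIES, build_query
from posthog.test.base import run_clickhouse_statement_in_parallel


def create_kafka_tables_for_eval() -> None:
    # Evaluation runs ingest demo data through Kafka, so the kafka_* tables must exist.
    if settings.IN_EVAL_TESTING:
        run_clickhouse_statement_in_parallel([build_query(q) for q in CREATE_KAFKA_TABLE_QUERIES])


def drop_kafka_tables_for_eval() -> None:
    # Regular tests don't expect kafka_* tables, so drop them again afterwards.
    if not settings.IN_EVAL_TESTING:
        return
    kafka_tables = sync_execute(
        f"""
        SELECT name
        FROM system.tables
        WHERE database = '{settings.CLICKHOUSE_DATABASE}' AND name LIKE 'kafka_%'
        """,
    )
    # Plain DROP TABLE, without ON CLUSTER, keeps teardown fast (see the comment above).
    run_clickhouse_statement_in_parallel([f"DROP TABLE {row[0]}" for row in kafka_tables])
```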
+ TABLES_TO_CREATE_DROP += [f"DROP TABLE {table[0]}" for table in kafka_tables] + run_clickhouse_statement_in_parallel(TABLES_TO_CREATE_DROP) from posthog.clickhouse.schema import ( diff --git a/posthog/hogql_queries/experiments/experiment_funnels_query_runner.py b/posthog/hogql_queries/experiments/experiment_funnels_query_runner.py index d9d874d7b6e6b..363ee06bc78be 100644 --- a/posthog/hogql_queries/experiments/experiment_funnels_query_runner.py +++ b/posthog/hogql_queries/experiments/experiment_funnels_query_runner.py @@ -7,6 +7,11 @@ calculate_credible_intervals, calculate_probabilities, ) +from posthog.hogql_queries.experiments.funnels_statistics_v2 import ( + are_results_significant_v2, + calculate_credible_intervals_v2, + calculate_probabilities_v2, +) from posthog.hogql_queries.query_runner import QueryRunner from posthog.models.experiment import Experiment from ..insights.funnels.funnels_query_runner import FunnelsQueryRunner @@ -45,6 +50,8 @@ def __init__(self, *args, **kwargs): if self.experiment.holdout: self.variants.append(f"holdout-{self.experiment.holdout.id}") + self.stats_version = self.query.stats_version or 1 + self.prepared_funnels_query = self._prepare_funnel_query() self.funnels_query_runner = FunnelsQueryRunner( query=self.prepared_funnels_query, team=self.team, timings=self.timings, limit_context=self.limit_context @@ -63,9 +70,14 @@ def calculate(self) -> ExperimentFunnelsQueryResponse: # Statistical analysis control_variant, test_variants = self._get_variants_with_base_stats(funnels_result) - probabilities = calculate_probabilities(control_variant, test_variants) - significance_code, loss = are_results_significant(control_variant, test_variants, probabilities) - credible_intervals = calculate_credible_intervals([control_variant, *test_variants]) + if self.stats_version == 2: + probabilities = calculate_probabilities_v2(control_variant, test_variants) + significance_code, loss = are_results_significant_v2(control_variant, test_variants, probabilities) + credible_intervals = calculate_credible_intervals_v2([control_variant, *test_variants]) + else: + probabilities = calculate_probabilities(control_variant, test_variants) + significance_code, loss = are_results_significant(control_variant, test_variants, probabilities) + credible_intervals = calculate_credible_intervals([control_variant, *test_variants]) except Exception as e: raise ValueError(f"Error calculating experiment funnel results: {str(e)}") from e @@ -80,6 +92,7 @@ def calculate(self) -> ExperimentFunnelsQueryResponse: }, significant=significance_code == ExperimentSignificanceCode.SIGNIFICANT, significance_code=significance_code, + stats_version=self.stats_version, expected_loss=loss, credible_intervals=credible_intervals, ) diff --git a/posthog/hogql_queries/experiments/funnels_statistics_v2.py b/posthog/hogql_queries/experiments/funnels_statistics_v2.py new file mode 100644 index 0000000000000..02f18d2f70740 --- /dev/null +++ b/posthog/hogql_queries/experiments/funnels_statistics_v2.py @@ -0,0 +1,216 @@ +import numpy as np +from scipy import stats +from posthog.schema import ExperimentVariantFunnelsBaseStats, ExperimentSignificanceCode +from posthog.hogql_queries.experiments import ( + FF_DISTRIBUTION_THRESHOLD, + MIN_PROBABILITY_FOR_SIGNIFICANCE, + EXPECTED_LOSS_SIGNIFICANCE_LEVEL, +) +from scipy.stats import betabinom + +ALPHA_PRIOR = 1 +BETA_PRIOR = 1 +SAMPLE_SIZE = 10000 + + +def calculate_probabilities_v2( + control: ExperimentVariantFunnelsBaseStats, variants: 
list[ExperimentVariantFunnelsBaseStats] +) -> list[float]: + """ + Calculate the win probabilities for each variant in an experiment using Bayesian analysis + for funnel conversion rates. + + This function computes the probability that each variant is the best (i.e., has the highest + conversion rate) compared to all other variants, including the control. It uses samples + drawn from the posterior Beta distributions of each variant's conversion rate. + + Parameters: + ----------- + control : ExperimentVariantFunnelsBaseStats + Statistics for the control group, including success and failure counts + variants : list[ExperimentVariantFunnelsBaseStats] + List of statistics for test variants to compare against the control + + Returns: + -------- + list[float] + A list of probabilities where: + - The first element is the probability that the control variant is the best + - Subsequent elements are the probabilities that each test variant is the best + + Notes: + ------ + - Uses a Bayesian approach with Beta distributions as the posterior + - Uses Beta(1,1) as the prior, which is uniform over [0,1] + - Draws 10,000 samples from each variant's posterior distribution + """ + all_variants = [control, *variants] + + # Use Beta distribution for conversion rates + samples: list[np.ndarray] = [] + for variant in all_variants: + # Add prior to both successes and failures for Bayesian prior + alpha = ALPHA_PRIOR + variant.success_count + beta = BETA_PRIOR + variant.failure_count + # Generate samples from Beta distribution + variant_samples = np.random.beta(alpha, beta, SAMPLE_SIZE) + samples.append(variant_samples) + + samples_array = np.array(samples) + # Calculate probability of each variant being the best + probabilities = [] + for i in range(len(all_variants)): + probability = (samples_array[i] == np.max(samples_array, axis=0)).mean() + probabilities.append(float(probability)) + + return probabilities + + +def calculate_expected_loss_v2( + target_variant: ExperimentVariantFunnelsBaseStats, variants: list[ExperimentVariantFunnelsBaseStats] +) -> float: + """ + Calculates expected loss in conversion rate using Beta-Binomial conjugate prior. + + This implementation uses a Bayesian approach with Beta-Binomial model + to estimate the expected loss when choosing the target variant over others. 
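For intuition, calculate_probabilities_v2 above boils down to sampling each variant's Beta posterior and counting how often it comes out on top. A tiny self-contained rerun of that Monte-Carlo step with made-up conversion counts (variant names and numbers are illustrative only):

```python
# Toy rerun of the Beta-posterior sampling above, with hypothetical counts.
# Each variant's posterior is Beta(1 + successes, 1 + failures); its win
# probability is the fraction of joint draws in which it has the highest rate.
import numpy as np

rng = np.random.default_rng(42)
counts = {"control": (100, 900), "test": (130, 870)}  # (success_count, failure_count)

samples = np.stack([rng.beta(1 + s, 1 + f, size=10_000) for s, f in counts.values()])
wins = samples == samples.max(axis=0)
for key, p in zip(counts, wins.mean(axis=1)):
    print(f"P({key} has the best conversion rate) ≈ {p:.2f}")
```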
+ + Parameters: + ----------- + target_variant : ExperimentVariantFunnelsBaseStats + The variant being evaluated for loss + variants : list[ExperimentVariantFunnelsBaseStats] + List of other variants to compare against + + Returns: + -------- + float + Expected loss in conversion rate if choosing the target variant + """ + # Calculate posterior parameters for target variant + target_alpha = int(ALPHA_PRIOR + target_variant.success_count) + target_beta = int(BETA_PRIOR + target_variant.failure_count) + target_n = int(target_variant.success_count + target_variant.failure_count) + + # Get samples from target variant's Beta-Binomial + target_samples = betabinom.rvs(target_n, target_alpha, target_beta, size=SAMPLE_SIZE) / target_n + + # Get samples from each comparison variant + variant_samples = [] + for variant in variants: + n = int(variant.success_count + variant.failure_count) + alpha = int(ALPHA_PRIOR + variant.success_count) + beta = int(BETA_PRIOR + variant.failure_count) + samples = betabinom.rvs(n, alpha, beta, size=SAMPLE_SIZE) / n + variant_samples.append(samples) + + # Calculate loss + variant_max = np.maximum.reduce(variant_samples) + losses = np.maximum(0, variant_max - target_samples) + expected_loss = float(np.mean(losses)) + + return expected_loss + + +def are_results_significant_v2( + control: ExperimentVariantFunnelsBaseStats, + variants: list[ExperimentVariantFunnelsBaseStats], + probabilities: list[float], +) -> tuple[ExperimentSignificanceCode, float]: + """ + Determine if the experiment results are statistically significant using Bayesian analysis + for funnel conversion rates. + + This function evaluates whether there is strong evidence that any variant is better + than the others by considering both winning probabilities and expected loss. It checks + if the sample size is sufficient and evaluates the risk of choosing the winning variant. + + Parameters: + ----------- + control : ExperimentVariantFunnelsBaseStats + Statistics for the control group, including success and failure counts + variants : list[ExperimentVariantFunnelsBaseStats] + List of statistics for test variants to compare against the control + probabilities : list[float] + List of probabilities from calculate_probabilities_v2 + + Returns: + -------- + tuple[ExperimentSignificanceCode, float] + A tuple containing: + - Significance code indicating the result (significant, not enough exposure, high loss, etc.) 
+ - Expected loss value for significant results, 1.0 for non-significant results + + Notes: + ------ + - Requires minimum exposure threshold per variant for reliable results + - Uses probability threshold from MIN_PROBABILITY_FOR_SIGNIFICANCE + - Calculates expected loss for the best-performing variant + - Returns HIGH_LOSS if expected loss exceeds significance threshold + - Returns NOT_ENOUGH_EXPOSURE if sample size requirements not met + """ + # Check minimum exposure + if control.success_count + control.failure_count < FF_DISTRIBUTION_THRESHOLD or any( + v.success_count + v.failure_count < FF_DISTRIBUTION_THRESHOLD for v in variants + ): + return ExperimentSignificanceCode.NOT_ENOUGH_EXPOSURE, 1.0 + + # Check if any variant has high enough probability + max_probability = max(probabilities) + if max_probability >= MIN_PROBABILITY_FOR_SIGNIFICANCE: + # Find best performing variant + all_variants = [control, *variants] + conversion_rates = [v.success_count / (v.success_count + v.failure_count) for v in all_variants] + best_idx = np.argmax(conversion_rates) + best_variant = all_variants[best_idx] + other_variants = all_variants[:best_idx] + all_variants[best_idx + 1 :] + expected_loss = calculate_expected_loss_v2(best_variant, other_variants) + + if expected_loss >= EXPECTED_LOSS_SIGNIFICANCE_LEVEL: + return ExperimentSignificanceCode.HIGH_LOSS, expected_loss + + return ExperimentSignificanceCode.SIGNIFICANT, expected_loss + + return ExperimentSignificanceCode.LOW_WIN_PROBABILITY, 1.0 + + +def calculate_credible_intervals_v2(variants: list[ExperimentVariantFunnelsBaseStats]) -> dict[str, list[float]]: + """ + Calculate Bayesian credible intervals for conversion rates of each variant. + + This function computes the 95% credible intervals for the true conversion rate + of each variant, representing the range where we believe the true rate lies + with 95% probability. 
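The credible-interval helper introduced here reduces to two Beta quantiles per variant. A quick numeric check under the same Beta(1, 1) prior, with hypothetical counts of 120 conversions out of 1,000 users:

```python
# Quick check of the 95% credible interval described above, with hypothetical counts:
# 120 conversions out of 1000 users -> posterior Beta(1 + 120, 1 + 880).
from scipy import stats

alpha, beta = 1 + 120, 1 + 880
lower, upper = stats.beta.ppf([0.025, 0.975], alpha, beta)
print(f"95% credible interval ≈ [{lower:.3f}, {upper:.3f}]")  # roughly [0.10, 0.14]
```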
+ + Parameters: + ----------- + variants : list[ExperimentVariantFunnelsBaseStats] + List of all variants including control, containing success and failure counts + + Returns: + -------- + dict[str, list[float]] + Dictionary mapping variant keys to [lower, upper] credible intervals, where: + - lower is the 2.5th percentile of the posterior distribution + - upper is the 97.5th percentile of the posterior distribution + + Notes: + ------ + - Uses Beta distribution as the posterior + - Uses Beta(1,1) as the prior, which is uniform over [0,1] + - Returns 95% credible intervals + - Intervals become narrower with larger sample sizes + """ + intervals = {} + + for variant in variants: + # Add 1 to both successes and failures for Bayesian prior + alpha = ALPHA_PRIOR + variant.success_count + beta = BETA_PRIOR + variant.failure_count + + # Calculate 95% credible interval + lower, upper = stats.beta.ppf([0.025, 0.975], alpha, beta) + + intervals[variant.key] = [float(lower), float(upper)] + + return intervals diff --git a/posthog/hogql_queries/experiments/test/test_experiment_funnels_query_runner.py b/posthog/hogql_queries/experiments/test/test_experiment_funnels_query_runner.py index 3f3127fcb1b55..3dc11499b1393 100644 --- a/posthog/hogql_queries/experiments/test/test_experiment_funnels_query_runner.py +++ b/posthog/hogql_queries/experiments/test/test_experiment_funnels_query_runner.py @@ -113,6 +113,8 @@ def test_query_runner(self): ) result = query_runner.calculate() + self.assertEqual(result.stats_version, 1) + self.assertEqual(len(result.variants), 2) control_variant = next(variant for variant in result.variants if variant.key == "control") @@ -123,11 +125,88 @@ def test_query_runner(self): self.assertEqual(test_variant.success_count, 8) self.assertEqual(test_variant.failure_count, 2) - self.assertIn("control", result.probability) - self.assertIn("test", result.probability) + self.assertAlmostEqual(result.probability["control"], 0.2, delta=0.1) + self.assertAlmostEqual(result.probability["test"], 0.8, delta=0.1) - self.assertIn("control", result.credible_intervals) - self.assertIn("test", result.credible_intervals) + self.assertEqual(result.significant, False) + self.assertEqual(result.significance_code, ExperimentSignificanceCode.NOT_ENOUGH_EXPOSURE) + self.assertEqual(result.expected_loss, 1.0) + + self.assertAlmostEqual(result.credible_intervals["control"][0], 0.3, delta=0.1) + self.assertAlmostEqual(result.credible_intervals["control"][1], 0.8, delta=0.1) + self.assertAlmostEqual(result.credible_intervals["test"][0], 0.5, delta=0.1) + self.assertAlmostEqual(result.credible_intervals["test"][1], 0.9, delta=0.1) + + @freeze_time("2020-01-01T12:00:00Z") + def test_query_runner_v2(self): + feature_flag = self.create_feature_flag() + experiment = self.create_experiment(feature_flag=feature_flag) + + feature_flag_property = f"$feature/{feature_flag.key}" + + funnels_query = FunnelsQuery( + series=[EventsNode(event="$pageview"), EventsNode(event="purchase")], + dateRange={"date_from": "2020-01-01", "date_to": "2020-01-14"}, + ) + experiment_query = ExperimentFunnelsQuery( + experiment_id=experiment.id, + kind="ExperimentFunnelsQuery", + funnels_query=funnels_query, + stats_version=2, + ) + + experiment.metrics = [{"type": "primary", "query": experiment_query.model_dump()}] + experiment.save() + + for variant, purchase_count in [("control", 6), ("test", 8)]: + for i in range(10): + _create_person(distinct_ids=[f"user_{variant}_{i}"], team_id=self.team.pk) + _create_event( + team=self.team, + 
event="$pageview", + distinct_id=f"user_{variant}_{i}", + timestamp="2020-01-02T12:00:00Z", + properties={feature_flag_property: variant}, + ) + if i < purchase_count: + _create_event( + team=self.team, + event="purchase", + distinct_id=f"user_{variant}_{i}", + timestamp="2020-01-02T12:01:00Z", + properties={feature_flag_property: variant}, + ) + + flush_persons_and_events() + + query_runner = ExperimentFunnelsQueryRunner( + query=ExperimentFunnelsQuery(**experiment.metrics[0]["query"]), team=self.team + ) + result = query_runner.calculate() + + self.assertEqual(result.stats_version, 2) + + self.assertEqual(len(result.variants), 2) + + control_variant = next(variant for variant in result.variants if variant.key == "control") + test_variant = next(variant for variant in result.variants if variant.key == "test") + + self.assertEqual(control_variant.success_count, 6) + self.assertEqual(control_variant.failure_count, 4) + self.assertEqual(test_variant.success_count, 8) + self.assertEqual(test_variant.failure_count, 2) + + self.assertAlmostEqual(result.probability["control"], 0.2, delta=0.1) + self.assertAlmostEqual(result.probability["test"], 0.8, delta=0.1) + + self.assertEqual(result.significant, False) + self.assertEqual(result.significance_code, ExperimentSignificanceCode.NOT_ENOUGH_EXPOSURE) + self.assertEqual(result.expected_loss, 1.0) + + self.assertAlmostEqual(result.credible_intervals["control"][0], 0.3, delta=0.1) + self.assertAlmostEqual(result.credible_intervals["control"][1], 0.8, delta=0.1) + self.assertAlmostEqual(result.credible_intervals["test"][0], 0.5, delta=0.1) + self.assertAlmostEqual(result.credible_intervals["test"][1], 0.9, delta=0.1) @flaky(max_runs=10, min_passes=1) @freeze_time("2020-01-01T12:00:00Z") diff --git a/posthog/hogql_queries/experiments/test/test_funnels_statistics.py b/posthog/hogql_queries/experiments/test/test_funnels_statistics.py new file mode 100644 index 0000000000000..2206ff92b9305 --- /dev/null +++ b/posthog/hogql_queries/experiments/test/test_funnels_statistics.py @@ -0,0 +1,243 @@ +from posthog.hogql_queries.experiments import MIN_PROBABILITY_FOR_SIGNIFICANCE +from posthog.schema import ExperimentVariantFunnelsBaseStats, ExperimentSignificanceCode +from posthog.hogql_queries.experiments.funnels_statistics_v2 import ( + calculate_probabilities_v2, + are_results_significant_v2, + calculate_credible_intervals_v2, +) +from posthog.hogql_queries.experiments.funnels_statistics import ( + calculate_probabilities, + are_results_significant, + calculate_credible_intervals, +) +from posthog.test.base import APIBaseTest + + +def create_variant( + key: str, + success_count: int, + failure_count: int, +) -> ExperimentVariantFunnelsBaseStats: + return ExperimentVariantFunnelsBaseStats( + key=key, + success_count=success_count, + failure_count=failure_count, + ) + + +class TestExperimentFunnelStatistics(APIBaseTest): + def run_test_for_both_implementations(self, test_fn): + """Run the same test for both implementations""" + self.stats_version = 1 + # Run for original implementation + test_fn( + stats_version=1, + calculate_probabilities=calculate_probabilities, + are_results_significant=are_results_significant, + calculate_credible_intervals=calculate_credible_intervals, + ) + self.stats_version = 2 + # Run for v2 implementation + test_fn( + stats_version=2, + calculate_probabilities=calculate_probabilities_v2, + are_results_significant=are_results_significant_v2, + calculate_credible_intervals=calculate_credible_intervals_v2, + ) + + def 
test_small_sample_two_variants_not_significant(self): + """Test with small sample size, two variants, no clear winner""" + + def run_test(stats_version, calculate_probabilities, are_results_significant, calculate_credible_intervals): + control = create_variant("control", success_count=10, failure_count=90) + test = create_variant("test", success_count=15, failure_count=85) + + probabilities = calculate_probabilities(control, [test]) + significance, p_value = are_results_significant(control, [test], probabilities) + intervals = calculate_credible_intervals([control, test]) + + self.assertEqual(len(probabilities), 2) + if stats_version == 2: + self.assertAlmostEqual(probabilities[0], 0.15, delta=0.1) + self.assertAlmostEqual(probabilities[1], 0.85, delta=0.1) + self.assertEqual(significance, ExperimentSignificanceCode.LOW_WIN_PROBABILITY) + self.assertEqual(p_value, 1) + + # Check credible intervals + self.assertAlmostEqual(intervals["control"][0], 0.05, delta=0.05) + self.assertAlmostEqual(intervals["control"][1], 0.20, delta=0.05) + self.assertAlmostEqual(intervals["test"][0], 0.08, delta=0.05) + self.assertAlmostEqual(intervals["test"][1], 0.25, delta=0.05) + else: + # Original implementation behavior + self.assertTrue(0.1 < probabilities[0] < 0.5) + self.assertTrue(0.5 < probabilities[1] < 0.9) + self.assertEqual(significance, ExperimentSignificanceCode.LOW_WIN_PROBABILITY) + self.assertEqual(p_value, 1) + + # Original implementation intervals + self.assertAlmostEqual(intervals["control"][0], 0.05, delta=0.05) + self.assertAlmostEqual(intervals["control"][1], 0.20, delta=0.05) + self.assertAlmostEqual(intervals["test"][0], 0.08, delta=0.05) + self.assertAlmostEqual(intervals["test"][1], 0.25, delta=0.05) + + self.run_test_for_both_implementations(run_test) + + def test_large_sample_two_variants_significant(self): + """Test with large sample size, two variants, clear winner""" + + def run_test(stats_version, calculate_probabilities, are_results_significant, calculate_credible_intervals): + control = create_variant("control", success_count=1000, failure_count=9000) + test = create_variant("test", success_count=1500, failure_count=8500) + + probabilities = calculate_probabilities(control, [test]) + significance, p_value = are_results_significant(control, [test], probabilities) + intervals = calculate_credible_intervals([control, test]) + + self.assertEqual(len(probabilities), 2) + if stats_version == 2: + self.assertAlmostEqual(probabilities[1], 1.0, delta=0.05) + self.assertAlmostEqual(probabilities[0], 0.0, delta=0.05) + self.assertEqual(significance, ExperimentSignificanceCode.SIGNIFICANT) + self.assertEqual(p_value, 0) + + # Check credible intervals + self.assertAlmostEqual(intervals["control"][0], 0.095, delta=0.01) + self.assertAlmostEqual(intervals["control"][1], 0.105, delta=0.01) + self.assertAlmostEqual(intervals["test"][0], 0.145, delta=0.01) + self.assertAlmostEqual(intervals["test"][1], 0.155, delta=0.01) + else: + # Original implementation behavior + self.assertTrue(probabilities[1] > 0.5) # Test variant winning + self.assertTrue(probabilities[0] < 0.5) # Control variant losing + self.assertEqual(significance, ExperimentSignificanceCode.SIGNIFICANT) + self.assertLess(p_value, 0.05) + + # Original implementation intervals + self.assertAlmostEqual(intervals["control"][0], 0.095, delta=0.01) + self.assertAlmostEqual(intervals["control"][1], 0.105, delta=0.01) + self.assertAlmostEqual(intervals["test"][0], 0.145, delta=0.01) + self.assertAlmostEqual(intervals["test"][1], 0.155, 
delta=0.01) + + self.run_test_for_both_implementations(run_test) + + def test_many_variants_not_significant(self): + """Test with multiple variants, no clear winner""" + + def run_test(stats_version, calculate_probabilities, are_results_significant, calculate_credible_intervals): + control = create_variant("control", success_count=100, failure_count=900) + test_a = create_variant("test_a", success_count=98, failure_count=902) + test_b = create_variant("test_b", success_count=102, failure_count=898) + test_c = create_variant("test_c", success_count=101, failure_count=899) + + probabilities = calculate_probabilities(control, [test_a, test_b, test_c]) + significance, p_value = are_results_significant(control, [test_a, test_b, test_c], probabilities) + intervals = calculate_credible_intervals([control, test_a, test_b, test_c]) + + self.assertEqual(len(probabilities), 4) + if stats_version == 2: + self.assertTrue(all(p < MIN_PROBABILITY_FOR_SIGNIFICANCE for p in probabilities)) + self.assertEqual(significance, ExperimentSignificanceCode.LOW_WIN_PROBABILITY) + self.assertEqual(p_value, 1) + + # Check credible intervals overlap + # Check credible intervals for control and all test variants + self.assertAlmostEqual(intervals["control"][0], 0.09, delta=0.02) + self.assertAlmostEqual(intervals["control"][1], 0.12, delta=0.02) + self.assertAlmostEqual(intervals["test_a"][0], 0.09, delta=0.02) + self.assertAlmostEqual(intervals["test_a"][1], 0.12, delta=0.02) + self.assertAlmostEqual(intervals["test_b"][0], 0.09, delta=0.02) + self.assertAlmostEqual(intervals["test_b"][1], 0.12, delta=0.02) + self.assertAlmostEqual(intervals["test_c"][0], 0.09, delta=0.02) + self.assertAlmostEqual(intervals["test_c"][1], 0.12, delta=0.02) + else: + # Original implementation behavior + self.assertTrue(all(0.1 < p < 0.9 for p in probabilities)) + self.assertEqual(significance, ExperimentSignificanceCode.LOW_WIN_PROBABILITY) + self.assertEqual(p_value, 1) + + # Check credible intervals overlap + # Check credible intervals for control and all test variants + self.assertAlmostEqual(intervals["control"][0], 0.09, delta=0.02) + self.assertAlmostEqual(intervals["control"][1], 0.12, delta=0.02) + self.assertAlmostEqual(intervals["test_a"][0], 0.09, delta=0.02) + self.assertAlmostEqual(intervals["test_a"][1], 0.12, delta=0.02) + self.assertAlmostEqual(intervals["test_b"][0], 0.09, delta=0.02) + self.assertAlmostEqual(intervals["test_b"][1], 0.12, delta=0.02) + self.assertAlmostEqual(intervals["test_c"][0], 0.09, delta=0.02) + self.assertAlmostEqual(intervals["test_c"][1], 0.12, delta=0.02) + + self.run_test_for_both_implementations(run_test) + + def test_insufficient_sample_size(self): + """Test with sample size below threshold""" + + def run_test(stats_version, calculate_probabilities, are_results_significant, calculate_credible_intervals): + control = create_variant("control", success_count=5, failure_count=45) + test = create_variant("test", success_count=8, failure_count=42) + + probabilities = calculate_probabilities(control, [test]) + significance, p_value = are_results_significant(control, [test], probabilities) + intervals = calculate_credible_intervals([control, test]) + + self.assertEqual(len(probabilities), 2) + if stats_version == 2: + self.assertEqual(significance, ExperimentSignificanceCode.NOT_ENOUGH_EXPOSURE) + self.assertEqual(p_value, 1.0) + + # Check wide credible intervals due to small sample + self.assertTrue(intervals["control"][1] - intervals["control"][0] > 0.15) + self.assertTrue(intervals["test"][1] - 
intervals["test"][0] > 0.15) + else: + # Original implementation behavior + self.assertEqual(significance, ExperimentSignificanceCode.NOT_ENOUGH_EXPOSURE) + self.assertEqual(p_value, 1.0) + + # Check wide credible intervals + self.assertTrue(intervals["control"][1] - intervals["control"][0] > 0.15) + self.assertTrue(intervals["test"][1] - intervals["test"][0] > 0.15) + + self.run_test_for_both_implementations(run_test) + + def test_expected_loss_minimal_difference(self): + """Test expected loss when variants have very similar performance""" + + def run_test(stats_version, calculate_probabilities, are_results_significant, calculate_credible_intervals): + control = create_variant("control", success_count=1000, failure_count=9000) # 11% conversion + test = create_variant("test", success_count=1050, failure_count=8800) # 11.9% conversion + + probabilities = calculate_probabilities(control, [test]) + significance, expected_loss = are_results_significant(control, [test], probabilities) + + if stats_version == 2: + self.assertEqual(significance, ExperimentSignificanceCode.SIGNIFICANT) + # Expected loss should still be relatively small + self.assertLess(expected_loss, 0.03) # Less than 3% expected loss + self.assertGreater(expected_loss, 0) + else: + # Original implementation behavior + self.assertEqual(significance, ExperimentSignificanceCode.SIGNIFICANT) + self.assertLess(expected_loss, 0.03) # Less than 3% expected loss + self.assertGreater(expected_loss, 0) + + self.run_test_for_both_implementations(run_test) + + def test_expected_loss_test_variant_clear_winner(self): + """Test expected loss when one variant is clearly better""" + + def run_test(stats_version, calculate_probabilities, are_results_significant, calculate_credible_intervals): + control = create_variant("control", success_count=1000, failure_count=9000) # 11% conversion + test = create_variant("test", success_count=2000, failure_count=8000) # 20% conversion + + probabilities = calculate_probabilities(control, [test]) + significance, expected_loss = are_results_significant(control, [test], probabilities) + + if stats_version == 2: + self.assertEqual(significance, ExperimentSignificanceCode.SIGNIFICANT) + self.assertEqual(expected_loss, 0.0) + else: + # Original implementation behavior + self.assertEqual(significance, ExperimentSignificanceCode.SIGNIFICANT) + self.assertEqual(expected_loss, 0.0) + + self.run_test_for_both_implementations(run_test) diff --git a/posthog/schema.py b/posthog/schema.py index 564dcc321fa60..a46596350f23f 100644 --- a/posthog/schema.py +++ b/posthog/schema.py @@ -6292,6 +6292,7 @@ class QueryResponseAlternative15(BaseModel): probability: dict[str, float] significance_code: ExperimentSignificanceCode significant: bool + stats_version: Optional[int] = None variants: list[ExperimentVariantFunnelsBaseStats] @@ -6324,6 +6325,7 @@ class QueryResponseAlternative26(BaseModel): probability: dict[str, float] significance_code: ExperimentSignificanceCode significant: bool + stats_version: Optional[int] = None variants: list[ExperimentVariantFunnelsBaseStats] @@ -6461,6 +6463,7 @@ class CachedExperimentFunnelsQueryResponse(BaseModel): ) significance_code: ExperimentSignificanceCode significant: bool + stats_version: Optional[int] = None timezone: str variants: list[ExperimentVariantFunnelsBaseStats] @@ -6477,6 +6480,7 @@ class Response9(BaseModel): probability: dict[str, float] significance_code: ExperimentSignificanceCode significant: bool + stats_version: Optional[int] = None variants: 
list[ExperimentVariantFunnelsBaseStats] @@ -6508,6 +6512,7 @@ class ExperimentFunnelsQueryResponse(BaseModel): probability: dict[str, float] significance_code: ExperimentSignificanceCode significant: bool + stats_version: Optional[int] = None variants: list[ExperimentVariantFunnelsBaseStats] @@ -6640,6 +6645,7 @@ class ExperimentFunnelsQuery(BaseModel): ) name: Optional[str] = None response: Optional[ExperimentFunnelsQueryResponse] = None + stats_version: Optional[int] = None class FunnelCorrelationQuery(BaseModel): diff --git a/posthog/settings/__init__.py b/posthog/settings/__init__.py index c6067fd19c1f7..3e7ebc0b7c984 100644 --- a/posthog/settings/__init__.py +++ b/posthog/settings/__init__.py @@ -108,6 +108,7 @@ PROM_PUSHGATEWAY_ADDRESS: str | None = os.getenv("PROM_PUSHGATEWAY_ADDRESS", None) IN_UNIT_TESTING: bool = get_from_env("IN_UNIT_TESTING", False, type_cast=str_to_bool) +IN_EVAL_TESTING: bool = get_from_env("DEEPEVAL", False, type_cast=str_to_bool) HOGQL_INCREASED_MAX_EXECUTION_TIME: int = get_from_env("HOGQL_INCREASED_MAX_EXECUTION_TIME", 600, type_cast=int) diff --git a/posthog/temporal/data_imports/pipelines/sql_database/helpers.py b/posthog/temporal/data_imports/pipelines/sql_database/helpers.py index 0400a60b32fd5..9bf72a26f3c1e 100644 --- a/posthog/temporal/data_imports/pipelines/sql_database/helpers.py +++ b/posthog/temporal/data_imports/pipelines/sql_database/helpers.py @@ -24,7 +24,7 @@ def __init__( self, engine: Engine, table: Table, - chunk_size: int = 1000, + chunk_size: int = DEFAULT_CHUNK_SIZE, incremental: Optional[dlt.sources.incremental[Any]] = None, connect_args: Optional[list[str]] = None, db_incremental_field_last_value: Optional[Any] = None, diff --git a/posthog/temporal/data_imports/pipelines/sql_database_v2/__init__.py b/posthog/temporal/data_imports/pipelines/sql_database_v2/__init__.py index a3fc1c6b2838b..de868d62ee51d 100644 --- a/posthog/temporal/data_imports/pipelines/sql_database_v2/__init__.py +++ b/posthog/temporal/data_imports/pipelines/sql_database_v2/__init__.py @@ -15,6 +15,7 @@ from dlt.sources.credentials import ConnectionStringCredentials from posthog.settings.utils import get_from_env +from posthog.temporal.data_imports.pipelines.sql_database_v2.settings import DEFAULT_CHUNK_SIZE from posthog.temporal.data_imports.pipelines.sql_database_v2._json import BigQueryJSON from posthog.utils import str_to_bool from posthog.warehouse.models import ExternalDataSource @@ -252,7 +253,7 @@ def sql_database( schema: Optional[str] = dlt.config.value, metadata: Optional[MetaData] = None, table_names: Optional[list[str]] = dlt.config.value, - chunk_size: int = 50000, + chunk_size: int = DEFAULT_CHUNK_SIZE, backend: TableBackend = "pyarrow", detect_precision_hints: Optional[bool] = False, reflection_level: Optional[ReflectionLevel] = "full", diff --git a/posthog/temporal/data_imports/pipelines/sql_database_v2/helpers.py b/posthog/temporal/data_imports/pipelines/sql_database_v2/helpers.py index acd64c97aae99..74a79650caa15 100644 --- a/posthog/temporal/data_imports/pipelines/sql_database_v2/helpers.py +++ b/posthog/temporal/data_imports/pipelines/sql_database_v2/helpers.py @@ -18,6 +18,8 @@ from dlt.sources.credentials import ConnectionStringCredentials +from posthog.temporal.data_imports.pipelines.sql_database_v2.settings import DEFAULT_CHUNK_SIZE + from .arrow_helpers import row_tuples_to_arrow from .schema_types import ( default_table_adapter, @@ -44,7 +46,7 @@ def __init__( backend: TableBackend, table: Table, columns: TTableSchemaColumns, - 
chunk_size: int = 1000, + chunk_size: int = DEFAULT_CHUNK_SIZE, incremental: Optional[dlt.sources.incremental[Any]] = None, db_incremental_field_last_value: Optional[Any] = None, query_adapter_callback: Optional[TQueryAdapter] = None, @@ -302,7 +304,7 @@ class SqlTableResourceConfiguration(BaseConfiguration): table: Optional[str] = None schema: Optional[str] = None incremental: Optional[dlt.sources.incremental] = None # type: ignore[type-arg] - chunk_size: int = 50000 + chunk_size: int = DEFAULT_CHUNK_SIZE backend: TableBackend = "sqlalchemy" detect_precision_hints: Optional[bool] = None defer_table_reflect: Optional[bool] = False diff --git a/posthog/temporal/data_imports/pipelines/sql_database_v2/settings.py b/posthog/temporal/data_imports/pipelines/sql_database_v2/settings.py new file mode 100644 index 0000000000000..d730961c096e8 --- /dev/null +++ b/posthog/temporal/data_imports/pipelines/sql_database_v2/settings.py @@ -0,0 +1 @@ +DEFAULT_CHUNK_SIZE = 10_000 diff --git a/requirements.in b/requirements.in index faefd16d9294d..8c0b98d16587b 100644 --- a/requirements.in +++ b/requirements.in @@ -16,6 +16,7 @@ clickhouse-driver==0.2.7 clickhouse-pool==0.5.3 conditional-cache==1.2 cryptography==39.0.2 +deltalake==0.22.3 dj-database-url==0.5.0 Django~=4.2.15 django-axes==5.9.0 @@ -34,7 +35,6 @@ djangorestframework==3.15.1 djangorestframework-csv==2.1.1 djangorestframework-dataclasses==1.2.0 dlt==1.3.0 -dlt[deltalake]==1.3.0 dnspython==2.2.1 drf-exceptions-hog==0.4.0 drf-extensions==0.7.0 diff --git a/requirements.txt b/requirements.txt index 639c98066ccd4..e4a1521f8dd7a 100644 --- a/requirements.txt +++ b/requirements.txt @@ -138,8 +138,8 @@ defusedxml==0.6.0 # via # python3-openid # social-auth-core -deltalake==0.19.1 - # via dlt +deltalake==0.22.3 + # via -r requirements.in distro==1.9.0 # via openai dj-database-url==0.5.0 @@ -273,8 +273,6 @@ googleapis-common-protos==1.60.0 # via # google-api-core # grpcio-status -greenlet==3.1.1 - # via sqlalchemy grpcio==1.63.2 # via # -r requirements.in @@ -505,7 +503,6 @@ pyarrow==17.0.0 # via # -r requirements.in # deltalake - # dlt # sqlalchemy-bigquery pyasn1==0.5.0 # via
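Tying the experiment changes together: an ExperimentFunnelsQuery with stats_version=2 makes the query runner above call the v2 helpers instead of the original implementations. A minimal sketch of that path, feeding hand-built variant stats (hypothetical counts borrowed from the new tests) directly into the helpers rather than going through a real funnel run:

```python
# Minimal sketch of the stats_version=2 path, using hand-built variant stats
# (hypothetical counts) with the v2 helpers added in this diff.
from posthog.hogql_queries.experiments.funnels_statistics_v2 import (
    are_results_significant_v2,
    calculate_credible_intervals_v2,
    calculate_probabilities_v2,
)
from posthog.schema import ExperimentVariantFunnelsBaseStats

control = ExperimentVariantFunnelsBaseStats(key="control", success_count=1000, failure_count=9000)
test = ExperimentVariantFunnelsBaseStats(key="test", success_count=1500, failure_count=8500)

probabilities = calculate_probabilities_v2(control, [test])
significance_code, expected_loss = are_results_significant_v2(control, [test], probabilities)
credible_intervals = calculate_credible_intervals_v2([control, test])

print(probabilities)                       # roughly [0.0, 1.0] for this lopsided example
print(significance_code, expected_loss)    # SIGNIFICANT with near-zero expected loss
print(credible_intervals)                  # control around [0.095, 0.105], test around [0.145, 0.155]
```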