diff --git a/bin/mprocs.yaml b/bin/mprocs.yaml index cb3761250d035..82409edab38f9 100644 --- a/bin/mprocs.yaml +++ b/bin/mprocs.yaml @@ -19,8 +19,7 @@ procs: shell: 'bin/check_kafka_clickhouse_up && bin/check_temporal_up && python manage.py start_temporal_worker' docker-compose: - shell: 'docker compose -f docker-compose.dev.yml up' - stop: - send-keys: [''] + # docker-compose makes sure the stack is up, and then follows its logs - but doesn't tear down on exit for speed + shell: 'docker compose -f docker-compose.dev.yml up -d && docker compose -f docker-compose.dev.yml logs --tail=0 -f' mouse_scroll_speed: 1 diff --git a/bin/start b/bin/start index ceaddede2140f..7ec881e57b81c 100755 --- a/bin/start +++ b/bin/start @@ -9,4 +9,14 @@ export HOG_HOOK_URL=${HOG_HOOK_URL:-http://localhost:3300/hoghook} [ ! -f ./share/GeoLite2-City.mmdb ] && ( curl -L "https://mmdbcdn.posthog.net/" --http1.1 | brotli --decompress --output=./share/GeoLite2-City.mmdb ) +if ! command -v mprocs &> /dev/null; then + if command -v brew &> /dev/null; then + echo "🔁 Installing mprocs via Homebrew..." + brew install mprocs + else + echo "👉 To run bin/start, install mprocs: https://github.com/pvolok/mprocs#installation" + exit 1 + fi +fi + exec mprocs --config bin/mprocs.yaml diff --git a/cypress/e2e/insights-reload-query.ts b/cypress/e2e/insights-reload-query.ts index 2f944f8993da7..55685ae309fb8 100644 --- a/cypress/e2e/insights-reload-query.ts +++ b/cypress/e2e/insights-reload-query.ts @@ -1,5 +1,3 @@ -import JSONCrush from 'jsoncrush' - describe('ReloadInsight component', () => { beforeEach(() => { // Clear local storage before each test to ensure a clean state @@ -21,8 +19,7 @@ describe('ReloadInsight component', () => { const draftQuery = window.localStorage.getItem(`draft-query-${currentTeamId}`) expect(draftQuery).to.not.be.null - const draftQueryObjUncrushed = JSONCrush.uncrush(draftQuery) - const draftQueryObj = JSON.parse(draftQueryObjUncrushed) + const draftQueryObj = JSON.parse(draftQuery) expect(draftQueryObj).to.have.property('query') diff --git a/ee/hogai/eval/conftest.py b/ee/hogai/eval/conftest.py index d0bc75348eeac..1a88ebffa2e33 100644 --- a/ee/hogai/eval/conftest.py +++ b/ee/hogai/eval/conftest.py @@ -1,28 +1,107 @@ +import functools +from collections.abc import Generator +from pathlib import Path + import pytest +from django.conf import settings +from django.test import override_settings +from langchain_core.runnables import RunnableConfig + +from ee.models import Conversation +from posthog.demo.matrix.manager import MatrixManager +from posthog.models import Organization, Project, Team, User +from posthog.tasks.demo_create_data import HedgeboxMatrix +from posthog.test.base import BaseTest + + +# Flaky is a handy tool, but it always runs setup fixtures for retries. +# This decorator will just retry without re-running setup. +def retry_test_only(max_retries=3): + def decorator(func): + @functools.wraps(func) + def wrapper(*args, **kwargs): + last_error: Exception | None = None + for attempt in range(max_retries): + try: + return func(*args, **kwargs) + except Exception as e: + last_error = e + print(f"\nRetrying test (attempt {attempt + 1}/{max_retries})...") # noqa + if last_error: + raise last_error + + return wrapper + + return decorator + + +# Apply decorators to all tests in the package. 
+def pytest_collection_modifyitems(items): + current_dir = Path(__file__).parent + for item in items: + if Path(item.fspath).is_relative_to(current_dir): + item.add_marker( + pytest.mark.skipif(not settings.IN_EVAL_TESTING, reason="Only runs for the assistant evaluation") + ) + # Apply our custom retry decorator to the test function + item.obj = retry_test_only(max_retries=3)(item.obj) + + +@pytest.fixture(scope="package") +def team(django_db_blocker) -> Generator[Team, None, None]: + with django_db_blocker.unblock(): + organization = Organization.objects.create(name=BaseTest.CONFIG_ORGANIZATION_NAME) + project = Project.objects.create(id=Team.objects.increment_id_sequence(), organization=organization) + team = Team.objects.create( + id=project.id, + project=project, + organization=organization, + test_account_filters=[ + { + "key": "email", + "value": "@posthog.com", + "operator": "not_icontains", + "type": "person", + } + ], + has_completed_onboarding_for={"product_analytics": True}, + ) + yield team + organization.delete() -from posthog.test.base import run_clickhouse_statement_in_parallel +@pytest.fixture(scope="package") +def user(team, django_db_blocker) -> Generator[User, None, None]: + with django_db_blocker.unblock(): + user = User.objects.create_and_join(team.organization, "eval@posthog.com", "password1234") + yield user + user.delete() -@pytest.fixture(scope="module", autouse=True) -def setup_kafka_tables(django_db_setup): - from posthog.clickhouse.client import sync_execute - from posthog.clickhouse.schema import ( - CREATE_KAFKA_TABLE_QUERIES, - build_query, - ) - from posthog.settings import CLICKHOUSE_CLUSTER, CLICKHOUSE_DATABASE - kafka_queries = list(map(build_query, CREATE_KAFKA_TABLE_QUERIES)) - run_clickhouse_statement_in_parallel(kafka_queries) +@pytest.mark.django_db(transaction=True) +@pytest.fixture +def runnable_config(team, user) -> Generator[RunnableConfig, None, None]: + conversation = Conversation.objects.create(team=team, user=user) + yield { + "configurable": { + "thread_id": conversation.id, + } + } + conversation.delete() - yield - kafka_tables = sync_execute( - f""" - SELECT name - FROM system.tables - WHERE database = '{CLICKHOUSE_DATABASE}' AND name LIKE 'kafka_%' - """, - ) - kafka_truncate_queries = [f"DROP TABLE {table[0]} ON CLUSTER '{CLICKHOUSE_CLUSTER}'" for table in kafka_tables] - run_clickhouse_statement_in_parallel(kafka_truncate_queries) +@pytest.fixture(scope="package", autouse=True) +def setup_test_data(django_db_setup, team, user, django_db_blocker): + with django_db_blocker.unblock(): + matrix = HedgeboxMatrix( + seed="b1ef3c66-5f43-488a-98be-6b46d92fbcef", # this seed generates all events + days_past=120, + days_future=30, + n_clusters=500, + group_type_index_offset=0, + ) + matrix_manager = MatrixManager(matrix, print_steps=True) + with override_settings(TEST=False): + # Simulation saving should occur in non-test mode, so that Kafka isn't mocked. 
Normally in tests we don't + # want to ingest via Kafka, but simulation saving is specifically designed to use that route for speed + matrix_manager.run_on_team(team, user) diff --git a/ee/hogai/eval/tests/test_eval_funnel_generator.py b/ee/hogai/eval/tests/test_eval_funnel_generator.py index 4d7876ca6f73c..5f0f29243296a 100644 --- a/ee/hogai/eval/tests/test_eval_funnel_generator.py +++ b/ee/hogai/eval/tests/test_eval_funnel_generator.py @@ -1,40 +1,46 @@ +from collections.abc import Callable from typing import cast +import pytest from langgraph.graph.state import CompiledStateGraph from ee.hogai.assistant import AssistantGraph -from ee.hogai.eval.utils import EvalBaseTest from ee.hogai.utils.types import AssistantNodeName, AssistantState from posthog.schema import AssistantFunnelsQuery, HumanMessage, VisualizationMessage -class TestEvalFunnelGenerator(EvalBaseTest): - def _call_node(self, query: str, plan: str) -> AssistantFunnelsQuery: - graph: CompiledStateGraph = ( - AssistantGraph(self.team) - .add_edge(AssistantNodeName.START, AssistantNodeName.FUNNEL_GENERATOR) - .add_funnel_generator(AssistantNodeName.END) - .compile() - ) +@pytest.fixture +def call_node(team, runnable_config) -> Callable[[str, str], AssistantFunnelsQuery]: + graph: CompiledStateGraph = ( + AssistantGraph(team) + .add_edge(AssistantNodeName.START, AssistantNodeName.FUNNEL_GENERATOR) + .add_funnel_generator(AssistantNodeName.END) + .compile() + ) + + def callable(query: str, plan: str) -> AssistantFunnelsQuery: state = graph.invoke( AssistantState(messages=[HumanMessage(content=query)], plan=plan), - self._get_config(), + runnable_config, ) return cast(VisualizationMessage, AssistantState.model_validate(state).messages[-1]).answer - def test_node_replaces_equals_with_contains(self): - query = "what is the conversion rate from a page view to sign up for users with name John?" - plan = """Sequence: - 1. $pageview - - property filter 1 - - person - - name - - equals - - John - 2. signed_up - """ - actual_output = self._call_node(query, plan).model_dump_json(exclude_none=True) - assert "exact" not in actual_output - assert "icontains" in actual_output - assert "John" not in actual_output - assert "john" in actual_output + return callable + + +def test_node_replaces_equals_with_contains(call_node): + query = "what is the conversion rate from a page view to sign up for users with name John?" + plan = """Sequence: + 1. $pageview + - property filter 1 + - person + - name + - equals + - John + 2. 
signed_up + """ + actual_output = call_node(query, plan).model_dump_json(exclude_none=True) + assert "exact" not in actual_output + assert "icontains" in actual_output + assert "John" not in actual_output + assert "john" in actual_output diff --git a/ee/hogai/eval/tests/test_eval_funnel_planner.py b/ee/hogai/eval/tests/test_eval_funnel_planner.py index 9adbd75e77c6c..c8bc25bc0b5dc 100644 --- a/ee/hogai/eval/tests/test_eval_funnel_planner.py +++ b/ee/hogai/eval/tests/test_eval_funnel_planner.py @@ -1,208 +1,224 @@ +from collections.abc import Callable + +import pytest from deepeval import assert_test from deepeval.metrics import GEval from deepeval.test_case import LLMTestCase, LLMTestCaseParams +from langchain_core.runnables.config import RunnableConfig from langgraph.graph.state import CompiledStateGraph from ee.hogai.assistant import AssistantGraph -from ee.hogai.eval.utils import EvalBaseTest from ee.hogai.utils.types import AssistantNodeName, AssistantState from posthog.schema import HumanMessage -class TestEvalFunnelPlanner(EvalBaseTest): - def _get_plan_correctness_metric(self): - return GEval( - name="Funnel Plan Correctness", - criteria="You will be given expected and actual generated plans to provide a taxonomy to answer a user's question with a funnel insight. Compare the plans to determine whether the taxonomy of the actual plan matches the expected plan. Do not apply general knowledge about funnel insights.", - evaluation_steps=[ - "A plan must define at least two series in the sequence, but it is not required to define any filters, exclusion steps, or a breakdown.", - "Compare events, properties, math types, and property values of 'expected output' and 'actual output'. Do not penalize if the actual output does not include a timeframe.", - "Check if the combination of events, properties, and property values in 'actual output' can answer the user's question according to the 'expected output'.", - # The criteria for aggregations must be more specific because there isn't a way to bypass them. - "Check if the math types in 'actual output' match those in 'expected output.' If the aggregation type is specified by a property, user, or group in 'expected output', the same property, user, or group must be used in 'actual output'.", - "If 'expected output' contains exclusion steps, check if 'actual output' contains those, and heavily penalize if the exclusion steps are not present or different.", - "If 'expected output' contains a breakdown, check if 'actual output' contains a similar breakdown, and heavily penalize if the breakdown is not present or different. Plans may only have one breakdown.", - # We don't want to see in the output unnecessary property filters. The assistant tries to use them all the time. - "Heavily penalize if the 'actual output' contains any excessive output not present in the 'expected output'. For example, the `is set` operator in filters should not be used unless the user explicitly asks for it.", - ], - evaluation_params=[ - LLMTestCaseParams.INPUT, - LLMTestCaseParams.EXPECTED_OUTPUT, - LLMTestCaseParams.ACTUAL_OUTPUT, - ], - threshold=0.7, - ) +@pytest.fixture(scope="module") +def metric(): + return GEval( + name="Funnel Plan Correctness", + criteria="You will be given expected and actual generated plans to provide a taxonomy to answer a user's question with a funnel insight. Compare the plans to determine whether the taxonomy of the actual plan matches the expected plan. 
Do not apply general knowledge about funnel insights.", + evaluation_steps=[ + "A plan must define at least two series in the sequence, but it is not required to define any filters, exclusion steps, or a breakdown.", + "Compare events, properties, math types, and property values of 'expected output' and 'actual output'. Do not penalize if the actual output does not include a timeframe.", + "Check if the combination of events, properties, and property values in 'actual output' can answer the user's question according to the 'expected output'.", + # The criteria for aggregations must be more specific because there isn't a way to bypass them. + "Check if the math types in 'actual output' match those in 'expected output.' If the aggregation type is specified by a property, user, or group in 'expected output', the same property, user, or group must be used in 'actual output'.", + "If 'expected output' contains exclusion steps, check if 'actual output' contains those, and heavily penalize if the exclusion steps are not present or different.", + "If 'expected output' contains a breakdown, check if 'actual output' contains a similar breakdown, and heavily penalize if the breakdown is not present or different. Plans may only have one breakdown.", + # We don't want to see in the output unnecessary property filters. The assistant tries to use them all the time. + "Heavily penalize if the 'actual output' contains any excessive output not present in the 'expected output'. For example, the `is set` operator in filters should not be used unless the user explicitly asks for it.", + ], + evaluation_params=[ + LLMTestCaseParams.INPUT, + LLMTestCaseParams.EXPECTED_OUTPUT, + LLMTestCaseParams.ACTUAL_OUTPUT, + ], + threshold=0.7, + ) - def _call_node(self, query): - graph: CompiledStateGraph = ( - AssistantGraph(self.team) - .add_edge(AssistantNodeName.START, AssistantNodeName.FUNNEL_PLANNER) - .add_funnel_planner(AssistantNodeName.END) - .compile() - ) + +@pytest.fixture +def call_node(team, runnable_config: RunnableConfig) -> Callable[[str], str]: + graph: CompiledStateGraph = ( + AssistantGraph(team) + .add_edge(AssistantNodeName.START, AssistantNodeName.FUNNEL_PLANNER) + .add_funnel_planner(AssistantNodeName.END) + .compile() + ) + + def callable(query: str) -> str: state = graph.invoke( AssistantState(messages=[HumanMessage(content=query)]), - self._get_config(), + runnable_config, ) return AssistantState.model_validate(state).plan or "" - def test_basic_funnel(self): - query = "what was the conversion from a page view to sign up?" - test_case = LLMTestCase( - input=query, - expected_output=""" - Sequence: - 1. $pageview - 2. signed_up - """, - actual_output=self._call_node(query), - ) - assert_test(test_case, [self._get_plan_correctness_metric()]) - - def test_outputs_at_least_two_events(self): - """ - Ambigious query. The funnel must return at least two events. - """ - query = "how many users paid a bill?" - test_case = LLMTestCase( - input=query, - expected_output=""" - Sequence: - 1. any event - 2. upgrade_plan - """, - actual_output=self._call_node(query), - ) - assert_test(test_case, [self._get_plan_correctness_metric()]) - - def test_no_excessive_property_filters(self): - query = "Show the user conversion from a sign up to a file download" - test_case = LLMTestCase( - input=query, - expected_output=""" - Sequence: - 1. signed_up - 2. 
downloaded_file - """, - actual_output=self._call_node(query), - ) - assert_test(test_case, [self._get_plan_correctness_metric()]) + return callable - def test_basic_filtering(self): - query = ( - "What was the conversion from uploading a file to downloading it from Chrome and Safari in the last 30d?" - ) - test_case = LLMTestCase( - input=query, - expected_output=""" - Sequence: - 1. uploaded_file - - property filter 1: - - entity: event - - property name: $browser - - property type: String - - operator: equals - - property value: Chrome - - property filter 2: - - entity: event - - property name: $browser - - property type: String - - operator: equals - - property value: Safari - 2. downloaded_file - - property filter 1: - - entity: event - - property name: $browser - - property type: String - - operator: equals - - property value: Chrome - - property filter 2: - - entity: event - - property name: $browser - - property type: String - - operator: equals - - property value: Safari - """, - actual_output=self._call_node(query), - ) - assert_test(test_case, [self._get_plan_correctness_metric()]) - - def test_exclusion_steps(self): - query = "What was the conversion from uploading a file to downloading it in the last 30d excluding users that deleted a file?" - test_case = LLMTestCase( - input=query, - expected_output=""" - Sequence: - 1. uploaded_file - 2. downloaded_file - - Exclusions: - - deleted_file - - start index: 0 - - end index: 1 - """, - actual_output=self._call_node(query), - ) - assert_test(test_case, [self._get_plan_correctness_metric()]) - - def test_breakdown(self): - query = "Show a conversion from uploading a file to downloading it segmented by a browser" - test_case = LLMTestCase( - input=query, - expected_output=""" - Sequence: - 1. uploaded_file - 2. downloaded_file - - Breakdown by: - - entity: event - - property name: $browser - """, - actual_output=self._call_node(query), - ) - assert_test(test_case, [self._get_plan_correctness_metric()]) - - def test_needle_in_a_haystack(self): - query = "What was the conversion from a sign up to a paying customer on the personal-pro plan?" - test_case = LLMTestCase( - input=query, - expected_output=""" - Sequence: - 1. signed_up - 2. paid_bill - - property filter 1: - - entity: event - - property name: plan - - property type: String - - operator: equals - - property value: personal/pro - """, - actual_output=self._call_node(query), - ) - assert_test(test_case, [self._get_plan_correctness_metric()]) - - def test_planner_outputs_multiple_series_from_a_single_series_question(self): - query = "What's our sign-up funnel?" - test_case = LLMTestCase( - input=query, - expected_output=""" - Sequence: - 1. $pageview - 2. signed_up - """, - actual_output=self._call_node(query), - ) - assert_test(test_case, [self._get_plan_correctness_metric()]) - - def test_funnel_does_not_include_timeframe(self): - query = "what was the conversion from a page view to sign up for event time before 2024-01-01?" - test_case = LLMTestCase( - input=query, - expected_output=""" - Sequence: - 1. $pageview - 2. signed_up - """, - actual_output=self._call_node(query), - ) - assert_test(test_case, [self._get_plan_correctness_metric()]) + +def test_basic_funnel(metric, call_node): + query = "what was the conversion from a page view to sign up?" + test_case = LLMTestCase( + input=query, + expected_output=""" + Sequence: + 1. $pageview + 2. 
signed_up + """, + actual_output=call_node(query), + ) + assert_test(test_case, [metric]) + + +def test_outputs_at_least_two_events(metric, call_node): + """ + Ambigious query. The funnel must return at least two events. + """ + query = "how many users paid a bill?" + test_case = LLMTestCase( + input=query, + expected_output=""" + Sequence: + 1. any event + 2. upgrade_plan + """, + actual_output=call_node(query), + ) + assert_test(test_case, [metric]) + + +def test_no_excessive_property_filters(metric, call_node): + query = "Show the user conversion from a sign up to a file download" + test_case = LLMTestCase( + input=query, + expected_output=""" + Sequence: + 1. signed_up + 2. downloaded_file + """, + actual_output=call_node(query), + ) + assert_test(test_case, [metric]) + + +def test_basic_filtering(metric, call_node): + query = "What was the conversion from uploading a file to downloading it from Chrome and Safari in the last 30d?" + test_case = LLMTestCase( + input=query, + expected_output=""" + Sequence: + 1. uploaded_file + - property filter 1: + - entity: event + - property name: $browser + - property type: String + - operator: equals + - property value: Chrome + - property filter 2: + - entity: event + - property name: $browser + - property type: String + - operator: equals + - property value: Safari + 2. downloaded_file + - property filter 1: + - entity: event + - property name: $browser + - property type: String + - operator: equals + - property value: Chrome + - property filter 2: + - entity: event + - property name: $browser + - property type: String + - operator: equals + - property value: Safari + """, + actual_output=call_node(query), + ) + assert_test(test_case, [metric]) + + +def test_exclusion_steps(metric, call_node): + query = "What was the conversion from uploading a file to downloading it in the last 30d excluding users that deleted a file?" + test_case = LLMTestCase( + input=query, + expected_output=""" + Sequence: + 1. uploaded_file + 2. downloaded_file + + Exclusions: + - deleted_file + - start index: 0 + - end index: 1 + """, + actual_output=call_node(query), + ) + assert_test(test_case, [metric]) + + +def test_breakdown(metric, call_node): + query = "Show a conversion from uploading a file to downloading it segmented by a browser" + test_case = LLMTestCase( + input=query, + expected_output=""" + Sequence: + 1. uploaded_file + 2. downloaded_file + + Breakdown by: + - entity: event + - property name: $browser + """, + actual_output=call_node(query), + ) + assert_test(test_case, [metric]) + + +def test_needle_in_a_haystack(metric, call_node): + query = "What was the conversion from a sign up to a paying customer on the personal-pro plan?" + test_case = LLMTestCase( + input=query, + expected_output=""" + Sequence: + 1. signed_up + 2. paid_bill + - property filter 1: + - entity: event + - property name: plan + - property type: String + - operator: equals + - property value: personal/pro + """, + actual_output=call_node(query), + ) + assert_test(test_case, [metric]) + + +def test_planner_outputs_multiple_series_from_a_single_series_question(metric, call_node): + query = "What's our sign-up funnel?" + test_case = LLMTestCase( + input=query, + expected_output=""" + Sequence: + 1. $pageview + 2. signed_up + """, + actual_output=call_node(query), + ) + assert_test(test_case, [metric]) + + +def test_funnel_does_not_include_timeframe(metric, call_node): + query = "what was the conversion from a page view to sign up for event time before 2024-01-01?" 
+ test_case = LLMTestCase( + input=query, + expected_output=""" + Sequence: + 1. $pageview + 2. signed_up + """, + actual_output=call_node(query), + ) + assert_test(test_case, [metric]) diff --git a/ee/hogai/eval/tests/test_eval_router.py b/ee/hogai/eval/tests/test_eval_router.py index c1307e9d40f00..84e5c4c809972 100644 --- a/ee/hogai/eval/tests/test_eval_router.py +++ b/ee/hogai/eval/tests/test_eval_router.py @@ -1,69 +1,80 @@ +from collections.abc import Callable from typing import cast +import pytest from langgraph.graph.state import CompiledStateGraph from ee.hogai.assistant import AssistantGraph -from ee.hogai.eval.utils import EvalBaseTest from ee.hogai.utils.types import AssistantNodeName, AssistantState from posthog.schema import HumanMessage, RouterMessage -class TestEvalRouter(EvalBaseTest): - def _call_node(self, query: str | list): - graph: CompiledStateGraph = ( - AssistantGraph(self.team) - .add_start() - .add_router(path_map={"trends": AssistantNodeName.END, "funnel": AssistantNodeName.END}) - .compile() - ) +@pytest.fixture +def call_node(team, runnable_config) -> Callable[[str | list], str]: + graph: CompiledStateGraph = ( + AssistantGraph(team) + .add_start() + .add_router(path_map={"trends": AssistantNodeName.END, "funnel": AssistantNodeName.END}) + .compile() + ) + + def callable(query: str | list) -> str: messages = [HumanMessage(content=query)] if isinstance(query, str) else query state = graph.invoke( AssistantState(messages=messages), - self._get_config(), + runnable_config, ) return cast(RouterMessage, AssistantState.model_validate(state).messages[-1]).content - def test_outputs_basic_trends_insight(self): - query = "Show the $pageview trend" - res = self._call_node(query) - self.assertEqual(res, "trends") - - def test_outputs_basic_funnel_insight(self): - query = "What is the conversion rate of users who uploaded a file to users who paid for a plan?" - res = self._call_node(query) - self.assertEqual(res, "funnel") - - def test_converts_trends_to_funnel(self): - conversation = [ - HumanMessage(content="Show trends of $pageview and $identify"), - RouterMessage(content="trends"), - HumanMessage(content="Convert this insight to a funnel"), - ] - res = self._call_node(conversation[:1]) - self.assertEqual(res, "trends") - res = self._call_node(conversation) - self.assertEqual(res, "funnel") - - def test_converts_funnel_to_trends(self): - conversation = [ - HumanMessage(content="What is the conversion from a page view to a sign up?"), - RouterMessage(content="funnel"), - HumanMessage(content="Convert this insight to a trends"), - ] - res = self._call_node(conversation[:1]) - self.assertEqual(res, "funnel") - res = self._call_node(conversation) - self.assertEqual(res, "trends") - - def test_outputs_single_trends_insight(self): - """ - Must display a trends insight because it's not possible to build a funnel with a single series. - """ - query = "how many users upgraded their plan to personal pro?" - res = self._call_node(query) - self.assertEqual(res, "trends") - - def test_classifies_funnel_with_single_series(self): - query = "What's our sign-up funnel?" - res = self._call_node(query) - self.assertEqual(res, "funnel") + return callable + + +def test_outputs_basic_trends_insight(call_node): + query = "Show the $pageview trend" + res = call_node(query) + assert res == "trends" + + +def test_outputs_basic_funnel_insight(call_node): + query = "What is the conversion rate of users who uploaded a file to users who paid for a plan?" 
+ res = call_node(query) + assert res == "funnel" + + +def test_converts_trends_to_funnel(call_node): + conversation = [ + HumanMessage(content="Show trends of $pageview and $identify"), + RouterMessage(content="trends"), + HumanMessage(content="Convert this insight to a funnel"), + ] + res = call_node(conversation[:1]) + assert res == "trends" + res = call_node(conversation) + assert res == "funnel" + + +def test_converts_funnel_to_trends(call_node): + conversation = [ + HumanMessage(content="What is the conversion from a page view to a sign up?"), + RouterMessage(content="funnel"), + HumanMessage(content="Convert this insight to a trends"), + ] + res = call_node(conversation[:1]) + assert res == "funnel" + res = call_node(conversation) + assert res == "trends" + + +def test_outputs_single_trends_insight(call_node): + """ + Must display a trends insight because it's not possible to build a funnel with a single series. + """ + query = "how many users upgraded their plan to personal pro?" + res = call_node(query) + assert res == "trends" + + +def test_classifies_funnel_with_single_series(call_node): + query = "What's our sign-up funnel?" + res = call_node(query) + assert res == "funnel" diff --git a/ee/hogai/eval/tests/test_eval_trends_generator.py b/ee/hogai/eval/tests/test_eval_trends_generator.py index 496bbf0100b51..c8491957c868f 100644 --- a/ee/hogai/eval/tests/test_eval_trends_generator.py +++ b/ee/hogai/eval/tests/test_eval_trends_generator.py @@ -1,58 +1,65 @@ +from collections.abc import Callable from typing import cast +import pytest from langgraph.graph.state import CompiledStateGraph from ee.hogai.assistant import AssistantGraph -from ee.hogai.eval.utils import EvalBaseTest from ee.hogai.utils.types import AssistantNodeName, AssistantState from posthog.schema import AssistantTrendsQuery, HumanMessage, VisualizationMessage -class TestEvalTrendsGenerator(EvalBaseTest): - def _call_node(self, query: str, plan: str) -> AssistantTrendsQuery: - graph: CompiledStateGraph = ( - AssistantGraph(self.team) - .add_edge(AssistantNodeName.START, AssistantNodeName.TRENDS_GENERATOR) - .add_trends_generator(AssistantNodeName.END) - .compile() - ) +@pytest.fixture +def call_node(team, runnable_config) -> Callable[[str, str], AssistantTrendsQuery]: + graph: CompiledStateGraph = ( + AssistantGraph(team) + .add_edge(AssistantNodeName.START, AssistantNodeName.TRENDS_GENERATOR) + .add_trends_generator(AssistantNodeName.END) + .compile() + ) + + def callable(query: str, plan: str) -> AssistantTrendsQuery: state = graph.invoke( AssistantState(messages=[HumanMessage(content=query)], plan=plan), - self._get_config(), + runnable_config, ) return cast(VisualizationMessage, AssistantState.model_validate(state).messages[-1]).answer - def test_node_replaces_equals_with_contains(self): - query = "what is pageview trend for users with name John?" - plan = """Events: - - $pageview - - math operation: total count - - property filter 1 - - person - - name - - equals - - John - """ - actual_output = self._call_node(query, plan).model_dump_json(exclude_none=True) - assert "exact" not in actual_output - assert "icontains" in actual_output - assert "John" not in actual_output - assert "john" in actual_output - - def test_node_leans_towards_line_graph(self): - query = "How often do users download files?" 
- # We ideally want to consider both total count of downloads per period, as well as how often a median user downloads - plan = """Events: - - downloaded_file - - math operation: total count - - downloaded_file - - math operation: median count per user - """ - actual_output = self._call_node(query, plan) - assert actual_output.trendsFilter.display == "ActionsLineGraph" - assert actual_output.series[0].kind == "EventsNode" - assert actual_output.series[0].event == "downloaded_file" - assert actual_output.series[0].math == "total" - assert actual_output.series[1].kind == "EventsNode" - assert actual_output.series[1].event == "downloaded_file" - assert actual_output.series[1].math == "median_count_per_actor" + return callable + + +def test_node_replaces_equals_with_contains(call_node): + query = "what is pageview trend for users with name John?" + plan = """Events: + - $pageview + - math operation: total count + - property filter 1 + - person + - name + - equals + - John + """ + actual_output = call_node(query, plan).model_dump_json(exclude_none=True) + assert "exact" not in actual_output + assert "icontains" in actual_output + assert "John" not in actual_output + assert "john" in actual_output + + +def test_node_leans_towards_line_graph(call_node): + query = "How often do users download files?" + # We ideally want to consider both total count of downloads per period, as well as how often a median user downloads + plan = """Events: + - downloaded_file + - math operation: total count + - downloaded_file + - math operation: median count per user + """ + actual_output = call_node(query, plan) + assert actual_output.trendsFilter.display == "ActionsLineGraph" + assert actual_output.series[0].kind == "EventsNode" + assert actual_output.series[0].event == "downloaded_file" + assert actual_output.series[0].math == "total" + assert actual_output.series[1].kind == "EventsNode" + assert actual_output.series[1].event == "downloaded_file" + assert actual_output.series[1].math == "median_count_per_actor" diff --git a/ee/hogai/eval/tests/test_eval_trends_planner.py b/ee/hogai/eval/tests/test_eval_trends_planner.py index d4fbff456a91c..4d4ea4c41dfbf 100644 --- a/ee/hogai/eval/tests/test_eval_trends_planner.py +++ b/ee/hogai/eval/tests/test_eval_trends_planner.py @@ -1,179 +1,196 @@ +from collections.abc import Callable + +import pytest from deepeval import assert_test from deepeval.metrics import GEval from deepeval.test_case import LLMTestCase, LLMTestCaseParams +from langchain_core.runnables.config import RunnableConfig from langgraph.graph.state import CompiledStateGraph from ee.hogai.assistant import AssistantGraph -from ee.hogai.eval.utils import EvalBaseTest from ee.hogai.utils.types import AssistantNodeName, AssistantState from posthog.schema import HumanMessage -class TestEvalTrendsPlanner(EvalBaseTest): - def _get_plan_correctness_metric(self): - return GEval( - name="Trends Plan Correctness", - criteria="You will be given expected and actual generated plans to provide a taxonomy to answer a user's question with a trends insight. Compare the plans to determine whether the taxonomy of the actual plan matches the expected plan. Do not apply general knowledge about trends insights.", - evaluation_steps=[ - "A plan must define at least one event and a math type, but it is not required to define any filters, breakdowns, or formulas.", - "Compare events, properties, math types, and property values of 'expected output' and 'actual output'. 
Do not penalize if the actual output does not include a timeframe.", - "Check if the combination of events, properties, and property values in 'actual output' can answer the user's question according to the 'expected output'.", - # The criteria for aggregations must be more specific because there isn't a way to bypass them. - "Check if the math types in 'actual output' match those in 'expected output'. Math types sometimes are interchangeable, so use your judgement. If the aggregation type is specified by a property, user, or group in 'expected output', the same property, user, or group must be used in 'actual output'.", - "If 'expected output' contains a breakdown, check if 'actual output' contains a similar breakdown, and heavily penalize if the breakdown is not present or different.", - "If 'expected output' contains a formula, check if 'actual output' contains a similar formula, and heavily penalize if the formula is not present or different.", - # We don't want to see in the output unnecessary property filters. The assistant tries to use them all the time. - "Heavily penalize if the 'actual output' contains any excessive output not present in the 'expected output'. For example, the `is set` operator in filters should not be used unless the user explicitly asks for it.", - ], - evaluation_params=[ - LLMTestCaseParams.INPUT, - LLMTestCaseParams.EXPECTED_OUTPUT, - LLMTestCaseParams.ACTUAL_OUTPUT, - ], - threshold=0.7, - ) +@pytest.fixture(scope="module") +def metric(): + return GEval( + name="Trends Plan Correctness", + criteria="You will be given expected and actual generated plans to provide a taxonomy to answer a user's question with a trends insight. Compare the plans to determine whether the taxonomy of the actual plan matches the expected plan. Do not apply general knowledge about trends insights.", + evaluation_steps=[ + "A plan must define at least one event and a math type, but it is not required to define any filters, breakdowns, or formulas.", + "Compare events, properties, math types, and property values of 'expected output' and 'actual output'. Do not penalize if the actual output does not include a timeframe.", + "Check if the combination of events, properties, and property values in 'actual output' can answer the user's question according to the 'expected output'.", + # The criteria for aggregations must be more specific because there isn't a way to bypass them. + "Check if the math types in 'actual output' match those in 'expected output'. Math types sometimes are interchangeable, so use your judgement. If the aggregation type is specified by a property, user, or group in 'expected output', the same property, user, or group must be used in 'actual output'.", + "If 'expected output' contains a breakdown, check if 'actual output' contains a similar breakdown, and heavily penalize if the breakdown is not present or different.", + "If 'expected output' contains a formula, check if 'actual output' contains a similar formula, and heavily penalize if the formula is not present or different.", + # We don't want to see in the output unnecessary property filters. The assistant tries to use them all the time. + "Heavily penalize if the 'actual output' contains any excessive output not present in the 'expected output'. 
For example, the `is set` operator in filters should not be used unless the user explicitly asks for it.", + ], + evaluation_params=[ + LLMTestCaseParams.INPUT, + LLMTestCaseParams.EXPECTED_OUTPUT, + LLMTestCaseParams.ACTUAL_OUTPUT, + ], + threshold=0.7, + ) - def _call_node(self, query): - graph: CompiledStateGraph = ( - AssistantGraph(self.team) - .add_edge(AssistantNodeName.START, AssistantNodeName.TRENDS_PLANNER) - .add_trends_planner(AssistantNodeName.END) - .compile() - ) + +@pytest.fixture +def call_node(team, runnable_config: RunnableConfig) -> Callable[[str], str]: + graph: CompiledStateGraph = ( + AssistantGraph(team) + .add_edge(AssistantNodeName.START, AssistantNodeName.TRENDS_PLANNER) + .add_trends_planner(AssistantNodeName.END) + .compile() + ) + + def callable(query: str) -> str: state = graph.invoke( AssistantState(messages=[HumanMessage(content=query)]), - self._get_config(), + runnable_config, ) return AssistantState.model_validate(state).plan or "" - def test_no_excessive_property_filters(self): - query = "Show the $pageview trend" - test_case = LLMTestCase( - input=query, - expected_output=""" - Events: - - $pageview - - math operation: total count - """, - actual_output=self._call_node(query), - ) - assert_test(test_case, [self._get_plan_correctness_metric()]) - - def test_no_excessive_property_filters_for_a_defined_math_type(self): - query = "What is the MAU?" - test_case = LLMTestCase( - input=query, - expected_output=""" - Events: - - $pageview - - math operation: unique users - """, - actual_output=self._call_node(query), - ) - assert_test(test_case, [self._get_plan_correctness_metric()]) - - def test_basic_filtering(self): - query = "can you compare how many Chrome vs Safari users uploaded a file in the last 30d?" - test_case = LLMTestCase( - input=query, - expected_output=""" - Events: - - uploaded_file - - math operation: total count - - property filter 1: - - entity: event - - property name: $browser - - property type: String - - operator: equals - - property value: Chrome - - property filter 2: - - entity: event - - property name: $browser - - property type: String - - operator: equals - - property value: Safari - - Breakdown by: - - breakdown 1: + return callable + + +def test_no_excessive_property_filters(metric, call_node): + query = "Show the $pageview trend" + test_case = LLMTestCase( + input=query, + expected_output=""" + Events: + - $pageview + - math operation: total count + """, + actual_output=call_node(query), + ) + assert_test(test_case, [metric]) + + +def test_no_excessive_property_filters_for_a_defined_math_type(metric, call_node): + query = "What is the MAU?" + test_case = LLMTestCase( + input=query, + expected_output=""" + Events: + - $pageview + - math operation: unique users + """, + actual_output=call_node(query), + ) + assert_test(test_case, [metric]) + + +def test_basic_filtering(metric, call_node): + query = "can you compare how many Chrome vs Safari users uploaded a file in the last 30d?" 
+ test_case = LLMTestCase( + input=query, + expected_output=""" + Events: + - uploaded_file + - math operation: total count + - property filter 1: - entity: event - property name: $browser - """, - actual_output=self._call_node(query), - ) - assert_test(test_case, [self._get_plan_correctness_metric()]) - - def test_formula_mode(self): - query = "i want to see a ratio of identify divided by page views" - test_case = LLMTestCase( - input=query, - expected_output=""" - Events: - - $identify - - math operation: total count - - $pageview - - math operation: total count - - Formula: - `A/B`, where `A` is the total count of `$identify` and `B` is the total count of `$pageview` - """, - actual_output=self._call_node(query), - ) - assert_test(test_case, [self._get_plan_correctness_metric()]) - - def test_math_type_by_a_property(self): - query = "what is the average session duration?" - test_case = LLMTestCase( - input=query, - expected_output=""" - Events: - - All Events - - math operation: average by `$session_duration` - """, - actual_output=self._call_node(query), - ) - assert_test(test_case, [self._get_plan_correctness_metric()]) - - def test_math_type_by_a_user(self): - query = "What is the median page view count for a user?" - test_case = LLMTestCase( - input=query, - expected_output=""" - Events: - - $pageview - - math operation: median by users - """, - actual_output=self._call_node(query), - ) - assert_test(test_case, [self._get_plan_correctness_metric()]) - - def test_needle_in_a_haystack(self): - query = "How frequently do people pay for a personal-pro plan?" - test_case = LLMTestCase( - input=query, - expected_output=""" - Events: - - paid_bill - - math operation: total count - - property filter 1: - - entity: event - - property name: plan - - property type: String - - operator: contains - - property value: personal/pro - """, - actual_output=self._call_node(query), - ) - assert_test(test_case, [self._get_plan_correctness_metric()]) - - def test_funnel_does_not_include_timeframe(self): - query = "what is the pageview trend for event time before 2024-01-01?" - test_case = LLMTestCase( - input=query, - expected_output=""" - Events: - - $pageview - - math operation: total count - """, - actual_output=self._call_node(query), - ) - assert_test(test_case, [self._get_plan_correctness_metric()]) + - property type: String + - operator: equals + - property value: Chrome + - property filter 2: + - entity: event + - property name: $browser + - property type: String + - operator: equals + - property value: Safari + + Breakdown by: + - breakdown 1: + - entity: event + - property name: $browser + """, + actual_output=call_node(query), + ) + assert_test(test_case, [metric]) + + +def test_formula_mode(metric, call_node): + query = "i want to see a ratio of identify divided by page views" + test_case = LLMTestCase( + input=query, + expected_output=""" + Events: + - $identify + - math operation: total count + - $pageview + - math operation: total count + + Formula: + `A/B`, where `A` is the total count of `$identify` and `B` is the total count of `$pageview` + """, + actual_output=call_node(query), + ) + assert_test(test_case, [metric]) + + +def test_math_type_by_a_property(metric, call_node): + query = "what is the average session duration?" 
+ test_case = LLMTestCase( + input=query, + expected_output=""" + Events: + - All Events + - math operation: average by `$session_duration` + """, + actual_output=call_node(query), + ) + assert_test(test_case, [metric]) + + +def test_math_type_by_a_user(metric, call_node): + query = "What is the median page view count for a user?" + test_case = LLMTestCase( + input=query, + expected_output=""" + Events: + - $pageview + - math operation: median by users + """, + actual_output=call_node(query), + ) + assert_test(test_case, [metric]) + + +def test_needle_in_a_haystack(metric, call_node): + query = "How frequently do people pay for a personal-pro plan?" + test_case = LLMTestCase( + input=query, + expected_output=""" + Events: + - paid_bill + - math operation: total count + - property filter 1: + - entity: event + - property name: plan + - property type: String + - operator: contains + - property value: personal/pro + """, + actual_output=call_node(query), + ) + assert_test(test_case, [metric]) + + +def test_trends_does_not_include_timeframe(metric, call_node): + query = "what is the pageview trend for event time before 2024-01-01?" + test_case = LLMTestCase( + input=query, + expected_output=""" + Events: + - $pageview + - math operation: total count + """, + actual_output=call_node(query), + ) + assert_test(test_case, [metric]) diff --git a/ee/hogai/eval/utils.py b/ee/hogai/eval/utils.py deleted file mode 100644 index 6e03c4cfafa9f..0000000000000 --- a/ee/hogai/eval/utils.py +++ /dev/null @@ -1,40 +0,0 @@ -import os - -import pytest -from django.test import override_settings -from flaky import flaky -from langchain_core.runnables import RunnableConfig - -from ee.models.assistant import Conversation -from posthog.demo.matrix.manager import MatrixManager -from posthog.tasks.demo_create_data import HedgeboxMatrix -from posthog.test.base import NonAtomicBaseTest - - -@pytest.mark.skipif(os.environ.get("DEEPEVAL") != "YES", reason="Only runs for the assistant evaluation") -@flaky(max_runs=3, min_passes=1) -class EvalBaseTest(NonAtomicBaseTest): - def _get_config(self) -> RunnableConfig: - conversation = Conversation.objects.create(team=self.team, user=self.user) - return { - "configurable": { - "thread_id": conversation.id, - } - } - - @classmethod - def setUpTestData(cls): - super().setUpTestData() - matrix = HedgeboxMatrix( - seed="b1ef3c66-5f43-488a-98be-6b46d92fbcef", # this seed generates all events - days_past=120, - days_future=30, - n_clusters=500, - group_type_index_offset=0, - ) - matrix_manager = MatrixManager(matrix, print_steps=True) - existing_user = cls.team.organization.members.first() - with override_settings(TEST=False): - # Simulation saving should occur in non-test mode, so that Kafka isn't mocked. 
Normally in tests we don't - # want to ingest via Kafka, but simulation saving is specifically designed to use that route for speed - matrix_manager.run_on_team(cls.team, existing_user) diff --git a/frontend/__snapshots__/components-command-bar--search--dark.png b/frontend/__snapshots__/components-command-bar--search--dark.png index 599e3c20f7aea..0692091cc3e32 100644 Binary files a/frontend/__snapshots__/components-command-bar--search--dark.png and b/frontend/__snapshots__/components-command-bar--search--dark.png differ diff --git a/frontend/__snapshots__/components-command-bar--search--light.png b/frontend/__snapshots__/components-command-bar--search--light.png index 75bd57cddaff1..8231897e6233f 100644 Binary files a/frontend/__snapshots__/components-command-bar--search--light.png and b/frontend/__snapshots__/components-command-bar--search--light.png differ diff --git a/frontend/__snapshots__/lemon-ui-lemon-input--search--dark.png b/frontend/__snapshots__/lemon-ui-lemon-input--search--dark.png index 2618d982587b5..8dd6e08c842af 100644 Binary files a/frontend/__snapshots__/lemon-ui-lemon-input--search--dark.png and b/frontend/__snapshots__/lemon-ui-lemon-input--search--dark.png differ diff --git a/frontend/__snapshots__/lemon-ui-lemon-input--search--light.png b/frontend/__snapshots__/lemon-ui-lemon-input--search--light.png index 171b0fb4ae3bc..42f6a48d5f2a9 100644 Binary files a/frontend/__snapshots__/lemon-ui-lemon-input--search--light.png and b/frontend/__snapshots__/lemon-ui-lemon-input--search--light.png differ diff --git a/frontend/__snapshots__/replay-player-success--second-recording-in-list--dark.png b/frontend/__snapshots__/replay-player-success--second-recording-in-list--dark.png index 93ad7064fd68b..deadbaa964285 100644 Binary files a/frontend/__snapshots__/replay-player-success--second-recording-in-list--dark.png and b/frontend/__snapshots__/replay-player-success--second-recording-in-list--dark.png differ diff --git a/frontend/__snapshots__/replay-player-success--second-recording-in-list--light.png b/frontend/__snapshots__/replay-player-success--second-recording-in-list--light.png index 537566ccb05ab..f25c112bb904b 100644 Binary files a/frontend/__snapshots__/replay-player-success--second-recording-in-list--light.png and b/frontend/__snapshots__/replay-player-success--second-recording-in-list--light.png differ diff --git a/frontend/__snapshots__/scenes-app-insights--funnel-top-to-bottom-breakdown-edit--light.png b/frontend/__snapshots__/scenes-app-insights--funnel-top-to-bottom-breakdown-edit--light.png index 02bc921745ecd..00916265cd38b 100644 Binary files a/frontend/__snapshots__/scenes-app-insights--funnel-top-to-bottom-breakdown-edit--light.png and b/frontend/__snapshots__/scenes-app-insights--funnel-top-to-bottom-breakdown-edit--light.png differ diff --git a/frontend/__snapshots__/scenes-app-insights--funnel-top-to-bottom-edit--light.png b/frontend/__snapshots__/scenes-app-insights--funnel-top-to-bottom-edit--light.png index 3a86dc325987a..8de14b30417ac 100644 Binary files a/frontend/__snapshots__/scenes-app-insights--funnel-top-to-bottom-edit--light.png and b/frontend/__snapshots__/scenes-app-insights--funnel-top-to-bottom-edit--light.png differ diff --git a/frontend/src/lib/components/CommandBar/CommandBar.tsx b/frontend/src/lib/components/CommandBar/CommandBar.tsx index fe4b9c2e4555e..5d25486df6862 100644 --- a/frontend/src/lib/components/CommandBar/CommandBar.tsx +++ b/frontend/src/lib/components/CommandBar/CommandBar.tsx @@ -26,7 +26,7 @@ const CommandBarOverlay = 
forwardRef(fun data-attr="command-bar" className={`w-full ${ barStatus === BarStatus.SHOW_SEARCH && 'h-full' - } bg-bg-3000 rounded overflow-hidden border border-border-bold`} + } w-full bg-bg-3000 rounded overflow-hidden border border-border-bold`} ref={ref} > {children} diff --git a/frontend/src/lib/components/CommandBar/SearchBar.tsx b/frontend/src/lib/components/CommandBar/SearchBar.tsx index 3eba8e50e2aad..d9b27c7e806db 100644 --- a/frontend/src/lib/components/CommandBar/SearchBar.tsx +++ b/frontend/src/lib/components/CommandBar/SearchBar.tsx @@ -13,9 +13,10 @@ export const SearchBar = (): JSX.Element => { const inputRef = useRef(null) return ( -
+
-
+ {/* 49px = height of search input, 40rem = height of search results */}
+
diff --git a/frontend/src/lib/components/CommandBar/SearchResult.tsx b/frontend/src/lib/components/CommandBar/SearchResult.tsx index 354470759518e..6bb04fce0693c 100644 --- a/frontend/src/lib/components/CommandBar/SearchResult.tsx +++ b/frontend/src/lib/components/CommandBar/SearchResult.tsx @@ -1,6 +1,8 @@ import { LemonSkeleton } from '@posthog/lemon-ui' import clsx from 'clsx' import { useActions, useValues } from 'kea' +import { TAILWIND_BREAKPOINTS } from 'lib/constants' +import { useWindowSize } from 'lib/hooks/useWindowSize' import { capitalizeFirstLetter } from 'lib/utils' import { useLayoutEffect, useRef } from 'react' import { useSummarizeInsight } from 'scenes/insights/summarizeInsight' @@ -22,10 +24,12 @@ type SearchResultProps = { export const SearchResult = ({ result, resultIndex, focused }: SearchResultProps): JSX.Element => { const { aggregationLabel } = useValues(searchBarLogic) - const { openResult } = useActions(searchBarLogic) + const { setActiveResultIndex, openResult } = useActions(searchBarLogic) const ref = useRef(null) + const { width } = useWindowSize() + useLayoutEffect(() => { if (focused) { // :HACKY: This uses the non-standard scrollIntoViewIfNeeded api @@ -40,27 +44,33 @@ export const SearchResult = ({ result, resultIndex, focused }: SearchResultProps }, [focused]) return ( -
{
- openResult(resultIndex)
- }}
- ref={ref}
- >
-
- - {result.type !== 'group' - ? tabToName[result.type] - : `${capitalizeFirstLetter(aggregationLabel(result.extra_fields.group_type_index).plural)}`} - - - - + <> +
{
+ if (width && width <= TAILWIND_BREAKPOINTS.md) {
+ openResult(resultIndex)
+ } else {
+ setActiveResultIndex(resultIndex)
+ }
+ }}
+ ref={ref}
+ >
+
+ + {result.type !== 'group' + ? tabToName[result.type] + : `${capitalizeFirstLetter(aggregationLabel(result.extra_fields.group_type_index).plural)}`} + + + + +
-
+ ) } diff --git a/frontend/src/lib/components/CommandBar/SearchResultPreview.tsx b/frontend/src/lib/components/CommandBar/SearchResultPreview.tsx index 498150ebada3a..af4e70df0a08a 100644 --- a/frontend/src/lib/components/CommandBar/SearchResultPreview.tsx +++ b/frontend/src/lib/components/CommandBar/SearchResultPreview.tsx @@ -1,11 +1,15 @@ -import { useValues } from 'kea' +import { useActions, useValues } from 'kea' import { ResultDescription, ResultName } from 'lib/components/CommandBar/SearchResult' +import { LemonButton } from 'lib/lemon-ui/LemonButton' + +import { KeyboardShortcut } from '~/layout/navigation-3000/components/KeyboardShortcut' import { tabToName } from './constants' import { searchBarLogic, urlForResult } from './searchBarLogic' export const SearchResultPreview = (): JSX.Element | null => { const { activeResultIndex, combinedSearchResults } = useValues(searchBarLogic) + const { openResult } = useActions(searchBarLogic) if (!combinedSearchResults || combinedSearchResults.length === 0) { return null @@ -14,17 +18,41 @@ export const SearchResultPreview = (): JSX.Element | null => { const result = combinedSearchResults[activeResultIndex] return ( -
-
{tabToName[result.type]}
-
- -
- - {location.host} - {urlForResult(result)} - -
- +
+
+
+
{tabToName[result.type as keyof typeof tabToName]}
+
+ +
+ + {location.host} + {urlForResult(result)} + +
+ +
+
+
+ { + openResult(activeResultIndex) + }} + tooltip={ + <> + Open + + } + aria-label="Open search result" + > + Open + +
+ Open +
+
) diff --git a/frontend/src/lib/components/CommandBar/SearchResults.tsx b/frontend/src/lib/components/CommandBar/SearchResults.tsx index 2dde6f78cbead..3e6abbc35a27d 100644 --- a/frontend/src/lib/components/CommandBar/SearchResults.tsx +++ b/frontend/src/lib/components/CommandBar/SearchResults.tsx @@ -1,6 +1,4 @@ -import clsx from 'clsx' import { useValues } from 'kea' -import { useResizeBreakpoints } from 'lib/hooks/useResizeObserver' import { DetectiveHog } from '../hedgehogs' import { searchBarLogic } from './searchBarLogic' @@ -10,27 +8,17 @@ import { SearchResultPreview } from './SearchResultPreview' export const SearchResults = (): JSX.Element => { const { combinedSearchResults, combinedSearchLoading, activeResultIndex } = useValues(searchBarLogic) - const { ref, size } = useResizeBreakpoints({ - 0: 'small', - 550: 'normal', - }) - return ( -
+ <>
 {!combinedSearchLoading && combinedSearchResults?.length === 0 ? (
-
+

No results

This doesn't happen often, but we're stumped!

) : ( -
-
+
+
{combinedSearchLoading && ( <>
@@ -48,13 +36,11 @@ export const SearchResults = (): JSX.Element => {
 />
 ))}
- {size !== 'small' ? ( -
- -
- ) : null} +
+ +
)} -
+ ) }
diff --git a/frontend/src/lib/components/CommandBar/SearchTabs.tsx b/frontend/src/lib/components/CommandBar/SearchTabs.tsx
index 37ff41ff30a53..aa3ddb67e8496 100644
--- a/frontend/src/lib/components/CommandBar/SearchTabs.tsx
+++ b/frontend/src/lib/components/CommandBar/SearchTabs.tsx
@@ -12,11 +12,13 @@ type SearchTabsProps = {
 export const SearchTabs = ({ inputRef }: SearchTabsProps): JSX.Element | null => {
     const { tabsGrouped } = useValues(searchBarLogic)
     return (
-
+
{Object.entries(tabsGrouped).map(([group, tabs]) => (
{group !== 'all' && ( - {groupToName[group]} + + {groupToName[group as keyof typeof groupToName]} + )} {tabs.map((tab) => ( diff --git a/frontend/src/lib/components/CommandBar/index.scss b/frontend/src/lib/components/CommandBar/index.scss index 02aa24cb7a11d..3150d46ed5ada 100644 --- a/frontend/src/lib/components/CommandBar/index.scss +++ b/frontend/src/lib/components/CommandBar/index.scss @@ -16,11 +16,6 @@ } } -.SearchResults { - // offset container height by input - height: calc(100% - 2.875rem); -} - .CommandBar__overlay { position: fixed; top: 0; diff --git a/frontend/src/lib/components/CommandBar/searchBarLogic.ts b/frontend/src/lib/components/CommandBar/searchBarLogic.ts index b3576ac482d10..1b96c64c34b81 100644 --- a/frontend/src/lib/components/CommandBar/searchBarLogic.ts +++ b/frontend/src/lib/components/CommandBar/searchBarLogic.ts @@ -61,6 +61,7 @@ export const searchBarLogic = kea([ onArrowUp: (activeIndex: number, maxIndex: number) => ({ activeIndex, maxIndex }), onArrowDown: (activeIndex: number, maxIndex: number) => ({ activeIndex, maxIndex }), openResult: (index: number) => ({ index }), + setActiveResultIndex: (index: number) => ({ index }), }), loaders(({ values, actions }) => ({ rawSearchResponse: [ @@ -208,6 +209,7 @@ export const searchBarLogic = kea([ openResult: () => 0, onArrowUp: (_, { activeIndex, maxIndex }) => (activeIndex > 0 ? activeIndex - 1 : maxIndex), onArrowDown: (_, { activeIndex, maxIndex }) => (activeIndex < maxIndex ? activeIndex + 1 : 0), + setActiveResultIndex: (_, { index }) => index, }, ], activeTab: [ diff --git a/frontend/src/lib/constants.tsx b/frontend/src/lib/constants.tsx index d41a232518b18..ae93860525488 100644 --- a/frontend/src/lib/constants.tsx +++ b/frontend/src/lib/constants.tsx @@ -314,3 +314,11 @@ export const SESSION_REPLAY_MINIMUM_DURATION_OPTIONS: LemonSelectOptions { _debugTag?: string _runFn: () => Promise @@ -8,7 +11,7 @@ class ConcurrencyControllerItem { constructor( concurrencyController: ConcurrencyController, userFn: () => Promise, - abortController: AbortController, + abortController: AbortController | undefined, priority: number = Infinity, debugTag: string | undefined ) { @@ -17,7 +20,7 @@ class ConcurrencyControllerItem { const { promise, resolve, reject } = promiseResolveReject() this._promise = promise this._runFn = async () => { - if (abortController.signal.aborted) { + if (abortController?.signal.aborted) { reject(new FakeAbortError(abortController.signal.reason || 'AbortError')) return } @@ -32,7 +35,7 @@ class ConcurrencyControllerItem { reject(error) } } - abortController.signal.addEventListener('abort', () => { + abortController?.signal.addEventListener('abort', () => { reject(new FakeAbortError(abortController.signal.reason || 'AbortError')) }) promise @@ -76,7 +79,7 @@ export class ConcurrencyController { }: { fn: () => Promise priority?: number - abortController: AbortController + abortController?: AbortController debugTag?: string }): Promise => { const item = new ConcurrencyControllerItem(this, fn, abortController, priority, debugTag) diff --git a/frontend/src/queries/schema.json b/frontend/src/queries/schema.json index 1d2a4d94012aa..005fb78c94497 100644 --- a/frontend/src/queries/schema.json +++ b/frontend/src/queries/schema.json @@ -2132,6 +2132,9 @@ "significant": { "type": "boolean" }, + "stats_version": { + "type": "integer" + }, "timezone": { "type": "string" }, @@ -4511,6 +4514,9 @@ "significant": { "type": "boolean" }, + "stats_version": { + "type": "integer" + }, "variants": { "items": { "$ref": 
"#/definitions/ExperimentVariantFunnelsBaseStats" @@ -5991,6 +5997,9 @@ }, "response": { "$ref": "#/definitions/ExperimentFunnelsQueryResponse" + }, + "stats_version": { + "type": "integer" } }, "required": ["funnels_query", "kind"], @@ -6041,6 +6050,9 @@ "significant": { "type": "boolean" }, + "stats_version": { + "type": "integer" + }, "variants": { "items": { "$ref": "#/definitions/ExperimentVariantFunnelsBaseStats" @@ -9787,6 +9799,9 @@ "significant": { "type": "boolean" }, + "stats_version": { + "type": "integer" + }, "variants": { "items": { "$ref": "#/definitions/ExperimentVariantFunnelsBaseStats" @@ -10418,6 +10433,9 @@ "significant": { "type": "boolean" }, + "stats_version": { + "type": "integer" + }, "variants": { "items": { "$ref": "#/definitions/ExperimentVariantFunnelsBaseStats" diff --git a/frontend/src/queries/schema.ts b/frontend/src/queries/schema.ts index 5360ae06d99f4..9e4252c989b38 100644 --- a/frontend/src/queries/schema.ts +++ b/frontend/src/queries/schema.ts @@ -2003,6 +2003,7 @@ export interface ExperimentFunnelsQueryResponse { significance_code: ExperimentSignificanceCode expected_loss: number credible_intervals: Record + stats_version?: integer } export type CachedExperimentFunnelsQueryResponse = CachedQueryResponse @@ -2012,6 +2013,7 @@ export interface ExperimentFunnelsQuery extends DataNode { diff --git a/frontend/src/scenes/experiments/experimentLogic.tsx b/frontend/src/scenes/experiments/experimentLogic.tsx index 698c4182dc84d..6094de7721879 100644 --- a/frontend/src/scenes/experiments/experimentLogic.tsx +++ b/frontend/src/scenes/experiments/experimentLogic.tsx @@ -861,10 +861,7 @@ export const experimentLogic = kea([ ...values.experiment.metrics[0], experiment_id: values.experimentId, } - if ( - queryWithExperimentId.kind === NodeKind.ExperimentTrendsQuery && - values.featureFlags[FEATURE_FLAGS.EXPERIMENT_STATS_V2] - ) { + if (values.featureFlags[FEATURE_FLAGS.EXPERIMENT_STATS_V2]) { queryWithExperimentId.stats_version = 2 } diff --git a/frontend/src/scenes/insights/insightDataLogic.tsx b/frontend/src/scenes/insights/insightDataLogic.tsx index 1de631587cf35..b5036267488b0 100644 --- a/frontend/src/scenes/insights/insightDataLogic.tsx +++ b/frontend/src/scenes/insights/insightDataLogic.tsx @@ -219,6 +219,7 @@ export const insightDataLogic = kea([ if (isQueryTooLarge(query)) { localStorage.removeItem(`draft-query-${values.currentTeamId}`) } + localStorage.setItem( `draft-query-${values.currentTeamId}`, crushDraftQueryForLocalStorage(query, Date.now()) diff --git a/frontend/src/scenes/insights/utils.tsx b/frontend/src/scenes/insights/utils.tsx index 5a1b4d56ec7d9..04e55519ab05b 100644 --- a/frontend/src/scenes/insights/utils.tsx +++ b/frontend/src/scenes/insights/utils.tsx @@ -1,4 +1,3 @@ -import JSONCrush from 'jsoncrush' import api from 'lib/api' import { dayjs } from 'lib/dayjs' import { CORE_FILTER_DEFINITIONS_BY_GROUP } from 'lib/taxonomy' @@ -445,40 +444,27 @@ export function isQueryTooLarge(query: Node>): boolean { export function parseDraftQueryFromLocalStorage( query: string ): { query: Node>; timestamp: number } | null { - // First try to uncrush the query if it's a JSONCrush query else fall back to parsing it as a JSON try { - const uncrushedQuery = JSONCrush.uncrush(query) - return JSON.parse(uncrushedQuery) + return JSON.parse(query) } catch (e) { - console.error('Error parsing uncrushed query', e) - try { - return JSON.parse(query) - } catch (e) { - console.error('Error parsing query', e) - return null - } + console.error('Error parsing query', 
e) + return null } } export function crushDraftQueryForLocalStorage(query: Node>, timestamp: number): string { - return JSONCrush.crush(JSON.stringify({ query, timestamp })) + return JSON.stringify({ query, timestamp }) } export function parseDraftQueryFromURL(query: string): Node> | null { try { - const uncrushedQuery = JSONCrush.uncrush(query) - return JSON.parse(uncrushedQuery) + return JSON.parse(query) } catch (e) { - console.error('Error parsing uncrushed query', e) - try { - return JSON.parse(query) - } catch (e) { - console.error('Error parsing query', e) - return null - } + console.error('Error parsing query', e) + return null } } export function crushDraftQueryForURL(query: Node>): string { - return JSONCrush.crush(JSON.stringify(query)) + return JSON.stringify(query) } diff --git a/frontend/src/scenes/surveys/surveyViewViz.tsx b/frontend/src/scenes/surveys/surveyViewViz.tsx index 8e19c575fef10..7384fec294c94 100644 --- a/frontend/src/scenes/surveys/surveyViewViz.tsx +++ b/frontend/src/scenes/surveys/surveyViewViz.tsx @@ -207,6 +207,7 @@ export function RatingQuestionBarChart({ } useEffect(() => { loadSurveyRatingResults({ questionIndex, iteration }) + // eslint-disable-next-line react-hooks/exhaustive-deps }, [questionIndex]) return ( @@ -301,6 +302,7 @@ export function NPSSurveyResultsBarChart({ useEffect(() => { loadSurveyRecurringNPSResults({ questionIndex }) + // eslint-disable-next-line react-hooks/exhaustive-deps }, [questionIndex]) return ( @@ -397,6 +399,7 @@ export function SingleChoiceQuestionPieChart({ useEffect(() => { loadSurveySingleChoiceResults({ questionIndex }) + // eslint-disable-next-line react-hooks/exhaustive-deps }, [questionIndex]) return ( @@ -499,12 +502,15 @@ export function MultipleChoiceQuestionBarChart({ useEffect(() => { loadSurveyMultipleChoiceResults({ questionIndex }) + // eslint-disable-next-line react-hooks/exhaustive-deps }, [questionIndex]) useEffect(() => { if (surveyMultipleChoiceResults?.[questionIndex]?.data?.length) { setChartHeight(100 + 20 * surveyMultipleChoiceResults[questionIndex].data.length) } + // TODO this one maybe should have questionIndex as a dependency + // eslint-disable-next-line react-hooks/exhaustive-deps }, [surveyMultipleChoiceResults]) return ( @@ -581,6 +587,7 @@ export function OpenTextViz({ useEffect(() => { loadSurveyOpenTextResults({ questionIndex }) + // eslint-disable-next-line react-hooks/exhaustive-deps }, [questionIndex]) return ( @@ -736,9 +743,9 @@ function ResponseSummaryFeedback({ surveyId }: { surveyId: string }): JSX.Elemen return // Already rated } setRating(newRating) - posthog.capture('survey_resonse_rated', { + posthog.capture('ai_survey_summary_rated', { survey_id: surveyId, - answer_rating: rating, + answer_rating: newRating, }) } diff --git a/frontend/src/scenes/urls.ts b/frontend/src/scenes/urls.ts index 293481f598e3b..477064de6497e 100644 --- a/frontend/src/scenes/urls.ts +++ b/frontend/src/scenes/urls.ts @@ -1,4 +1,3 @@ -import JSONCrush from 'jsoncrush' import { combineUrl } from 'kea-router' import { AlertType } from 'lib/components/Alerts/types' import { getCurrentTeamId } from 'lib/utils/getAppContext' @@ -75,8 +74,7 @@ export const urls = { insightNew: (type?: InsightType, dashboardId?: DashboardType['id'] | null, query?: Node): string => combineUrl('/insights/new', dashboardId ? { dashboard: dashboardId } : {}, { ...(type ? { insight: type } : {}), - // have to use JSONCrush directly rather than the util to avoid circular dep - ...(query ? { q: typeof query === 'string' ? 
query : JSONCrush.crush(JSON.stringify(query)) } : {}), + ...(query ? { q: typeof query === 'string' ? query : JSON.stringify(query) } : {}), }).url, insightNewHogQL: (query: string, filters?: HogQLFilters): string => combineUrl( diff --git a/jest.config.ts b/jest.config.ts index 39ab8232a5592..53e5df5943413 100644 --- a/jest.config.ts +++ b/jest.config.ts @@ -8,7 +8,7 @@ process.env.TZ = process.env.TZ || 'UTC' * https://jestjs.io/docs/en/configuration.html */ -const esmModules = ['query-selector-shadow-dom', 'react-syntax-highlighter', '@react-hook', '@medv', 'monaco-editor', 'jsoncrush'] +const esmModules = ['query-selector-shadow-dom', 'react-syntax-highlighter', '@react-hook', '@medv', 'monaco-editor'] const eeFolderExists = fs.existsSync('ee/frontend/exports.ts') function rootDirectories() { const rootDirectories = ['/frontend/src'] diff --git a/package.json b/package.json index 6944c5b335e23..06dc6653b7713 100644 --- a/package.json +++ b/package.json @@ -140,7 +140,6 @@ "hls.js": "^1.5.15", "husky": "^7.0.4", "image-blob-reduce": "^4.1.0", - "jsoncrush": "^1.1.8", "kea": "^3.1.5", "kea-forms": "^3.2.0", "kea-loaders": "^3.0.0", @@ -162,7 +161,7 @@ "pmtiles": "^2.11.0", "postcss": "^8.4.31", "postcss-preset-env": "^9.3.0", - "posthog-js": "1.202.2", + "posthog-js": "1.202.4", "posthog-js-lite": "3.0.0", "prettier": "^2.8.8", "prop-types": "^15.7.2", diff --git a/plugin-server/package.json b/plugin-server/package.json index 11df155e0757c..9014d19be548b 100644 --- a/plugin-server/package.json +++ b/plugin-server/package.json @@ -69,16 +69,17 @@ "express": "^4.18.2", "faker": "^5.5.3", "fast-deep-equal": "^3.1.3", + "fastpriorityqueue": "^0.7.5", "fernet-nodejs": "^1.0.6", "generic-pool": "^3.7.1", "graphile-worker": "0.13.0", "ioredis": "^4.27.6", "ipaddr.js": "^2.1.0", "kafkajs": "^2.2.0", - "lz4-kafkajs": "1.0.0", "kafkajs-snappy": "^1.1.0", "lru-cache": "^6.0.0", "luxon": "^3.4.4", + "lz4-kafkajs": "1.0.0", "node-fetch": "^2.6.1", "node-rdkafka": "^2.17.0", "node-schedule": "^2.1.0", diff --git a/plugin-server/pnpm-lock.yaml b/plugin-server/pnpm-lock.yaml index c297462845d8e..f187191553102 100644 --- a/plugin-server/pnpm-lock.yaml +++ b/plugin-server/pnpm-lock.yaml @@ -91,6 +91,9 @@ dependencies: fast-deep-equal: specifier: ^3.1.3 version: 3.1.3 + fastpriorityqueue: + specifier: ^0.7.5 + version: 0.7.5 fernet-nodejs: specifier: ^1.0.6 version: 1.0.6 @@ -6276,6 +6279,10 @@ packages: strnum: 1.0.5 dev: false + /fastpriorityqueue@0.7.5: + resolution: {integrity: sha512-3Pa0n9gwy8yIbEsT3m2j/E9DXgWvvjfiZjjqcJ+AdNKTAlVMIuFYrYG5Y3RHEM8O6cwv9hOpOWY/NaMfywoQVA==} + dev: false + /fastq@1.15.0: resolution: {integrity: sha512-wBrocU2LCXXa+lWBt8RoIRD89Fi8OdABODa/kEnyeyjS5aZO5/GNvI5sEINADqP/h8M29UHTHUb53sUu5Ihqdw==} dependencies: diff --git a/plugin-server/src/utils/concurrencyController.ts b/plugin-server/src/utils/concurrencyController.ts new file mode 100644 index 0000000000000..ac84d439fd507 --- /dev/null +++ b/plugin-server/src/utils/concurrencyController.ts @@ -0,0 +1,133 @@ +import FastPriorityQueue from 'fastpriorityqueue' + +export function promiseResolveReject(): { + resolve: (value: T) => void + reject: (reason?: any) => void + promise: Promise +} { + let resolve: (value: T) => void + let reject: (reason?: any) => void + const promise = new Promise((innerResolve, innerReject) => { + resolve = innerResolve + reject = innerReject + }) + return { resolve: resolve!, reject: reject!, promise } +} + +// Note that this file also exists in the frontend code, please keep them in sync as the 
tests only exist in the other version +class ConcurrencyControllerItem { + _debugTag?: string + _runFn: () => Promise + _priority: number = Infinity + _promise: Promise + constructor( + concurrencyController: ConcurrencyController, + userFn: () => Promise, + abortController: AbortController | undefined, + priority: number = Infinity, + debugTag: string | undefined + ) { + this._debugTag = debugTag + this._priority = priority + const { promise, resolve, reject } = promiseResolveReject() + this._promise = promise + this._runFn = async () => { + if (abortController?.signal.aborted) { + reject(new FakeAbortError(abortController.signal.reason || 'AbortError')) + return + } + if (concurrencyController._current.length >= concurrencyController._concurrencyLimit) { + throw new Error('Developer Error: ConcurrencyControllerItem: _runFn called while already running') + } + try { + concurrencyController._current.push(this) + const result = await userFn() + resolve(result) + } catch (error) { + reject(error) + } + } + abortController?.signal.addEventListener('abort', () => { + reject(new FakeAbortError(abortController.signal.reason || 'AbortError')) + }) + promise + .catch(() => { + // ignore + }) + .finally(() => { + if (concurrencyController._current.includes(this)) { + concurrencyController._current = concurrencyController._current.filter((item) => item !== this) + concurrencyController._runNext() + } + }) + } +} + +export class ConcurrencyController { + _concurrencyLimit: number + + _current: ConcurrencyControllerItem[] = [] + private _queue: FastPriorityQueue> = new FastPriorityQueue( + (a, b) => a._priority < b._priority + ) + + constructor(concurrencyLimit: number) { + this._concurrencyLimit = concurrencyLimit + } + + /** + * Run a function with a mutex. If the mutex is already running, the function will be queued and run when the mutex + * is available. + * @param fn The function to run + * @param priority The priority of the function. Lower numbers will be run first. Defaults to Infinity. + * @param abortController An AbortController that, if aborted, will reject the promise and immediately start the next item in the queue. 
+ * @param debugTag + */ + run = ({ + fn, + priority, + abortController, + debugTag, + }: { + fn: () => Promise + priority?: number + abortController?: AbortController + debugTag?: string + }): Promise => { + const item = new ConcurrencyControllerItem(this, fn, abortController, priority, debugTag) + + this._queue.add(item) + + this._tryRunNext() + + return item._promise + } + + _runNext(): void { + const next = this._queue.poll() + if (next) { + next._runFn() + .catch(() => { + // ignore + }) + .finally(() => { + this._tryRunNext() + }) + } + } + + _tryRunNext(): void { + if (this._current.length < this._concurrencyLimit) { + this._runNext() + } + } + + setConcurrencyLimit = (limit: number): void => { + this._concurrencyLimit = limit + } +} + +// Create a fake AbortError that allows us to use e.name === 'AbortError' to check if an error is an AbortError +class FakeAbortError extends Error { + name = 'AbortError' +} diff --git a/pnpm-lock.yaml b/pnpm-lock.yaml index f383926dd1807..765be82fa6826 100644 --- a/pnpm-lock.yaml +++ b/pnpm-lock.yaml @@ -241,9 +241,6 @@ dependencies: image-blob-reduce: specifier: ^4.1.0 version: 4.1.0 - jsoncrush: - specifier: ^1.1.8 - version: 1.1.8 kea: specifier: ^3.1.5 version: 3.1.5(react@18.2.0) @@ -308,8 +305,8 @@ dependencies: specifier: ^9.3.0 version: 9.3.0(postcss@8.4.31) posthog-js: - specifier: 1.202.2 - version: 1.202.2 + specifier: 1.202.4 + version: 1.202.4 posthog-js-lite: specifier: 3.0.0 version: 3.0.0 @@ -13319,7 +13316,7 @@ packages: gopd: 1.2.0 has-symbols: 1.1.0 hasown: 2.0.2 - math-intrinsics: 1.0.0 + math-intrinsics: 1.1.0 dev: true /get-nonce@1.0.1: @@ -14076,7 +14073,7 @@ packages: hogan.js: 3.0.2 htm: 3.1.1 instantsearch-ui-components: 0.3.0 - preact: 10.25.2 + preact: 10.25.3 qs: 6.9.7 search-insights: 2.13.0 dev: false @@ -15415,10 +15412,6 @@ packages: resolution: {integrity: sha512-gfFQZrcTc8CnKXp6Y4/CBT3fTc0OVuDofpre4aEeEpSBPV5X5v4+Vmx+8snU7RLPrNHPKSgLxGo9YuQzz20o+w==} dev: true - /jsoncrush@1.1.8: - resolution: {integrity: sha512-lvIMGzMUA0fjuqwNcxlTNRq2bibPZ9auqT/LyGdlR5hvydJtA/BasSgkx4qclqTKVeTidrJvsS/oVjlTCPQ4Nw==} - dev: false - /jsonfile@6.1.0: resolution: {integrity: sha512-5dgndWOriYSm5cnYaJNhalLNDKOqFwyDB/rr1E9ZsGciGvKPs8R2xYGCacuf3z6K1YKDz182fd+fY3cn3pMqXQ==} dependencies: @@ -16003,8 +15996,8 @@ packages: resolution: {integrity: sha512-6qE4B9deFBIa9YSpOc9O0Sgc43zTeVYbgDT5veRKSlB2+ZuHNoVVxA1L/ckMUayV9Ay9y7Z/SZCLcGteW9i7bg==} dev: false - /math-intrinsics@1.0.0: - resolution: {integrity: sha512-4MqMiKP90ybymYvsut0CH2g4XWbfLtmlCkXmtmdcDCxNB+mQcu1w/1+L/VD7vi/PSv7X2JYV7SCcR+jiPXnQtA==} + /math-intrinsics@1.1.0: + resolution: {integrity: sha512-/IXtbwEk5HTPyEwyKX6hGkYXxM9nbj64B+ilVJnC/R6B0pH5G4V3b0pVbL7DBj4tkhBAppbQUlf6F6Xl9LHu1g==} engines: {node: '>= 0.4'} dev: true @@ -17909,12 +17902,12 @@ packages: resolution: {integrity: sha512-dyajjnfzZD1tht4N7p7iwf7nBnR1MjVaVu+MKr+7gBgA39bn28wizCIJZztZPtHy4PY0YwtSGgwfBCuG/hnHgA==} dev: false - /posthog-js@1.202.2: - resolution: {integrity: sha512-9p7dAWuCfoM0WrasubGwtC8i38HU3iMqK3gd0mhyAoTrEVMVozTQq64Toc2VEv8H69NGNn6ikk5t2LclHT9XFA==} + /posthog-js@1.202.4: + resolution: {integrity: sha512-YLu6f1ibAkiopGivGQnLBaCKegT+0GHP3DfP72z3KVby2UXLBB7dj+GIa54zfjburE7ehasysRzeK5nW39QOfA==} dependencies: core-js: 3.39.0 fflate: 0.4.8 - preact: 10.25.2 + preact: 10.25.3 web-vitals: 4.2.4 dev: false @@ -17922,8 +17915,8 @@ packages: resolution: {integrity: sha512-Q+/tYsFU9r7xoOJ+y/ZTtdVQwTWfzjbiXBDMM/JKUux3+QPP02iUuIoeBQ+Ot6oEDlC+/PGjB/5A3K7KKb7hcw==} dev: false - /preact@10.25.2: - resolution: 
{integrity: sha512-GEts1EH3oMnqdOIeXhlbBSddZ9nrINd070WBOiPO2ous1orrKGUM4SMDbwyjSWD1iMS2dBvaDjAa5qUhz3TXqw==} + /preact@10.25.3: + resolution: {integrity: sha512-dzQmIFtM970z+fP9ziQ3yG4e3ULIbwZzJ734vaMVUTaKQ2+Ru1Ou/gjshOYVHCcd1rpAelC6ngjvjDXph98unQ==} dev: false /prelude-ls@1.2.1: diff --git a/posthog/conftest.py b/posthog/conftest.py index c27dbec43955e..e9804d25eff42 100644 --- a/posthog/conftest.py +++ b/posthog/conftest.py @@ -14,6 +14,7 @@ def create_clickhouse_tables(num_tables: int): CREATE_DATA_QUERIES, CREATE_DICTIONARY_QUERIES, CREATE_DISTRIBUTED_TABLE_QUERIES, + CREATE_KAFKA_TABLE_QUERIES, CREATE_MERGETREE_TABLE_QUERIES, CREATE_MV_TABLE_QUERIES, CREATE_VIEW_QUERIES, @@ -28,10 +29,18 @@ def create_clickhouse_tables(num_tables: int): + len(CREATE_DICTIONARY_QUERIES) ) + # Evaluation tests use Kafka for faster data ingestion. + if settings.IN_EVAL_TESTING: + total_tables += len(CREATE_KAFKA_TABLE_QUERIES) + # Check if all the tables have already been created. Views, materialized views, and dictionaries also count if num_tables == total_tables: return + if settings.IN_EVAL_TESTING: + kafka_table_queries = list(map(build_query, CREATE_KAFKA_TABLE_QUERIES)) + run_clickhouse_statement_in_parallel(kafka_table_queries) + table_queries = list(map(build_query, CREATE_MERGETREE_TABLE_QUERIES + CREATE_DISTRIBUTED_TABLE_QUERIES)) run_clickhouse_statement_in_parallel(table_queries) @@ -62,7 +71,7 @@ def reset_clickhouse_tables(): from posthog.models.channel_type.sql import TRUNCATE_CHANNEL_DEFINITION_TABLE_SQL from posthog.models.cohort.sql import TRUNCATE_COHORTPEOPLE_TABLE_SQL from posthog.models.error_tracking.sql import TRUNCATE_ERROR_TRACKING_ISSUE_FINGERPRINT_OVERRIDES_TABLE_SQL - from posthog.models.event.sql import TRUNCATE_EVENTS_TABLE_SQL, TRUNCATE_EVENTS_RECENT_TABLE_SQL + from posthog.models.event.sql import TRUNCATE_EVENTS_RECENT_TABLE_SQL, TRUNCATE_EVENTS_TABLE_SQL from posthog.models.group.sql import TRUNCATE_GROUPS_TABLE_SQL from posthog.models.performance.sql import TRUNCATE_PERFORMANCE_EVENTS_TABLE_SQL from posthog.models.person.sql import ( @@ -100,6 +109,18 @@ def reset_clickhouse_tables(): TRUNCATE_HEATMAPS_TABLE_SQL(), ] + # Drop created Kafka tables because some tests don't expect it. + if settings.IN_EVAL_TESTING: + kafka_tables = sync_execute( + f""" + SELECT name + FROM system.tables + WHERE database = '{settings.CLICKHOUSE_DATABASE}' AND name LIKE 'kafka_%' + """, + ) + # Using `ON CLUSTER` takes x20 more time to drop the tables: https://github.com/ClickHouse/ClickHouse/issues/15473. 
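Both conftest hunks hinge on the same idea: evaluation runs need the kafka_* ClickHouse tables for ingestion, while ordinary test runs must not see them afterwards. A minimal sketch of that lifecycle, assuming the helpers already used in posthog/conftest.py (sync_execute, build_query, CREATE_KAFKA_TABLE_QUERIES, run_clickhouse_statement_in_parallel); the two wrapper function names are made up purely for illustration:

```python
# Minimal sketch of the eval-only Kafka table lifecycle, assuming the helpers
# behave as they do in posthog/conftest.py. The wrapper names are hypothetical.
from django.conf import settings

from posthog.clickhouse.client import sync_execute
from posthog.clickhouse.schema import CREATE_KAFKA_TABLE_QUERIES, build_query
from posthog.test.base import run_clickhouse_statement_in_parallel


def create_kafka_tables_for_eval() -> None:
    # Evaluation runs ingest demo data through Kafka, so the kafka_* tables must exist.
    if settings.IN_EVAL_TESTING:
        run_clickhouse_statement_in_parallel([build_query(q) for q in CREATE_KAFKA_TABLE_QUERIES])


def drop_kafka_tables_for_eval() -> None:
    # Regular tests don't expect kafka_* tables, so drop them again afterwards.
    if not settings.IN_EVAL_TESTING:
        return
    kafka_tables = sync_execute(
        f"""
        SELECT name
        FROM system.tables
        WHERE database = '{settings.CLICKHOUSE_DATABASE}' AND name LIKE 'kafka_%'
        """,
    )
    # Plain DROP TABLE, without ON CLUSTER, keeps teardown fast (see the comment above).
    run_clickhouse_statement_in_parallel([f"DROP TABLE {row[0]}" for row in kafka_tables])
```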
+ TABLES_TO_CREATE_DROP += [f"DROP TABLE {table[0]}" for table in kafka_tables] + run_clickhouse_statement_in_parallel(TABLES_TO_CREATE_DROP) from posthog.clickhouse.schema import ( diff --git a/posthog/hogql_queries/experiments/experiment_funnels_query_runner.py b/posthog/hogql_queries/experiments/experiment_funnels_query_runner.py index d9d874d7b6e6b..363ee06bc78be 100644 --- a/posthog/hogql_queries/experiments/experiment_funnels_query_runner.py +++ b/posthog/hogql_queries/experiments/experiment_funnels_query_runner.py @@ -7,6 +7,11 @@ calculate_credible_intervals, calculate_probabilities, ) +from posthog.hogql_queries.experiments.funnels_statistics_v2 import ( + are_results_significant_v2, + calculate_credible_intervals_v2, + calculate_probabilities_v2, +) from posthog.hogql_queries.query_runner import QueryRunner from posthog.models.experiment import Experiment from ..insights.funnels.funnels_query_runner import FunnelsQueryRunner @@ -45,6 +50,8 @@ def __init__(self, *args, **kwargs): if self.experiment.holdout: self.variants.append(f"holdout-{self.experiment.holdout.id}") + self.stats_version = self.query.stats_version or 1 + self.prepared_funnels_query = self._prepare_funnel_query() self.funnels_query_runner = FunnelsQueryRunner( query=self.prepared_funnels_query, team=self.team, timings=self.timings, limit_context=self.limit_context @@ -63,9 +70,14 @@ def calculate(self) -> ExperimentFunnelsQueryResponse: # Statistical analysis control_variant, test_variants = self._get_variants_with_base_stats(funnels_result) - probabilities = calculate_probabilities(control_variant, test_variants) - significance_code, loss = are_results_significant(control_variant, test_variants, probabilities) - credible_intervals = calculate_credible_intervals([control_variant, *test_variants]) + if self.stats_version == 2: + probabilities = calculate_probabilities_v2(control_variant, test_variants) + significance_code, loss = are_results_significant_v2(control_variant, test_variants, probabilities) + credible_intervals = calculate_credible_intervals_v2([control_variant, *test_variants]) + else: + probabilities = calculate_probabilities(control_variant, test_variants) + significance_code, loss = are_results_significant(control_variant, test_variants, probabilities) + credible_intervals = calculate_credible_intervals([control_variant, *test_variants]) except Exception as e: raise ValueError(f"Error calculating experiment funnel results: {str(e)}") from e @@ -80,6 +92,7 @@ def calculate(self) -> ExperimentFunnelsQueryResponse: }, significant=significance_code == ExperimentSignificanceCode.SIGNIFICANT, significance_code=significance_code, + stats_version=self.stats_version, expected_loss=loss, credible_intervals=credible_intervals, ) diff --git a/posthog/hogql_queries/experiments/funnels_statistics_v2.py b/posthog/hogql_queries/experiments/funnels_statistics_v2.py new file mode 100644 index 0000000000000..02f18d2f70740 --- /dev/null +++ b/posthog/hogql_queries/experiments/funnels_statistics_v2.py @@ -0,0 +1,216 @@ +import numpy as np +from scipy import stats +from posthog.schema import ExperimentVariantFunnelsBaseStats, ExperimentSignificanceCode +from posthog.hogql_queries.experiments import ( + FF_DISTRIBUTION_THRESHOLD, + MIN_PROBABILITY_FOR_SIGNIFICANCE, + EXPECTED_LOSS_SIGNIFICANCE_LEVEL, +) +from scipy.stats import betabinom + +ALPHA_PRIOR = 1 +BETA_PRIOR = 1 +SAMPLE_SIZE = 10000 + + +def calculate_probabilities_v2( + control: ExperimentVariantFunnelsBaseStats, variants: 
list[ExperimentVariantFunnelsBaseStats] +) -> list[float]: + """ + Calculate the win probabilities for each variant in an experiment using Bayesian analysis + for funnel conversion rates. + + This function computes the probability that each variant is the best (i.e., has the highest + conversion rate) compared to all other variants, including the control. It uses samples + drawn from the posterior Beta distributions of each variant's conversion rate. + + Parameters: + ----------- + control : ExperimentVariantFunnelsBaseStats + Statistics for the control group, including success and failure counts + variants : list[ExperimentVariantFunnelsBaseStats] + List of statistics for test variants to compare against the control + + Returns: + -------- + list[float] + A list of probabilities where: + - The first element is the probability that the control variant is the best + - Subsequent elements are the probabilities that each test variant is the best + + Notes: + ------ + - Uses a Bayesian approach with Beta distributions as the posterior + - Uses Beta(1,1) as the prior, which is uniform over [0,1] + - Draws 10,000 samples from each variant's posterior distribution + """ + all_variants = [control, *variants] + + # Use Beta distribution for conversion rates + samples: list[np.ndarray] = [] + for variant in all_variants: + # Add prior to both successes and failures for Bayesian prior + alpha = ALPHA_PRIOR + variant.success_count + beta = BETA_PRIOR + variant.failure_count + # Generate samples from Beta distribution + variant_samples = np.random.beta(alpha, beta, SAMPLE_SIZE) + samples.append(variant_samples) + + samples_array = np.array(samples) + # Calculate probability of each variant being the best + probabilities = [] + for i in range(len(all_variants)): + probability = (samples_array[i] == np.max(samples_array, axis=0)).mean() + probabilities.append(float(probability)) + + return probabilities + + +def calculate_expected_loss_v2( + target_variant: ExperimentVariantFunnelsBaseStats, variants: list[ExperimentVariantFunnelsBaseStats] +) -> float: + """ + Calculates expected loss in conversion rate using Beta-Binomial conjugate prior. + + This implementation uses a Bayesian approach with Beta-Binomial model + to estimate the expected loss when choosing the target variant over others. 
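For intuition, calculate_probabilities_v2 above boils down to sampling each variant's Beta posterior and counting how often it comes out on top. A tiny self-contained rerun of that Monte-Carlo step with made-up conversion counts (variant names and numbers are illustrative only):

```python
# Toy rerun of the Beta-posterior sampling above, with hypothetical counts.
# Each variant's posterior is Beta(1 + successes, 1 + failures); its win
# probability is the fraction of joint draws in which it has the highest rate.
import numpy as np

rng = np.random.default_rng(42)
counts = {"control": (100, 900), "test": (130, 870)}  # (success_count, failure_count)

samples = np.stack([rng.beta(1 + s, 1 + f, size=10_000) for s, f in counts.values()])
wins = samples == samples.max(axis=0)
for key, p in zip(counts, wins.mean(axis=1)):
    print(f"P({key} has the best conversion rate) ≈ {p:.2f}")
```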
+ + Parameters: + ----------- + target_variant : ExperimentVariantFunnelsBaseStats + The variant being evaluated for loss + variants : list[ExperimentVariantFunnelsBaseStats] + List of other variants to compare against + + Returns: + -------- + float + Expected loss in conversion rate if choosing the target variant + """ + # Calculate posterior parameters for target variant + target_alpha = int(ALPHA_PRIOR + target_variant.success_count) + target_beta = int(BETA_PRIOR + target_variant.failure_count) + target_n = int(target_variant.success_count + target_variant.failure_count) + + # Get samples from target variant's Beta-Binomial + target_samples = betabinom.rvs(target_n, target_alpha, target_beta, size=SAMPLE_SIZE) / target_n + + # Get samples from each comparison variant + variant_samples = [] + for variant in variants: + n = int(variant.success_count + variant.failure_count) + alpha = int(ALPHA_PRIOR + variant.success_count) + beta = int(BETA_PRIOR + variant.failure_count) + samples = betabinom.rvs(n, alpha, beta, size=SAMPLE_SIZE) / n + variant_samples.append(samples) + + # Calculate loss + variant_max = np.maximum.reduce(variant_samples) + losses = np.maximum(0, variant_max - target_samples) + expected_loss = float(np.mean(losses)) + + return expected_loss + + +def are_results_significant_v2( + control: ExperimentVariantFunnelsBaseStats, + variants: list[ExperimentVariantFunnelsBaseStats], + probabilities: list[float], +) -> tuple[ExperimentSignificanceCode, float]: + """ + Determine if the experiment results are statistically significant using Bayesian analysis + for funnel conversion rates. + + This function evaluates whether there is strong evidence that any variant is better + than the others by considering both winning probabilities and expected loss. It checks + if the sample size is sufficient and evaluates the risk of choosing the winning variant. + + Parameters: + ----------- + control : ExperimentVariantFunnelsBaseStats + Statistics for the control group, including success and failure counts + variants : list[ExperimentVariantFunnelsBaseStats] + List of statistics for test variants to compare against the control + probabilities : list[float] + List of probabilities from calculate_probabilities_v2 + + Returns: + -------- + tuple[ExperimentSignificanceCode, float] + A tuple containing: + - Significance code indicating the result (significant, not enough exposure, high loss, etc.) 
+ - Expected loss value for significant results, 1.0 for non-significant results + + Notes: + ------ + - Requires minimum exposure threshold per variant for reliable results + - Uses probability threshold from MIN_PROBABILITY_FOR_SIGNIFICANCE + - Calculates expected loss for the best-performing variant + - Returns HIGH_LOSS if expected loss exceeds significance threshold + - Returns NOT_ENOUGH_EXPOSURE if sample size requirements not met + """ + # Check minimum exposure + if control.success_count + control.failure_count < FF_DISTRIBUTION_THRESHOLD or any( + v.success_count + v.failure_count < FF_DISTRIBUTION_THRESHOLD for v in variants + ): + return ExperimentSignificanceCode.NOT_ENOUGH_EXPOSURE, 1.0 + + # Check if any variant has high enough probability + max_probability = max(probabilities) + if max_probability >= MIN_PROBABILITY_FOR_SIGNIFICANCE: + # Find best performing variant + all_variants = [control, *variants] + conversion_rates = [v.success_count / (v.success_count + v.failure_count) for v in all_variants] + best_idx = np.argmax(conversion_rates) + best_variant = all_variants[best_idx] + other_variants = all_variants[:best_idx] + all_variants[best_idx + 1 :] + expected_loss = calculate_expected_loss_v2(best_variant, other_variants) + + if expected_loss >= EXPECTED_LOSS_SIGNIFICANCE_LEVEL: + return ExperimentSignificanceCode.HIGH_LOSS, expected_loss + + return ExperimentSignificanceCode.SIGNIFICANT, expected_loss + + return ExperimentSignificanceCode.LOW_WIN_PROBABILITY, 1.0 + + +def calculate_credible_intervals_v2(variants: list[ExperimentVariantFunnelsBaseStats]) -> dict[str, list[float]]: + """ + Calculate Bayesian credible intervals for conversion rates of each variant. + + This function computes the 95% credible intervals for the true conversion rate + of each variant, representing the range where we believe the true rate lies + with 95% probability. 
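The credible-interval helper introduced here reduces to two Beta quantiles per variant. A quick numeric check under the same Beta(1, 1) prior, with hypothetical counts of 120 conversions out of 1,000 users:

```python
# Quick check of the 95% credible interval described above, with hypothetical counts:
# 120 conversions out of 1000 users -> posterior Beta(1 + 120, 1 + 880).
from scipy import stats

alpha, beta = 1 + 120, 1 + 880
lower, upper = stats.beta.ppf([0.025, 0.975], alpha, beta)
print(f"95% credible interval ≈ [{lower:.3f}, {upper:.3f}]")  # roughly [0.10, 0.14]
```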
+ + Parameters: + ----------- + variants : list[ExperimentVariantFunnelsBaseStats] + List of all variants including control, containing success and failure counts + + Returns: + -------- + dict[str, list[float]] + Dictionary mapping variant keys to [lower, upper] credible intervals, where: + - lower is the 2.5th percentile of the posterior distribution + - upper is the 97.5th percentile of the posterior distribution + + Notes: + ------ + - Uses Beta distribution as the posterior + - Uses Beta(1,1) as the prior, which is uniform over [0,1] + - Returns 95% credible intervals + - Intervals become narrower with larger sample sizes + """ + intervals = {} + + for variant in variants: + # Add 1 to both successes and failures for Bayesian prior + alpha = ALPHA_PRIOR + variant.success_count + beta = BETA_PRIOR + variant.failure_count + + # Calculate 95% credible interval + lower, upper = stats.beta.ppf([0.025, 0.975], alpha, beta) + + intervals[variant.key] = [float(lower), float(upper)] + + return intervals diff --git a/posthog/hogql_queries/experiments/test/test_experiment_funnels_query_runner.py b/posthog/hogql_queries/experiments/test/test_experiment_funnels_query_runner.py index 3f3127fcb1b55..3dc11499b1393 100644 --- a/posthog/hogql_queries/experiments/test/test_experiment_funnels_query_runner.py +++ b/posthog/hogql_queries/experiments/test/test_experiment_funnels_query_runner.py @@ -113,6 +113,8 @@ def test_query_runner(self): ) result = query_runner.calculate() + self.assertEqual(result.stats_version, 1) + self.assertEqual(len(result.variants), 2) control_variant = next(variant for variant in result.variants if variant.key == "control") @@ -123,11 +125,88 @@ def test_query_runner(self): self.assertEqual(test_variant.success_count, 8) self.assertEqual(test_variant.failure_count, 2) - self.assertIn("control", result.probability) - self.assertIn("test", result.probability) + self.assertAlmostEqual(result.probability["control"], 0.2, delta=0.1) + self.assertAlmostEqual(result.probability["test"], 0.8, delta=0.1) - self.assertIn("control", result.credible_intervals) - self.assertIn("test", result.credible_intervals) + self.assertEqual(result.significant, False) + self.assertEqual(result.significance_code, ExperimentSignificanceCode.NOT_ENOUGH_EXPOSURE) + self.assertEqual(result.expected_loss, 1.0) + + self.assertAlmostEqual(result.credible_intervals["control"][0], 0.3, delta=0.1) + self.assertAlmostEqual(result.credible_intervals["control"][1], 0.8, delta=0.1) + self.assertAlmostEqual(result.credible_intervals["test"][0], 0.5, delta=0.1) + self.assertAlmostEqual(result.credible_intervals["test"][1], 0.9, delta=0.1) + + @freeze_time("2020-01-01T12:00:00Z") + def test_query_runner_v2(self): + feature_flag = self.create_feature_flag() + experiment = self.create_experiment(feature_flag=feature_flag) + + feature_flag_property = f"$feature/{feature_flag.key}" + + funnels_query = FunnelsQuery( + series=[EventsNode(event="$pageview"), EventsNode(event="purchase")], + dateRange={"date_from": "2020-01-01", "date_to": "2020-01-14"}, + ) + experiment_query = ExperimentFunnelsQuery( + experiment_id=experiment.id, + kind="ExperimentFunnelsQuery", + funnels_query=funnels_query, + stats_version=2, + ) + + experiment.metrics = [{"type": "primary", "query": experiment_query.model_dump()}] + experiment.save() + + for variant, purchase_count in [("control", 6), ("test", 8)]: + for i in range(10): + _create_person(distinct_ids=[f"user_{variant}_{i}"], team_id=self.team.pk) + _create_event( + team=self.team, + 
event="$pageview", + distinct_id=f"user_{variant}_{i}", + timestamp="2020-01-02T12:00:00Z", + properties={feature_flag_property: variant}, + ) + if i < purchase_count: + _create_event( + team=self.team, + event="purchase", + distinct_id=f"user_{variant}_{i}", + timestamp="2020-01-02T12:01:00Z", + properties={feature_flag_property: variant}, + ) + + flush_persons_and_events() + + query_runner = ExperimentFunnelsQueryRunner( + query=ExperimentFunnelsQuery(**experiment.metrics[0]["query"]), team=self.team + ) + result = query_runner.calculate() + + self.assertEqual(result.stats_version, 2) + + self.assertEqual(len(result.variants), 2) + + control_variant = next(variant for variant in result.variants if variant.key == "control") + test_variant = next(variant for variant in result.variants if variant.key == "test") + + self.assertEqual(control_variant.success_count, 6) + self.assertEqual(control_variant.failure_count, 4) + self.assertEqual(test_variant.success_count, 8) + self.assertEqual(test_variant.failure_count, 2) + + self.assertAlmostEqual(result.probability["control"], 0.2, delta=0.1) + self.assertAlmostEqual(result.probability["test"], 0.8, delta=0.1) + + self.assertEqual(result.significant, False) + self.assertEqual(result.significance_code, ExperimentSignificanceCode.NOT_ENOUGH_EXPOSURE) + self.assertEqual(result.expected_loss, 1.0) + + self.assertAlmostEqual(result.credible_intervals["control"][0], 0.3, delta=0.1) + self.assertAlmostEqual(result.credible_intervals["control"][1], 0.8, delta=0.1) + self.assertAlmostEqual(result.credible_intervals["test"][0], 0.5, delta=0.1) + self.assertAlmostEqual(result.credible_intervals["test"][1], 0.9, delta=0.1) @flaky(max_runs=10, min_passes=1) @freeze_time("2020-01-01T12:00:00Z") diff --git a/posthog/hogql_queries/experiments/test/test_funnels_statistics.py b/posthog/hogql_queries/experiments/test/test_funnels_statistics.py new file mode 100644 index 0000000000000..2206ff92b9305 --- /dev/null +++ b/posthog/hogql_queries/experiments/test/test_funnels_statistics.py @@ -0,0 +1,243 @@ +from posthog.hogql_queries.experiments import MIN_PROBABILITY_FOR_SIGNIFICANCE +from posthog.schema import ExperimentVariantFunnelsBaseStats, ExperimentSignificanceCode +from posthog.hogql_queries.experiments.funnels_statistics_v2 import ( + calculate_probabilities_v2, + are_results_significant_v2, + calculate_credible_intervals_v2, +) +from posthog.hogql_queries.experiments.funnels_statistics import ( + calculate_probabilities, + are_results_significant, + calculate_credible_intervals, +) +from posthog.test.base import APIBaseTest + + +def create_variant( + key: str, + success_count: int, + failure_count: int, +) -> ExperimentVariantFunnelsBaseStats: + return ExperimentVariantFunnelsBaseStats( + key=key, + success_count=success_count, + failure_count=failure_count, + ) + + +class TestExperimentFunnelStatistics(APIBaseTest): + def run_test_for_both_implementations(self, test_fn): + """Run the same test for both implementations""" + self.stats_version = 1 + # Run for original implementation + test_fn( + stats_version=1, + calculate_probabilities=calculate_probabilities, + are_results_significant=are_results_significant, + calculate_credible_intervals=calculate_credible_intervals, + ) + self.stats_version = 2 + # Run for v2 implementation + test_fn( + stats_version=2, + calculate_probabilities=calculate_probabilities_v2, + are_results_significant=are_results_significant_v2, + calculate_credible_intervals=calculate_credible_intervals_v2, + ) + + def 
test_small_sample_two_variants_not_significant(self): + """Test with small sample size, two variants, no clear winner""" + + def run_test(stats_version, calculate_probabilities, are_results_significant, calculate_credible_intervals): + control = create_variant("control", success_count=10, failure_count=90) + test = create_variant("test", success_count=15, failure_count=85) + + probabilities = calculate_probabilities(control, [test]) + significance, p_value = are_results_significant(control, [test], probabilities) + intervals = calculate_credible_intervals([control, test]) + + self.assertEqual(len(probabilities), 2) + if stats_version == 2: + self.assertAlmostEqual(probabilities[0], 0.15, delta=0.1) + self.assertAlmostEqual(probabilities[1], 0.85, delta=0.1) + self.assertEqual(significance, ExperimentSignificanceCode.LOW_WIN_PROBABILITY) + self.assertEqual(p_value, 1) + + # Check credible intervals + self.assertAlmostEqual(intervals["control"][0], 0.05, delta=0.05) + self.assertAlmostEqual(intervals["control"][1], 0.20, delta=0.05) + self.assertAlmostEqual(intervals["test"][0], 0.08, delta=0.05) + self.assertAlmostEqual(intervals["test"][1], 0.25, delta=0.05) + else: + # Original implementation behavior + self.assertTrue(0.1 < probabilities[0] < 0.5) + self.assertTrue(0.5 < probabilities[1] < 0.9) + self.assertEqual(significance, ExperimentSignificanceCode.LOW_WIN_PROBABILITY) + self.assertEqual(p_value, 1) + + # Original implementation intervals + self.assertAlmostEqual(intervals["control"][0], 0.05, delta=0.05) + self.assertAlmostEqual(intervals["control"][1], 0.20, delta=0.05) + self.assertAlmostEqual(intervals["test"][0], 0.08, delta=0.05) + self.assertAlmostEqual(intervals["test"][1], 0.25, delta=0.05) + + self.run_test_for_both_implementations(run_test) + + def test_large_sample_two_variants_significant(self): + """Test with large sample size, two variants, clear winner""" + + def run_test(stats_version, calculate_probabilities, are_results_significant, calculate_credible_intervals): + control = create_variant("control", success_count=1000, failure_count=9000) + test = create_variant("test", success_count=1500, failure_count=8500) + + probabilities = calculate_probabilities(control, [test]) + significance, p_value = are_results_significant(control, [test], probabilities) + intervals = calculate_credible_intervals([control, test]) + + self.assertEqual(len(probabilities), 2) + if stats_version == 2: + self.assertAlmostEqual(probabilities[1], 1.0, delta=0.05) + self.assertAlmostEqual(probabilities[0], 0.0, delta=0.05) + self.assertEqual(significance, ExperimentSignificanceCode.SIGNIFICANT) + self.assertEqual(p_value, 0) + + # Check credible intervals + self.assertAlmostEqual(intervals["control"][0], 0.095, delta=0.01) + self.assertAlmostEqual(intervals["control"][1], 0.105, delta=0.01) + self.assertAlmostEqual(intervals["test"][0], 0.145, delta=0.01) + self.assertAlmostEqual(intervals["test"][1], 0.155, delta=0.01) + else: + # Original implementation behavior + self.assertTrue(probabilities[1] > 0.5) # Test variant winning + self.assertTrue(probabilities[0] < 0.5) # Control variant losing + self.assertEqual(significance, ExperimentSignificanceCode.SIGNIFICANT) + self.assertLess(p_value, 0.05) + + # Original implementation intervals + self.assertAlmostEqual(intervals["control"][0], 0.095, delta=0.01) + self.assertAlmostEqual(intervals["control"][1], 0.105, delta=0.01) + self.assertAlmostEqual(intervals["test"][0], 0.145, delta=0.01) + self.assertAlmostEqual(intervals["test"][1], 0.155, 
delta=0.01) + + self.run_test_for_both_implementations(run_test) + + def test_many_variants_not_significant(self): + """Test with multiple variants, no clear winner""" + + def run_test(stats_version, calculate_probabilities, are_results_significant, calculate_credible_intervals): + control = create_variant("control", success_count=100, failure_count=900) + test_a = create_variant("test_a", success_count=98, failure_count=902) + test_b = create_variant("test_b", success_count=102, failure_count=898) + test_c = create_variant("test_c", success_count=101, failure_count=899) + + probabilities = calculate_probabilities(control, [test_a, test_b, test_c]) + significance, p_value = are_results_significant(control, [test_a, test_b, test_c], probabilities) + intervals = calculate_credible_intervals([control, test_a, test_b, test_c]) + + self.assertEqual(len(probabilities), 4) + if stats_version == 2: + self.assertTrue(all(p < MIN_PROBABILITY_FOR_SIGNIFICANCE for p in probabilities)) + self.assertEqual(significance, ExperimentSignificanceCode.LOW_WIN_PROBABILITY) + self.assertEqual(p_value, 1) + + # Check credible intervals overlap + # Check credible intervals for control and all test variants + self.assertAlmostEqual(intervals["control"][0], 0.09, delta=0.02) + self.assertAlmostEqual(intervals["control"][1], 0.12, delta=0.02) + self.assertAlmostEqual(intervals["test_a"][0], 0.09, delta=0.02) + self.assertAlmostEqual(intervals["test_a"][1], 0.12, delta=0.02) + self.assertAlmostEqual(intervals["test_b"][0], 0.09, delta=0.02) + self.assertAlmostEqual(intervals["test_b"][1], 0.12, delta=0.02) + self.assertAlmostEqual(intervals["test_c"][0], 0.09, delta=0.02) + self.assertAlmostEqual(intervals["test_c"][1], 0.12, delta=0.02) + else: + # Original implementation behavior + self.assertTrue(all(0.1 < p < 0.9 for p in probabilities)) + self.assertEqual(significance, ExperimentSignificanceCode.LOW_WIN_PROBABILITY) + self.assertEqual(p_value, 1) + + # Check credible intervals overlap + # Check credible intervals for control and all test variants + self.assertAlmostEqual(intervals["control"][0], 0.09, delta=0.02) + self.assertAlmostEqual(intervals["control"][1], 0.12, delta=0.02) + self.assertAlmostEqual(intervals["test_a"][0], 0.09, delta=0.02) + self.assertAlmostEqual(intervals["test_a"][1], 0.12, delta=0.02) + self.assertAlmostEqual(intervals["test_b"][0], 0.09, delta=0.02) + self.assertAlmostEqual(intervals["test_b"][1], 0.12, delta=0.02) + self.assertAlmostEqual(intervals["test_c"][0], 0.09, delta=0.02) + self.assertAlmostEqual(intervals["test_c"][1], 0.12, delta=0.02) + + self.run_test_for_both_implementations(run_test) + + def test_insufficient_sample_size(self): + """Test with sample size below threshold""" + + def run_test(stats_version, calculate_probabilities, are_results_significant, calculate_credible_intervals): + control = create_variant("control", success_count=5, failure_count=45) + test = create_variant("test", success_count=8, failure_count=42) + + probabilities = calculate_probabilities(control, [test]) + significance, p_value = are_results_significant(control, [test], probabilities) + intervals = calculate_credible_intervals([control, test]) + + self.assertEqual(len(probabilities), 2) + if stats_version == 2: + self.assertEqual(significance, ExperimentSignificanceCode.NOT_ENOUGH_EXPOSURE) + self.assertEqual(p_value, 1.0) + + # Check wide credible intervals due to small sample + self.assertTrue(intervals["control"][1] - intervals["control"][0] > 0.15) + self.assertTrue(intervals["test"][1] - 
intervals["test"][0] > 0.15) + else: + # Original implementation behavior + self.assertEqual(significance, ExperimentSignificanceCode.NOT_ENOUGH_EXPOSURE) + self.assertEqual(p_value, 1.0) + + # Check wide credible intervals + self.assertTrue(intervals["control"][1] - intervals["control"][0] > 0.15) + self.assertTrue(intervals["test"][1] - intervals["test"][0] > 0.15) + + self.run_test_for_both_implementations(run_test) + + def test_expected_loss_minimal_difference(self): + """Test expected loss when variants have very similar performance""" + + def run_test(stats_version, calculate_probabilities, are_results_significant, calculate_credible_intervals): + control = create_variant("control", success_count=1000, failure_count=9000) # 11% conversion + test = create_variant("test", success_count=1050, failure_count=8800) # 11.9% conversion + + probabilities = calculate_probabilities(control, [test]) + significance, expected_loss = are_results_significant(control, [test], probabilities) + + if stats_version == 2: + self.assertEqual(significance, ExperimentSignificanceCode.SIGNIFICANT) + # Expected loss should still be relatively small + self.assertLess(expected_loss, 0.03) # Less than 3% expected loss + self.assertGreater(expected_loss, 0) + else: + # Original implementation behavior + self.assertEqual(significance, ExperimentSignificanceCode.SIGNIFICANT) + self.assertLess(expected_loss, 0.03) # Less than 3% expected loss + self.assertGreater(expected_loss, 0) + + self.run_test_for_both_implementations(run_test) + + def test_expected_loss_test_variant_clear_winner(self): + """Test expected loss when one variant is clearly better""" + + def run_test(stats_version, calculate_probabilities, are_results_significant, calculate_credible_intervals): + control = create_variant("control", success_count=1000, failure_count=9000) # 11% conversion + test = create_variant("test", success_count=2000, failure_count=8000) # 20% conversion + + probabilities = calculate_probabilities(control, [test]) + significance, expected_loss = are_results_significant(control, [test], probabilities) + + if stats_version == 2: + self.assertEqual(significance, ExperimentSignificanceCode.SIGNIFICANT) + self.assertEqual(expected_loss, 0.0) + else: + # Original implementation behavior + self.assertEqual(significance, ExperimentSignificanceCode.SIGNIFICANT) + self.assertEqual(expected_loss, 0.0) + + self.run_test_for_both_implementations(run_test) diff --git a/posthog/schema.py b/posthog/schema.py index 564dcc321fa60..a46596350f23f 100644 --- a/posthog/schema.py +++ b/posthog/schema.py @@ -6292,6 +6292,7 @@ class QueryResponseAlternative15(BaseModel): probability: dict[str, float] significance_code: ExperimentSignificanceCode significant: bool + stats_version: Optional[int] = None variants: list[ExperimentVariantFunnelsBaseStats] @@ -6324,6 +6325,7 @@ class QueryResponseAlternative26(BaseModel): probability: dict[str, float] significance_code: ExperimentSignificanceCode significant: bool + stats_version: Optional[int] = None variants: list[ExperimentVariantFunnelsBaseStats] @@ -6461,6 +6463,7 @@ class CachedExperimentFunnelsQueryResponse(BaseModel): ) significance_code: ExperimentSignificanceCode significant: bool + stats_version: Optional[int] = None timezone: str variants: list[ExperimentVariantFunnelsBaseStats] @@ -6477,6 +6480,7 @@ class Response9(BaseModel): probability: dict[str, float] significance_code: ExperimentSignificanceCode significant: bool + stats_version: Optional[int] = None variants: 
list[ExperimentVariantFunnelsBaseStats] @@ -6508,6 +6512,7 @@ class ExperimentFunnelsQueryResponse(BaseModel): probability: dict[str, float] significance_code: ExperimentSignificanceCode significant: bool + stats_version: Optional[int] = None variants: list[ExperimentVariantFunnelsBaseStats] @@ -6640,6 +6645,7 @@ class ExperimentFunnelsQuery(BaseModel): ) name: Optional[str] = None response: Optional[ExperimentFunnelsQueryResponse] = None + stats_version: Optional[int] = None class FunnelCorrelationQuery(BaseModel): diff --git a/posthog/settings/__init__.py b/posthog/settings/__init__.py index c6067fd19c1f7..3e7ebc0b7c984 100644 --- a/posthog/settings/__init__.py +++ b/posthog/settings/__init__.py @@ -108,6 +108,7 @@ PROM_PUSHGATEWAY_ADDRESS: str | None = os.getenv("PROM_PUSHGATEWAY_ADDRESS", None) IN_UNIT_TESTING: bool = get_from_env("IN_UNIT_TESTING", False, type_cast=str_to_bool) +IN_EVAL_TESTING: bool = get_from_env("DEEPEVAL", False, type_cast=str_to_bool) HOGQL_INCREASED_MAX_EXECUTION_TIME: int = get_from_env("HOGQL_INCREASED_MAX_EXECUTION_TIME", 600, type_cast=int) diff --git a/posthog/temporal/data_imports/pipelines/sql_database/helpers.py b/posthog/temporal/data_imports/pipelines/sql_database/helpers.py index 0400a60b32fd5..9bf72a26f3c1e 100644 --- a/posthog/temporal/data_imports/pipelines/sql_database/helpers.py +++ b/posthog/temporal/data_imports/pipelines/sql_database/helpers.py @@ -24,7 +24,7 @@ def __init__( self, engine: Engine, table: Table, - chunk_size: int = 1000, + chunk_size: int = DEFAULT_CHUNK_SIZE, incremental: Optional[dlt.sources.incremental[Any]] = None, connect_args: Optional[list[str]] = None, db_incremental_field_last_value: Optional[Any] = None, diff --git a/posthog/temporal/data_imports/pipelines/sql_database_v2/__init__.py b/posthog/temporal/data_imports/pipelines/sql_database_v2/__init__.py index a3fc1c6b2838b..de868d62ee51d 100644 --- a/posthog/temporal/data_imports/pipelines/sql_database_v2/__init__.py +++ b/posthog/temporal/data_imports/pipelines/sql_database_v2/__init__.py @@ -15,6 +15,7 @@ from dlt.sources.credentials import ConnectionStringCredentials from posthog.settings.utils import get_from_env +from posthog.temporal.data_imports.pipelines.sql_database_v2.settings import DEFAULT_CHUNK_SIZE from posthog.temporal.data_imports.pipelines.sql_database_v2._json import BigQueryJSON from posthog.utils import str_to_bool from posthog.warehouse.models import ExternalDataSource @@ -252,7 +253,7 @@ def sql_database( schema: Optional[str] = dlt.config.value, metadata: Optional[MetaData] = None, table_names: Optional[list[str]] = dlt.config.value, - chunk_size: int = 50000, + chunk_size: int = DEFAULT_CHUNK_SIZE, backend: TableBackend = "pyarrow", detect_precision_hints: Optional[bool] = False, reflection_level: Optional[ReflectionLevel] = "full", diff --git a/posthog/temporal/data_imports/pipelines/sql_database_v2/helpers.py b/posthog/temporal/data_imports/pipelines/sql_database_v2/helpers.py index acd64c97aae99..74a79650caa15 100644 --- a/posthog/temporal/data_imports/pipelines/sql_database_v2/helpers.py +++ b/posthog/temporal/data_imports/pipelines/sql_database_v2/helpers.py @@ -18,6 +18,8 @@ from dlt.sources.credentials import ConnectionStringCredentials +from posthog.temporal.data_imports.pipelines.sql_database_v2.settings import DEFAULT_CHUNK_SIZE + from .arrow_helpers import row_tuples_to_arrow from .schema_types import ( default_table_adapter, @@ -44,7 +46,7 @@ def __init__( backend: TableBackend, table: Table, columns: TTableSchemaColumns, - 
chunk_size: int = 1000, + chunk_size: int = DEFAULT_CHUNK_SIZE, incremental: Optional[dlt.sources.incremental[Any]] = None, db_incremental_field_last_value: Optional[Any] = None, query_adapter_callback: Optional[TQueryAdapter] = None, @@ -302,7 +304,7 @@ class SqlTableResourceConfiguration(BaseConfiguration): table: Optional[str] = None schema: Optional[str] = None incremental: Optional[dlt.sources.incremental] = None # type: ignore[type-arg] - chunk_size: int = 50000 + chunk_size: int = DEFAULT_CHUNK_SIZE backend: TableBackend = "sqlalchemy" detect_precision_hints: Optional[bool] = None defer_table_reflect: Optional[bool] = False diff --git a/posthog/temporal/data_imports/pipelines/sql_database_v2/settings.py b/posthog/temporal/data_imports/pipelines/sql_database_v2/settings.py new file mode 100644 index 0000000000000..d730961c096e8 --- /dev/null +++ b/posthog/temporal/data_imports/pipelines/sql_database_v2/settings.py @@ -0,0 +1 @@ +DEFAULT_CHUNK_SIZE = 10_000 diff --git a/requirements.in b/requirements.in index faefd16d9294d..8c0b98d16587b 100644 --- a/requirements.in +++ b/requirements.in @@ -16,6 +16,7 @@ clickhouse-driver==0.2.7 clickhouse-pool==0.5.3 conditional-cache==1.2 cryptography==39.0.2 +deltalake==0.22.3 dj-database-url==0.5.0 Django~=4.2.15 django-axes==5.9.0 @@ -34,7 +35,6 @@ djangorestframework==3.15.1 djangorestframework-csv==2.1.1 djangorestframework-dataclasses==1.2.0 dlt==1.3.0 -dlt[deltalake]==1.3.0 dnspython==2.2.1 drf-exceptions-hog==0.4.0 drf-extensions==0.7.0 diff --git a/requirements.txt b/requirements.txt index 639c98066ccd4..e4a1521f8dd7a 100644 --- a/requirements.txt +++ b/requirements.txt @@ -138,8 +138,8 @@ defusedxml==0.6.0 # via # python3-openid # social-auth-core -deltalake==0.19.1 - # via dlt +deltalake==0.22.3 + # via -r requirements.in distro==1.9.0 # via openai dj-database-url==0.5.0 @@ -273,8 +273,6 @@ googleapis-common-protos==1.60.0 # via # google-api-core # grpcio-status -greenlet==3.1.1 - # via sqlalchemy grpcio==1.63.2 # via # -r requirements.in @@ -505,7 +503,6 @@ pyarrow==17.0.0 # via # -r requirements.in # deltalake - # dlt # sqlalchemy-bigquery pyasn1==0.5.0 # via
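Tying the experiment changes together: an ExperimentFunnelsQuery with stats_version=2 makes the query runner above call the v2 helpers instead of the original implementations. A minimal sketch of that path, feeding hand-built variant stats (hypothetical counts borrowed from the new tests) directly into the helpers rather than going through a real funnel run:

```python
# Minimal sketch of the stats_version=2 path, using hand-built variant stats
# (hypothetical counts) with the v2 helpers added in this diff.
from posthog.hogql_queries.experiments.funnels_statistics_v2 import (
    are_results_significant_v2,
    calculate_credible_intervals_v2,
    calculate_probabilities_v2,
)
from posthog.schema import ExperimentVariantFunnelsBaseStats

control = ExperimentVariantFunnelsBaseStats(key="control", success_count=1000, failure_count=9000)
test = ExperimentVariantFunnelsBaseStats(key="test", success_count=1500, failure_count=8500)

probabilities = calculate_probabilities_v2(control, [test])
significance_code, expected_loss = are_results_significant_v2(control, [test], probabilities)
credible_intervals = calculate_credible_intervals_v2([control, test])

print(probabilities)                       # roughly [0.0, 1.0] for this lopsided example
print(significance_code, expected_loss)    # SIGNIFICANT with near-zero expected loss
print(credible_intervals)                  # control around [0.095, 0.105], test around [0.145, 0.155]
```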