Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

feat(experiments): Restore internal user filter for dw #26911

Merged
merged 7 commits into from
Dec 16, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
80 changes: 33 additions & 47 deletions frontend/src/scenes/experiments/Metrics/PrimaryGoalTrends.tsx
Original file line number Diff line number Diff line change
Expand Up @@ -24,10 +24,6 @@ export function PrimaryGoalTrends(): JSX.Element {

const metricIdx = 0
const currentMetric = experiment.metrics[metricIdx] as ExperimentTrendsQuery
// :FLAG: CLEAN UP AFTER MIGRATION
const isDataWarehouseMetric =
featureFlags[FEATURE_FLAGS.EXPERIMENTS_HOGQL] &&
currentMetric.count_query.series[0].kind === NodeKind.DataWarehouseNode

return (
<>
Expand Down Expand Up @@ -63,18 +59,10 @@ export function PrimaryGoalTrends(): JSX.Element {
MathAvailability.All
)

if (series[0].kind === NodeKind.DataWarehouseNode) {
setTrendsMetric({
metricIdx,
series,
filterTestAccounts: false,
})
} else {
setTrendsMetric({
metricIdx,
series,
})
}
setTrendsMetric({
metricIdx,
series,
})
} else {
if (actions?.length) {
setExperiment({
Expand Down Expand Up @@ -113,37 +101,35 @@ export function PrimaryGoalTrends(): JSX.Element {
showNumericalPropsOnly={true}
{...commonActionFilterProps}
/>
{!isDataWarehouseMetric && (
<div className="mt-4 space-y-4">
<TestAccountFilterSwitch
checked={(() => {
// :FLAG: CLEAN UP AFTER MIGRATION
if (featureFlags[FEATURE_FLAGS.EXPERIMENTS_HOGQL]) {
const val = currentMetric.count_query?.filterTestAccounts
return hasFilters ? !!val : false
}
return hasFilters ? !!experiment.filters.filter_test_accounts : false
})()}
onChange={(checked: boolean) => {
// :FLAG: CLEAN UP AFTER MIGRATION
if (featureFlags[FEATURE_FLAGS.EXPERIMENTS_HOGQL]) {
setTrendsMetric({
metricIdx,
filterTestAccounts: checked,
})
} else {
setExperiment({
filters: {
...experiment.filters,
filter_test_accounts: checked,
},
})
}
}}
fullWidth
/>
</div>
)}
<div className="mt-4 space-y-4">
<TestAccountFilterSwitch
checked={(() => {
// :FLAG: CLEAN UP AFTER MIGRATION
if (featureFlags[FEATURE_FLAGS.EXPERIMENTS_HOGQL]) {
const val = currentMetric.count_query?.filterTestAccounts
return hasFilters ? !!val : false
}
return hasFilters ? !!experiment.filters.filter_test_accounts : false
})()}
onChange={(checked: boolean) => {
// :FLAG: CLEAN UP AFTER MIGRATION
if (featureFlags[FEATURE_FLAGS.EXPERIMENTS_HOGQL]) {
setTrendsMetric({
metricIdx,
filterTestAccounts: checked,
})
} else {
setExperiment({
filters: {
...experiment.filters,
filter_test_accounts: checked,
},
})
}
}}
fullWidth
/>
</div>
{isExperimentRunning && (
<LemonBanner type="info" className="mt-3 mb-3">
Preview insights are generated based on {EXPERIMENT_DEFAULT_DURATION} days of data. This can cause a
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -17,7 +17,7 @@
OBJECT_STORAGE_SECRET_ACCESS_KEY,
XDIST_SUFFIX,
)
from posthog.test.base import APIBaseTest, ClickhouseTestMixin, _create_event, flush_persons_and_events
from posthog.test.base import APIBaseTest, ClickhouseTestMixin, _create_event, _create_person, flush_persons_and_events
from freezegun import freeze_time
from typing import cast
from django.utils import timezone
Expand Down Expand Up @@ -198,10 +198,12 @@ def create_data_warehouse_table_with_usage(self):

path_to_s3_object = "s3://" + OBJECT_STORAGE_BUCKET + f"/{TEST_BUCKET}"

id = pa.array(["1", "2", "3", "4", "5"])
date = pa.array(["2023-01-01", "2023-01-02", "2023-01-03", "2023-01-06", "2023-01-07"])
user_id = pa.array(["user_control_0", "user_test_1", "user_test_2", "user_test_3", "user_extra"])
usage = pa.array([1000, 500, 750, 800, 900])
id = pa.array(["1", "2", "3", "4", "5", "6"])
date = pa.array(["2023-01-01", "2023-01-02", "2023-01-03", "2023-01-04", "2023-01-06", "2023-01-07"])
user_id = pa.array(
["user_control_0", "user_test_1", "user_test_2", "internal_test_1", "user_test_3", "user_extra"]
)
usage = pa.array([1000, 500, 750, 100000, 800, 900])
names = ["id", "ds", "userid", "usage"]

pq.write_to_dataset(
Expand Down Expand Up @@ -244,6 +246,15 @@ def create_data_warehouse_table_with_usage(self):
field_name="events",
configuration={"experiments_optimized": True, "experiments_timestamp_key": "ds"},
)

DataWarehouseJoin.objects.create(
team=self.team,
source_table_name=table_name,
source_table_key="userid",
joining_table_name="persons",
joining_table_key="properties.$user_id",
field_name="person",
)
return table_name

@freeze_time("2020-01-01T12:00:00Z")
Expand Down Expand Up @@ -804,6 +815,15 @@ def test_query_runner_with_data_warehouse_series_no_end_date_and_nested_id(self)

feature_flag_property = f"$feature/{feature_flag.key}"

self.team.test_account_filters = [
{
"key": "email",
"value": "@posthog.com",
"operator": "not_icontains",
"type": "person",
}
]
self.team.save()
count_query = TrendsQuery(
series=[
DataWarehouseNode(
Expand All @@ -816,9 +836,10 @@ def test_query_runner_with_data_warehouse_series_no_end_date_and_nested_id(self)
math_property="usage",
math_property_type="data_warehouse_properties",
)
]
],
filterTestAccounts=True,
)
exposure_query = TrendsQuery(series=[EventsNode(event="$feature_flag_called")])
exposure_query = TrendsQuery(series=[EventsNode(event="$feature_flag_called")], filterTestAccounts=True)

experiment_query = ExperimentTrendsQuery(
experiment_id=experiment.id,
Expand Down Expand Up @@ -846,6 +867,25 @@ def test_query_runner_with_data_warehouse_series_no_end_date_and_nested_id(self)
timestamp=datetime(2023, 1, i + 1),
)

_create_person(
team=self.team,
distinct_ids=["internal_test_1"],
properties={"email": "[email protected]", "$user_id": "internal_test_1"},
)

_create_event(
team=self.team,
event="$feature_flag_called",
distinct_id="internal_test_1",
properties={
feature_flag_property: "test",
"$feature_flag_response": "test",
"$feature_flag": feature_flag.key,
"$user_id": "internal_test_1",
},
timestamp=datetime(2023, 1, 3),
)
Comment on lines +876 to +887
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I’d prefer a clearer test to explicitly verify that this event is excluded from the returned data. It might be better to have a separate method specifically for testing the filtering of test users.

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

How might you suggest constructing the test?

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Something like:

  • Only populate the data warehouse records with rows including the test user id
  • The experiment query should return no result

My point is: we’re creating a test user and an event generated by them. Am I missing something, or what exactly are we doing with this user and event later on? How are we ensuring that this event is excluded from the results?

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

How are we ensuring that this event is excluded from the results?

It's inferred. These values will change significantly if the event is included in results:

https://github.com/PostHog/posthog/pull/26911/files#diff-36098f224702c73f16f7fce4f65403a8d9f840e079324042dc4dabeac1176f4dL924-R971

The way I went about writing the test was:

  1. Add the data warehouse row and event so the test was failing.
  2. Add the persons table join and filterTestAccounts=True statement so the test is passing with the original conditions again.

I think what you suggest is marginally more explicit, but it could also turn out to be unexpectedly erroneous in other ways.

I updated the test to also run the query with filterTestAccounts=false and assert the results: 5ca3fd3

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Sounds good 👍


# "user_test_3" first exposure (feature_flag_property="control") is on 2023-01-03
# "user_test_3" relevant exposure (feature_flag_property="test") is on 2023-01-04
# "user_test_3" other event (feature_flag_property="control") is on 2023-01-05
Expand Down Expand Up @@ -906,8 +946,12 @@ def test_query_runner_with_data_warehouse_series_no_end_date_and_nested_id(self)
)

# Assert the expected join condition in the clickhouse SQL
expected_join_condition = f"and(equals(events.team_id, {query_runner.count_query_runner.team.id}), equals(event, %(hogql_val_8)s), greaterOrEquals(timestamp, assumeNotNull(parseDateTime64BestEffortOrNull(%(hogql_val_9)s, 6, %(hogql_val_10)s))), lessOrEquals(timestamp, assumeNotNull(parseDateTime64BestEffortOrNull(%(hogql_val_11)s, 6, %(hogql_val_12)s))))) AS e__events ON"
self.assertIn(expected_join_condition, str(response.clickhouse))
expected_join_condition = f"and(equals(events.team_id, {query_runner.count_query_runner.team.id}), equals(event, %(hogql_val_12)s), greaterOrEquals(timestamp, assumeNotNull(parseDateTime64BestEffortOrNull(%(hogql_val_13)s, 6, %(hogql_val_14)s))), lessOrEquals(timestamp, assumeNotNull(parseDateTime64BestEffortOrNull(%(hogql_val_15)s, 6, %(hogql_val_16)s))))) AS e__events ON"
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Just thinking out loud - how fragile is it to have a test set up like this, where we rely on checking the generated ClickHouse SQL? This could break if the hogql_val names change, even though it wouldn't actually affect our functionality.

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Correct: it does break when the hogql_val names change. I'm fine with that right now, though. It's a brittle test for a brittle piece of code, and I'd like for it to break noisily if anything changes.

I added a more descriptive exception message for anyone who runs into it though: 9fb4210

self.assertIn(
expected_join_condition,
str(response.clickhouse),
"Please check to make sure the timestamp statements are included in the ASOF LEFT JOIN select statement. This may also fail if the placeholder numbers have changed.",
)

result = query_runner.calculate()

Expand Down Expand Up @@ -935,6 +979,65 @@ def test_query_runner_with_data_warehouse_series_no_end_date_and_nested_id(self)
[0.0, 500.0, 1250.0, 1250.0, 1250.0, 2050.0, 2050.0, 2050.0, 2050.0, 2050.0],
)

# Run the query again with filterTestAccounts=False
# as a point of comparison with the results above
count_query = TrendsQuery(
series=[
DataWarehouseNode(
id=table_name,
distinct_id_field="userid",
id_field="id",
table_name=table_name,
timestamp_field="ds",
math="avg",
math_property="usage",
math_property_type="data_warehouse_properties",
)
],
filterTestAccounts=False,
)
exposure_query = TrendsQuery(series=[EventsNode(event="$feature_flag_called")], filterTestAccounts=False)

experiment_query = ExperimentTrendsQuery(
experiment_id=experiment.id,
kind="ExperimentTrendsQuery",
count_query=count_query,
exposure_query=exposure_query,
)

experiment.metrics = [{"type": "primary", "query": experiment_query.model_dump()}]
experiment.save()

query_runner = ExperimentTrendsQueryRunner(
query=ExperimentTrendsQuery(**experiment.metrics[0]["query"]), team=self.team
)
with freeze_time("2023-01-07"):
result = query_runner.calculate()

trend_result = cast(ExperimentTrendsQueryResponse, result)

self.assertEqual(len(result.variants), 2)

control_result = next(variant for variant in trend_result.variants if variant.key == "control")
test_result = next(variant for variant in trend_result.variants if variant.key == "test")

control_insight = next(variant for variant in trend_result.insight if variant["breakdown_value"] == "control")
test_insight = next(variant for variant in trend_result.insight if variant["breakdown_value"] == "test")

self.assertEqual(control_result.count, 1000)
self.assertEqual(test_result.count, 102050)
self.assertEqual(control_result.absolute_exposure, 1)
self.assertEqual(test_result.absolute_exposure, 4)

self.assertEqual(
control_insight["data"][:10],
[1000.0, 1000.0, 1000.0, 1000.0, 1000.0, 1000.0, 1000.0, 1000.0, 1000.0, 1000.0],
)
self.assertEqual(
test_insight["data"][:10],
[0.0, 500.0, 1250.0, 101250.0, 101250.0, 102050.0, 102050.0, 102050.0, 102050.0, 102050.0],
)

def test_query_runner_with_data_warehouse_series_expected_query(self):
table_name = self.create_data_warehouse_table_with_payments()

Expand Down Expand Up @@ -1004,7 +1107,11 @@ def test_query_runner_with_data_warehouse_series_expected_query(self):

# Assert the expected join condition in the clickhouse SQL
expected_join_condition = f"and(equals(events.team_id, {query_runner.count_query_runner.team.id}), equals(event, %(hogql_val_7)s), greaterOrEquals(timestamp, assumeNotNull(parseDateTime64BestEffortOrNull(%(hogql_val_8)s, 6, %(hogql_val_9)s))), lessOrEquals(timestamp, assumeNotNull(parseDateTime64BestEffortOrNull(%(hogql_val_10)s, 6, %(hogql_val_11)s))))) AS e__events ON"
self.assertIn(expected_join_condition, str(response.clickhouse))
self.assertIn(
expected_join_condition,
str(response.clickhouse),
"Please check to make sure the timestamp statements are included in the ASOF LEFT JOIN select statement. This may also fail if the placeholder numbers have changed.",
)

result = query_runner.calculate()

Expand Down
Loading