Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

fix: re2 dot all flag for non-hogql filters #19487

Closed
wants to merge 7 commits into from
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
153 changes: 0 additions & 153 deletions ee/clickhouse/models/test/__snapshots__/test_property.ambr

This file was deleted.

17 changes: 17 additions & 0 deletions ee/clickhouse/models/test/test_property.py
Original file line number Diff line number Diff line change
Expand Up @@ -1511,6 +1511,20 @@ def test_events(db, team) -> List[UUID]:
"date_exact_including_seconds_and_milliseconds": f"{datetime(2021, 3, 31, 23, 59, 59, 12):%d/%m/%Y %H:%M:%S.%f}"
},
),
_create_event(
event="$pageview",
team=team,
distinct_id="whatever",
# new line character shouldn't be matched by a single regex dot
properties={"email": "test@post\nhog.com"},
),
_create_event(
event="$pageview",
team=team,
distinct_id="whatever",
# not a new line character - instead a single character - should match
properties={"email": "[email protected]"},
),
]


Expand Down Expand Up @@ -1749,6 +1763,9 @@ def clean_up_materialised_columns():
[20, 21],
id="can match before date only values",
),
# Regression test, we were previously matching on newline characters
# this should match one of two possibles (how are you supposed to figure out the expected index 🙈)
pytest.param(Property(key="email", value=r"[email protected]", operator="regex"), [28]),
]


Expand Down
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
# name: ClickhouseTestExperimentSecondaryResults.test_basic_secondary_metric_results
'
/* user_id:132 celery:posthog.celery.sync_insight_caching_state */
/* user_id:135 celery:posthog.celery.sync_insight_caching_state */
SELECT team_id,
date_diff('second', max(timestamp), now()) AS age
FROM events
Expand Down
57 changes: 57 additions & 0 deletions posthog/api/test/test_query.py
Original file line number Diff line number Diff line change
Expand Up @@ -277,6 +277,63 @@ def test_event_property_filter(self):
response = self.client.post(f"/api/projects/{self.team.id}/query/", {"query": query.dict()}).json()
self.assertEqual(len(response["results"]), 1)

@also_test_with_materialized_columns(event_properties=["example_value"])
@snapshot_clickhouse_queries
def test_event_property_regex_is_patched_for_dotall_setting(self):
with freeze_time("2020-01-10 12:00:00"):
_create_person(
properties={"email": "[email protected]"},
distinct_ids=["2", "some-random-uid"],
team=self.team,
immediate=True,
)
_create_event(
team=self.team,
event="demonstrate dot all",
distinct_id="2",
properties={
"example_value": """a
b"""
},
)
with freeze_time("2020-01-10 12:11:00"):
_create_event(
team=self.team,
event="demonstrate dot all",
distinct_id="2",
# should match /a.b/
properties={"example_value": "aab"},
)
with freeze_time("2020-01-10 12:11:00"):
_create_event(
team=self.team,
event="demonstrate dot all",
distinct_id="2",
# should match /a.b/
properties={"example_value": "abb"},
)
with freeze_time("2020-01-10 12:11:00"):
_create_event(
team=self.team,
event="demonstrate dot all",
distinct_id="2",
# won't match /a.b/
properties={"example_value": "abc"},
)
flush_persons_and_events()

with freeze_time("2020-01-10 12:14:00"):
query = EventsQuery(
event="demonstrate dot all",
select=[
"properties.example_value",
],
properties=[{"key": "example_value", "value": "a.b", "operator": "regex", "type": "event"}],
)
response = self.client.post(f"/api/projects/{self.team.id}/query/", {"query": query.dict()}).json()
assert "(?-s)" in response["hogql"]
assert [x[0] for x in response["results"]] == ["aab", "abb"]

@also_test_with_materialized_columns(event_properties=["key"], person_properties=["email"])
@snapshot_clickhouse_queries
def test_person_property_filter(self):
Expand Down
8 changes: 4 additions & 4 deletions posthog/hogql/printer.py
Original file line number Diff line number Diff line change
Expand Up @@ -542,16 +542,16 @@ def visit_compare_operation(self, node: ast.CompareOperation):
elif node.op == ast.CompareOperationOp.GlobalNotIn:
op = f"globalNotIn({left}, {right})"
elif node.op == ast.CompareOperationOp.Regex:
op = f"match({left}, {right})"
op = f"match({left}, concat('(?-s)', {right}))"
value_if_both_sides_are_null = True
elif node.op == ast.CompareOperationOp.NotRegex:
op = f"not(match({left}, {right}))"
op = f"not(match({left}, concat('(?-s)', {right})))"
value_if_one_side_is_null = True
elif node.op == ast.CompareOperationOp.IRegex:
op = f"match({left}, concat('(?i)', {right}))"
op = f"match({left}, concat('(?i-s)', {right}))"
value_if_both_sides_are_null = True
elif node.op == ast.CompareOperationOp.NotIRegex:
op = f"not(match({left}, concat('(?i)', {right})))"
op = f"not(match({left}, concat('(?i-s)', {right})))"
value_if_one_side_is_null = True
elif node.op == ast.CompareOperationOp.Gt:
op = f"greater({left}, {right})"
Expand Down
10 changes: 8 additions & 2 deletions posthog/hogql/property.py
Original file line number Diff line number Diff line change
Expand Up @@ -202,12 +202,18 @@ def property_to_expr(
left=field,
right=ast.Constant(value=f"%{value}%"),
)
# we follow re2 regex syntax and so does ClickHouse **except**
# For example, the string a\nb shouldn't match the pattern a.b, but it does in CH
# this is because According to the re2 docs, the s flag is false by default,
# but in CH it seems to be true by default.
# prepending (?-s) to the regex string will make it work as expected
# see https://github.com/ClickHouse/ClickHouse/issues/34603
elif operator == PropertyOperator.regex:
return ast.Call(name="match", args=[field, ast.Constant(value=value)])
return ast.Call(name="match", args=[field, ast.Constant(value=f"(?-s){value}")])
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This definitely looks correct.

elif operator == PropertyOperator.not_regex:
return ast.Call(
name="not",
args=[ast.Call(name="match", args=[field, ast.Constant(value=value)])],
args=[ast.Call(name="match", args=[field, ast.Constant(value=f"(?-s){value}")])],
)
elif operator == PropertyOperator.exact or operator == PropertyOperator.is_date_exact:
op = ast.CompareOperationOp.Eq
Expand Down
8 changes: 4 additions & 4 deletions posthog/hogql/test/test_property.py
Original file line number Diff line number Diff line change
Expand Up @@ -141,11 +141,11 @@ def test_property_to_expr_event(self):
)
self.assertEqual(
self._property_to_expr({"type": "event", "key": "a", "value": ".*", "operator": "regex"}),
self._parse_expr("match(properties.a, '.*')"),
self._parse_expr("match(properties.a, '(?-s).*')"),
)
self.assertEqual(
self._property_to_expr({"type": "event", "key": "a", "value": ".*", "operator": "not_regex"}),
self._parse_expr("not(match(properties.a, '.*'))"),
self._parse_expr("not(match(properties.a, '(?-s).*'))"),
)
self.assertEqual(
self._property_to_expr({"type": "event", "key": "a", "value": [], "operator": "exact"}),
Expand Down Expand Up @@ -203,7 +203,7 @@ def test_property_to_expr_event_list(self):
)
self.assertEqual(
self._property_to_expr({"type": "event", "key": "a", "value": ["b", "c"], "operator": "regex"}),
self._parse_expr("match(properties.a, 'b') or match(properties.a, 'c')"),
self._parse_expr("match(properties.a, '(?-s)b') or match(properties.a, '(?-s)c')"),
)
# negative
self.assertEqual(
Expand All @@ -230,7 +230,7 @@ def test_property_to_expr_event_list(self):
"operator": "not_regex",
}
),
self._parse_expr("not(match(properties.a, 'b')) and not(match(properties.a, 'c'))"),
self._parse_expr("not(match(properties.a, '(?-s)b')) and not(match(properties.a, '(?-s)c'))"),
)

def test_property_to_expr_feature(self):
Expand Down
8 changes: 7 additions & 1 deletion posthog/models/property/util.py
Original file line number Diff line number Diff line change
Expand Up @@ -489,7 +489,13 @@ def prop_filter_json_extract(

params = {
"k{}_{}".format(prepend, idx): prop.key,
"v{}_{}".format(prepend, idx): prop.value,
# we follow re2 regex syntax and so does ClickHouse **except**
# For example, the string a\nb shouldn't match the pattern a.b, but it does in CH
# this is because According to the re2 docs, the s flag is false by default,
# but in CH it seems to be true by default.
# prepending (?-s) to the regex string will make it work as expected
# see https://github.com/ClickHouse/ClickHouse/issues/34603
"v{}_{}".format(prepend, idx): f"(?-s){prop.value}",
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

pretty sure this is right

}

return (
Expand Down
Loading