diff --git a/ee/clickhouse/models/test/__snapshots__/test_property.ambr b/ee/clickhouse/models/test/__snapshots__/test_property.ambr deleted file mode 100644 index cc8e77f83a0dc..0000000000000 --- a/ee/clickhouse/models/test/__snapshots__/test_property.ambr +++ /dev/null @@ -1,153 +0,0 @@ -# name: TestPropFormat.test_parse_groups - ' - SELECT uuid - FROM events - WHERE team_id = 2 - AND ((has(['val_1'], replaceRegexpAll(JSONExtractRaw(properties, 'attr_1'), '^"|"$', '')) - AND has(['val_2'], replaceRegexpAll(JSONExtractRaw(properties, 'attr_2'), '^"|"$', ''))) - OR (has(['val_2'], replaceRegexpAll(JSONExtractRaw(properties, 'attr_1'), '^"|"$', '')))) - ' ---- -# name: TestPropFormat.test_parse_groups_persons - ' - SELECT uuid - FROM events - WHERE team_id = 2 - AND ((distinct_id IN - (SELECT distinct_id - FROM - (SELECT distinct_id, - argMax(person_id, version) as person_id - FROM person_distinct_id2 - WHERE team_id = 2 - GROUP BY distinct_id - HAVING argMax(is_deleted, version) = 0) - WHERE person_id IN - (SELECT id - FROM - (SELECT id, - argMax(properties, person._timestamp) as properties, - max(is_deleted) as is_deleted - FROM person - WHERE team_id = 2 - GROUP BY id - HAVING is_deleted = 0) - WHERE has(['1@posthog.com'], replaceRegexpAll(JSONExtractRaw(properties, 'email'), '^"|"$', '')) ) )) - OR (distinct_id IN - (SELECT distinct_id - FROM - (SELECT distinct_id, - argMax(person_id, version) as person_id - FROM person_distinct_id2 - WHERE team_id = 2 - GROUP BY distinct_id - HAVING argMax(is_deleted, version) = 0) - WHERE person_id IN - (SELECT id - FROM - (SELECT id, - argMax(properties, person._timestamp) as properties, - max(is_deleted) as is_deleted - FROM person - WHERE team_id = 2 - GROUP BY id - HAVING is_deleted = 0) - WHERE has(['2@posthog.com'], replaceRegexpAll(JSONExtractRaw(properties, 'email'), '^"|"$', '')) ) ))) - ' ---- -# name: test_parse_groups_persons_edge_case_with_single_filter - ( - 'AND ( has(%(vglobalperson_0)s, replaceRegexpAll(JSONExtractRaw(person_props, %(kglobalperson_0)s), \'^"|"$\', \'\')))', - { - 'kglobalperson_0': 'email', - 'vglobalperson_0': [ - '1@posthog.com', - ], - }, - ) ---- -# name: test_parse_prop_clauses_defaults - ( - ' - AND ( has(%(vglobal_0)s, replaceRegexpAll(JSONExtractRaw(properties, %(kglobal_0)s), '^"|"$', '')) AND distinct_id IN ( - SELECT distinct_id - FROM ( - - SELECT distinct_id, argMax(person_id, version) as person_id - FROM person_distinct_id2 - WHERE team_id = 1 - GROUP BY distinct_id - HAVING argMax(is_deleted, version) = 0 - - ) - WHERE person_id IN - ( - SELECT id - FROM ( - SELECT id, argMax(properties, person._timestamp) as properties, max(is_deleted) as is_deleted - FROM person - WHERE team_id = %(team_id)s - GROUP BY id - HAVING is_deleted = 0 - ) - WHERE replaceRegexpAll(JSONExtractRaw(properties, %(kglobalperson_1)s), '^"|"$', '') ILIKE %(vglobalperson_1)s - ) - )) - ', - { - 'kglobal_0': 'event_prop', - 'kglobalperson_1': 'email', - 'vglobal_0': [ - 'value', - ], - 'vglobalperson_1': '%posthog%', - }, - ) ---- -# name: test_parse_prop_clauses_defaults.1 - ( - 'AND ( has(%(vglobal_0)s, replaceRegexpAll(JSONExtractRaw(properties, %(kglobal_0)s), \'^"|"$\', \'\')) AND replaceRegexpAll(JSONExtractRaw(person_props, %(kglobalperson_1)s), \'^"|"$\', \'\') ILIKE %(vglobalperson_1)s)', - { - 'kglobal_0': 'event_prop', - 'kglobalperson_1': 'email', - 'vglobal_0': [ - 'value', - ], - 'vglobalperson_1': '%posthog%', - }, - ) ---- -# name: test_parse_prop_clauses_defaults.2 - ( - 'AND ( has(%(vglobal_0)s, replaceRegexpAll(JSONExtractRaw(properties, %(kglobal_0)s), \'^"|"$\', \'\')) AND argMax(person."pmat_email", version) ILIKE %(vpersonquery_global_1)s)', - { - 'kglobal_0': 'event_prop', - 'kpersonquery_global_1': 'email', - 'vglobal_0': [ - 'value', - ], - 'vpersonquery_global_1': '%posthog%', - }, - ) ---- -# name: test_parse_prop_clauses_funnel_step_element_prepend_regression - ( - 'AND ( (match(elements_chain, %(PREPEND__text_0_attributes_regex)s)))', - { - 'PREPEND__text_0_attributes_regex': '(text="Insights1")', - }, - ) ---- -# name: test_parse_prop_clauses_precalculated_cohort - ( - ' - AND ( pdi.person_id IN ( - SELECT DISTINCT person_id FROM cohortpeople WHERE team_id = %(team_id)s AND cohort_id = %(global_cohort_id_0)s AND version = %(global_version_0)s - )) - ', - { - 'global_cohort_id_0': 47, - 'global_version_0': None, - }, - ) ---- diff --git a/ee/clickhouse/models/test/test_property.py b/ee/clickhouse/models/test/test_property.py index f55578878f91a..ee3a768b37733 100644 --- a/ee/clickhouse/models/test/test_property.py +++ b/ee/clickhouse/models/test/test_property.py @@ -1511,6 +1511,20 @@ def test_events(db, team) -> List[UUID]: "date_exact_including_seconds_and_milliseconds": f"{datetime(2021, 3, 31, 23, 59, 59, 12):%d/%m/%Y %H:%M:%S.%f}" }, ), + _create_event( + event="$pageview", + team=team, + distinct_id="whatever", + # new line character shouldn't be matched by a single regex dot + properties={"email": "test@post\nhog.com"}, + ), + _create_event( + event="$pageview", + team=team, + distinct_id="whatever", + # not a new line character - instead a single character - should match + properties={"email": "test@postnhog.com"}, + ), ] @@ -1749,6 +1763,9 @@ def clean_up_materialised_columns(): [20, 21], id="can match before date only values", ), + # Regression test, we were previously matching on newline characters + # this should match one of two possibles (how are you supposed to figure out the expected index 🙈) + pytest.param(Property(key="email", value=r"test@post.hog.com", operator="regex"), [28]), ] diff --git a/ee/clickhouse/views/test/__snapshots__/test_clickhouse_experiment_secondary_results.ambr b/ee/clickhouse/views/test/__snapshots__/test_clickhouse_experiment_secondary_results.ambr index 5fa656c60136d..ddaff244437e8 100644 --- a/ee/clickhouse/views/test/__snapshots__/test_clickhouse_experiment_secondary_results.ambr +++ b/ee/clickhouse/views/test/__snapshots__/test_clickhouse_experiment_secondary_results.ambr @@ -1,6 +1,6 @@ # name: ClickhouseTestExperimentSecondaryResults.test_basic_secondary_metric_results ' - /* user_id:132 celery:posthog.celery.sync_insight_caching_state */ + /* user_id:135 celery:posthog.celery.sync_insight_caching_state */ SELECT team_id, date_diff('second', max(timestamp), now()) AS age FROM events diff --git a/posthog/api/test/test_query.py b/posthog/api/test/test_query.py index d8bdb746a67b4..45e3570ffca71 100644 --- a/posthog/api/test/test_query.py +++ b/posthog/api/test/test_query.py @@ -277,6 +277,63 @@ def test_event_property_filter(self): response = self.client.post(f"/api/projects/{self.team.id}/query/", {"query": query.dict()}).json() self.assertEqual(len(response["results"]), 1) + @also_test_with_materialized_columns(event_properties=["example_value"]) + @snapshot_clickhouse_queries + def test_event_property_regex_is_patched_for_dotall_setting(self): + with freeze_time("2020-01-10 12:00:00"): + _create_person( + properties={"email": "tom@posthog.com"}, + distinct_ids=["2", "some-random-uid"], + team=self.team, + immediate=True, + ) + _create_event( + team=self.team, + event="demonstrate dot all", + distinct_id="2", + properties={ + "example_value": """a +b""" + }, + ) + with freeze_time("2020-01-10 12:11:00"): + _create_event( + team=self.team, + event="demonstrate dot all", + distinct_id="2", + # should match /a.b/ + properties={"example_value": "aab"}, + ) + with freeze_time("2020-01-10 12:11:00"): + _create_event( + team=self.team, + event="demonstrate dot all", + distinct_id="2", + # should match /a.b/ + properties={"example_value": "abb"}, + ) + with freeze_time("2020-01-10 12:11:00"): + _create_event( + team=self.team, + event="demonstrate dot all", + distinct_id="2", + # won't match /a.b/ + properties={"example_value": "abc"}, + ) + flush_persons_and_events() + + with freeze_time("2020-01-10 12:14:00"): + query = EventsQuery( + event="demonstrate dot all", + select=[ + "properties.example_value", + ], + properties=[{"key": "example_value", "value": "a.b", "operator": "regex", "type": "event"}], + ) + response = self.client.post(f"/api/projects/{self.team.id}/query/", {"query": query.dict()}).json() + assert "(?-s)" in response["hogql"] + assert [x[0] for x in response["results"]] == ["aab", "abb"] + @also_test_with_materialized_columns(event_properties=["key"], person_properties=["email"]) @snapshot_clickhouse_queries def test_person_property_filter(self): diff --git a/posthog/hogql/printer.py b/posthog/hogql/printer.py index acade9b195878..367a323697da9 100644 --- a/posthog/hogql/printer.py +++ b/posthog/hogql/printer.py @@ -542,16 +542,16 @@ def visit_compare_operation(self, node: ast.CompareOperation): elif node.op == ast.CompareOperationOp.GlobalNotIn: op = f"globalNotIn({left}, {right})" elif node.op == ast.CompareOperationOp.Regex: - op = f"match({left}, {right})" + op = f"match({left}, concat('(?-s)', {right}))" value_if_both_sides_are_null = True elif node.op == ast.CompareOperationOp.NotRegex: - op = f"not(match({left}, {right}))" + op = f"not(match({left}, concat('(?-s)', {right})))" value_if_one_side_is_null = True elif node.op == ast.CompareOperationOp.IRegex: - op = f"match({left}, concat('(?i)', {right}))" + op = f"match({left}, concat('(?i-s)', {right}))" value_if_both_sides_are_null = True elif node.op == ast.CompareOperationOp.NotIRegex: - op = f"not(match({left}, concat('(?i)', {right})))" + op = f"not(match({left}, concat('(?i-s)', {right})))" value_if_one_side_is_null = True elif node.op == ast.CompareOperationOp.Gt: op = f"greater({left}, {right})" diff --git a/posthog/hogql/property.py b/posthog/hogql/property.py index ce4ea3bdfe14f..e65fe204cd216 100644 --- a/posthog/hogql/property.py +++ b/posthog/hogql/property.py @@ -202,12 +202,18 @@ def property_to_expr( left=field, right=ast.Constant(value=f"%{value}%"), ) + # we follow re2 regex syntax and so does ClickHouse **except** + # For example, the string a\nb shouldn't match the pattern a.b, but it does in CH + # this is because According to the re2 docs, the s flag is false by default, + # but in CH it seems to be true by default. + # prepending (?-s) to the regex string will make it work as expected + # see https://github.com/ClickHouse/ClickHouse/issues/34603 elif operator == PropertyOperator.regex: - return ast.Call(name="match", args=[field, ast.Constant(value=value)]) + return ast.Call(name="match", args=[field, ast.Constant(value=f"(?-s){value}")]) elif operator == PropertyOperator.not_regex: return ast.Call( name="not", - args=[ast.Call(name="match", args=[field, ast.Constant(value=value)])], + args=[ast.Call(name="match", args=[field, ast.Constant(value=f"(?-s){value}")])], ) elif operator == PropertyOperator.exact or operator == PropertyOperator.is_date_exact: op = ast.CompareOperationOp.Eq diff --git a/posthog/hogql/test/test_property.py b/posthog/hogql/test/test_property.py index 52e57d9c61c76..7f7b179ee306e 100644 --- a/posthog/hogql/test/test_property.py +++ b/posthog/hogql/test/test_property.py @@ -141,11 +141,11 @@ def test_property_to_expr_event(self): ) self.assertEqual( self._property_to_expr({"type": "event", "key": "a", "value": ".*", "operator": "regex"}), - self._parse_expr("match(properties.a, '.*')"), + self._parse_expr("match(properties.a, '(?-s).*')"), ) self.assertEqual( self._property_to_expr({"type": "event", "key": "a", "value": ".*", "operator": "not_regex"}), - self._parse_expr("not(match(properties.a, '.*'))"), + self._parse_expr("not(match(properties.a, '(?-s).*'))"), ) self.assertEqual( self._property_to_expr({"type": "event", "key": "a", "value": [], "operator": "exact"}), @@ -203,7 +203,7 @@ def test_property_to_expr_event_list(self): ) self.assertEqual( self._property_to_expr({"type": "event", "key": "a", "value": ["b", "c"], "operator": "regex"}), - self._parse_expr("match(properties.a, 'b') or match(properties.a, 'c')"), + self._parse_expr("match(properties.a, '(?-s)b') or match(properties.a, '(?-s)c')"), ) # negative self.assertEqual( @@ -230,7 +230,7 @@ def test_property_to_expr_event_list(self): "operator": "not_regex", } ), - self._parse_expr("not(match(properties.a, 'b')) and not(match(properties.a, 'c'))"), + self._parse_expr("not(match(properties.a, '(?-s)b')) and not(match(properties.a, '(?-s)c'))"), ) def test_property_to_expr_feature(self): diff --git a/posthog/models/property/util.py b/posthog/models/property/util.py index c67a3d8cf3154..4d7d129615193 100644 --- a/posthog/models/property/util.py +++ b/posthog/models/property/util.py @@ -489,7 +489,13 @@ def prop_filter_json_extract( params = { "k{}_{}".format(prepend, idx): prop.key, - "v{}_{}".format(prepend, idx): prop.value, + # we follow re2 regex syntax and so does ClickHouse **except** + # For example, the string a\nb shouldn't match the pattern a.b, but it does in CH + # this is because According to the re2 docs, the s flag is false by default, + # but in CH it seems to be true by default. + # prepending (?-s) to the regex string will make it work as expected + # see https://github.com/ClickHouse/ClickHouse/issues/34603 + "v{}_{}".format(prepend, idx): f"(?-s){prop.value}", } return (