From bcd2866a3f4c04d671546c9f4b36da842e836b6e Mon Sep 17 00:00:00 2001 From: Jodi Jang Date: Wed, 27 Nov 2024 12:04:54 -0800 Subject: [PATCH] fix(similarity): Limit > 30 system frame check to Java --- .../group_similar_issues_embeddings.py | 4 +- src/sentry/seer/similarity/utils.py | 19 ++- tests/sentry/grouping/ingest/test_seer.py | 119 +++++++++++++++++- .../test_group_similar_issues_embeddings.py | 2 +- tests/sentry/seer/similarity/test_utils.py | 34 ++++- .../test_backfill_seer_grouping_records.py | 4 +- 6 files changed, 168 insertions(+), 14 deletions(-) diff --git a/src/sentry/issues/endpoints/group_similar_issues_embeddings.py b/src/sentry/issues/endpoints/group_similar_issues_embeddings.py index fff3dcff065ade..6d5e17476dff92 100644 --- a/src/sentry/issues/endpoints/group_similar_issues_embeddings.py +++ b/src/sentry/issues/endpoints/group_similar_issues_embeddings.py @@ -84,7 +84,9 @@ def get(self, request: Request, group) -> Response: if latest_event and event_content_has_stacktrace(latest_event): grouping_info = get_grouping_info(None, project=group.project, event=latest_event) try: - stacktrace_string = get_stacktrace_string(grouping_info) + stacktrace_string = get_stacktrace_string( + grouping_info, platform=latest_event.platform + ) except TooManyOnlySystemFramesException: stacktrace_string = "" diff --git a/src/sentry/seer/similarity/utils.py b/src/sentry/seer/similarity/utils.py index 29ce438ac8865e..b92fd1858d81df 100644 --- a/src/sentry/seer/similarity/utils.py +++ b/src/sentry/seer/similarity/utils.py @@ -144,6 +144,19 @@ "ruby-rails", ] ) +SYSTEM_FRAME_CHECK_PLATFORMS = frozenset( + [ + "java", + "java-android", + "java-appengine", + "java-log4j", + "java-log4j2", + "java-logging", + "java-logback", + "java-spring", + "java-spring-boot", + ] +) BASE64_ENCODED_PREFIXES = [ "data:text/html;base64", "data:text/javascript;base64", @@ -169,7 +182,7 @@ def _get_value_if_exists(exception_value: dict[str, Any]) -> str: return exception_value["values"][0] if exception_value.get("values") else "" -def get_stacktrace_string(data: dict[str, Any]) -> str: +def get_stacktrace_string(data: dict[str, Any], platform: str | None = None) -> str: """Format a stacktrace string from the grouping information.""" app_hash = get_path(data, "app", "hash") app_component = get_path(data, "app", "component", "values") @@ -280,7 +293,7 @@ def _process_frames(frames: list[dict[str, Any]]) -> list[str]: exc_value = _get_value_if_exists(exception_value) elif exception_value.get("id") == "stacktrace" and frame_count < MAX_FRAME_COUNT: frame_strings = _process_frames(exception_value["values"]) - if is_frames_truncated and not app_hash: + if platform in SYSTEM_FRAME_CHECK_PLATFORMS and is_frames_truncated and not app_hash: raise TooManyOnlySystemFramesException if has_no_filename_or_module: raise NoFilenameOrModuleException @@ -323,7 +336,7 @@ def get_stacktrace_string_with_metrics( data: dict[str, Any], platform: str | None, referrer: ReferrerOptions ) -> str | None: try: - stacktrace_string = get_stacktrace_string(data) + stacktrace_string = get_stacktrace_string(data, platform) except TooManyOnlySystemFramesException: platform = platform if platform else "unknown" metrics.incr( diff --git a/tests/sentry/grouping/ingest/test_seer.py b/tests/sentry/grouping/ingest/test_seer.py index 7af07fe9a1a8d7..f46f5713b6e962 100644 --- a/tests/sentry/grouping/ingest/test_seer.py +++ b/tests/sentry/grouping/ingest/test_seer.py @@ -1,6 +1,6 @@ from dataclasses import asdict from time import time -from unittest.mock import MagicMock, Mock, patch +from unittest.mock import ANY, MagicMock, Mock, call, patch from sentry import options from sentry.conf.server import SEER_SIMILARITY_MODEL_VERSION @@ -368,7 +368,7 @@ def test_too_many_only_system_frames(self, mock_metrics: Mock) -> None: } ] }, - "platform": "python", + "platform": "java", }, ) expected_metadata = { @@ -384,7 +384,7 @@ def test_too_many_only_system_frames(self, mock_metrics: Mock) -> None: mock_metrics.incr.assert_any_call( "grouping.similarity.over_threshold_only_system_frames", sample_rate=sample_rate, - tags={"platform": "python", "referrer": "ingest"}, + tags={"platform": "java", "referrer": "ingest"}, ) mock_metrics.incr.assert_any_call( "grouping.similarity.did_call_seer", @@ -395,6 +395,58 @@ def test_too_many_only_system_frames(self, mock_metrics: Mock) -> None: }, ) + @patch("sentry.seer.similarity.utils.metrics") + def test_too_many_only_system_frames_invalid_platform(self, mock_metrics: Mock) -> None: + type = "FailedToFetchError" + value = "Charlie didn't bring the ball back" + context_line = f"raise {type}('{value}')" + new_event = Event( + project_id=self.project.id, + event_id="22312012112120120908201304152013", + data={ + "title": f"{type}('{value}')", + "exception": { + "values": [ + { + "type": type, + "value": value, + "stacktrace": { + "frames": [ + { + "function": f"play_fetch_{i}", + "filename": f"dogpark{i}.py", + "context_line": context_line, + } + for i in range(MAX_FRAME_COUNT + 1) + ] + }, + } + ] + }, + "platform": "python", + }, + ) + expected_metadata = { + "similarity_model_version": SEER_SIMILARITY_MODEL_VERSION, + "results": [], + } + assert get_seer_similar_issues(new_event, new_event.get_grouping_variants()) == ( + expected_metadata, + None, + ) + + assert ( + call( + "grouping.similarity.did_call_seer", + sample_rate=1.0, + tags={ + "call_made": False, + "blocker": "over-threshold-only-system-frames", + }, + ) + not in mock_metrics.incr.call_args_list + ) + class TestMaybeCheckSeerForMatchingGroupHash(TestCase): @@ -486,7 +538,7 @@ def test_too_many_only_system_frames_maybe_check_seer_for_matching_group_hash( } ] }, - "platform": "python", + "platform": "java", }, ) @@ -502,7 +554,7 @@ def test_too_many_only_system_frames_maybe_check_seer_for_matching_group_hash( mock_metrics.incr.assert_any_call( "grouping.similarity.over_threshold_only_system_frames", sample_rate=sample_rate, - tags={"platform": "python", "referrer": "ingest"}, + tags={"platform": "java", "referrer": "ingest"}, ) mock_metrics.incr.assert_any_call( "grouping.similarity.did_call_seer", @@ -514,3 +566,60 @@ def test_too_many_only_system_frames_maybe_check_seer_for_matching_group_hash( ) mock_get_similar_issues.assert_not_called() + + @patch("sentry.grouping.ingest.seer.get_similarity_data_from_seer", return_value=[]) + def test_too_many_only_system_frames_maybe_check_seer_for_matching_group_hash_invalid_platform( + self, mock_get_similarity_data: MagicMock + ) -> None: + self.project.update_option("sentry:similarity_backfill_completed", int(time())) + + type = "FailedToFetchError" + value = "Charlie didn't bring the ball back" + context_line = f"raise {type}('{value}')" + new_event = Event( + project_id=self.project.id, + event_id="22312012112120120908201304152013", + data={ + "title": f"{type}('{value}')", + "exception": { + "values": [ + { + "type": type, + "value": value, + "stacktrace": { + "frames": [ + { + "function": f"play_fetch_{i}", + "filename": f"dogpark{i}.py", + "context_line": context_line, + } + for i in range(MAX_FRAME_COUNT + 1) + ] + }, + } + ] + }, + "platform": "python", + }, + ) + + GroupHash.objects.create( + project=self.project, group=new_event.group, hash=new_event.get_primary_hash() + ) + group_hashes = list(GroupHash.objects.filter(project_id=self.project.id)) + maybe_check_seer_for_matching_grouphash( + new_event, new_event.get_grouping_variants(), group_hashes + ) + + mock_get_similarity_data.assert_called_with( + { + "event_id": new_event.event_id, + "hash": new_event.get_primary_hash(), + "project_id": self.project.id, + "stacktrace": ANY, + "exception_type": "FailedToFetchError", + "k": 1, + "referrer": "ingest", + "use_reranking": True, + } + ) diff --git a/tests/sentry/issues/endpoints/test_group_similar_issues_embeddings.py b/tests/sentry/issues/endpoints/test_group_similar_issues_embeddings.py index e56f054499a1cd..3810f7a20f0cc3 100644 --- a/tests/sentry/issues/endpoints/test_group_similar_issues_embeddings.py +++ b/tests/sentry/issues/endpoints/test_group_similar_issues_embeddings.py @@ -764,7 +764,7 @@ def test_too_many_system_frames(self): } ] }, - "platform": "python", + "platform": "java", } new_event = self.store_event(data=error_data, project_id=self.project) assert new_event.group diff --git a/tests/sentry/seer/similarity/test_utils.py b/tests/sentry/seer/similarity/test_utils.py index 8675c4a2e2b6a2..25a2efad431a03 100644 --- a/tests/sentry/seer/similarity/test_utils.py +++ b/tests/sentry/seer/similarity/test_utils.py @@ -727,7 +727,20 @@ def test_too_many_system_frames_single_exception(self): ] += self.create_frames(MAX_FRAME_COUNT + 1, True) with pytest.raises(TooManyOnlySystemFramesException): - get_stacktrace_string(data_system) + get_stacktrace_string(data_system, platform="java") + + def test_too_many_system_frames_single_exception_invalid_platform(self): + data_system = copy.deepcopy(self.BASE_APP_DATA) + data_system["system"] = data_system.pop("app") + data_system["system"]["component"]["values"][0]["values"][0][ + "values" + ] += self.create_frames(MAX_FRAME_COUNT + 1, True) + + stacktrace_string = get_stacktrace_string(data_system) + assert stacktrace_string is not None and stacktrace_string != "" + + stacktrace_string = get_stacktrace_string(data_system, platform="python") + assert stacktrace_string is not None and stacktrace_string != "" def test_too_many_system_frames_chained_exception(self): data_system = copy.deepcopy(self.CHAINED_APP_DATA) @@ -741,7 +754,24 @@ def test_too_many_system_frames_chained_exception(self): ] += self.create_frames(MAX_FRAME_COUNT // 2, True) with pytest.raises(TooManyOnlySystemFramesException): - get_stacktrace_string(data_system) + get_stacktrace_string(data_system, platform="java") + + def test_too_many_system_frames_chained_exception_invalid_platform(self): + data_system = copy.deepcopy(self.CHAINED_APP_DATA) + data_system["system"] = data_system.pop("app") + # Split MAX_FRAME_COUNT across the two exceptions + data_system["system"]["component"]["values"][0]["values"][0]["values"][0][ + "values" + ] += self.create_frames(MAX_FRAME_COUNT // 2, True) + data_system["system"]["component"]["values"][0]["values"][1]["values"][0][ + "values" + ] += self.create_frames(MAX_FRAME_COUNT // 2, True) + + stacktrace_string = get_stacktrace_string(data_system) + assert stacktrace_string is not None and stacktrace_string != "" + + stacktrace_string = get_stacktrace_string(data_system, platform="python") + assert stacktrace_string is not None and stacktrace_string != "" def test_too_many_in_app_contributing_frames(self): """ diff --git a/tests/sentry/tasks/test_backfill_seer_grouping_records.py b/tests/sentry/tasks/test_backfill_seer_grouping_records.py index 617657ee7cd68f..4c9a0d758d07bf 100644 --- a/tests/sentry/tasks/test_backfill_seer_grouping_records.py +++ b/tests/sentry/tasks/test_backfill_seer_grouping_records.py @@ -392,7 +392,7 @@ def test_lookup_group_data_stacktrace_bulk_invalid_stacktrace_exception(self, mo ] event = self.store_event( data={ - "platform": "python", + "platform": "java", "exception": exception, "title": "title", "timestamp": before_now(seconds=10).isoformat(), @@ -426,7 +426,7 @@ def test_lookup_group_data_stacktrace_bulk_invalid_stacktrace_exception(self, mo mock_metrics.incr.assert_called_with( "grouping.similarity.over_threshold_only_system_frames", sample_rate=sample_rate, - tags={"platform": "python", "referrer": "backfill"}, + tags={"platform": "java", "referrer": "backfill"}, ) def test_lookup_group_data_stacktrace_bulk_with_fallback_success(self):