From ca332c2ea171222d47d8e15b449a5f397c7e684f Mon Sep 17 00:00:00 2001 From: Will Kahn-Greene
Date: Wed, 30 Oct 2024 19:14:41 -0400 Subject: [PATCH 01/34] bug 1909875: improve
CrashIdsFailedToPublish information (#6775) When a user uses the Reprocessing API, they send some
number of crash ids to be reprocessed. If something happens when publishing the crash ids to
PubSub, the crash id is added to a "failed" list and the exception is logged. Then at the end if
there were any failures, a CrashIdsFailedToPublish exception is thrown with the list of failures.
That causes Django to return an HTTP 500 to the user and Sentry captures the
CrashIdsFailedToPublish exception, but the event isn't very helpful--we have no idea what happened.
This attempts to alleviate that in a couple of ways: 1. It uses sentry_sdk.capture_exception() to
capture each problem encountered when publishing to PubSub. We'll be able to see what's going on in
Sentry events. 2. It keeps track of the errors along with the crash ids and wraps that in the
CrashIdsFailedToPublish exception message so we're not ending up with just a list of crash ids--we
get the errors, too. --- socorro/external/pubsub/crashqueue.py | 12 +++----
.../tests/external/pubsub/test_crashqueue.py | 31 ++++++++++++++++++- 2 files changed, 36
insertions(+), 7 deletions(-) diff --git a/socorro/external/pubsub/crashqueue.py
b/socorro/external/pubsub/crashqueue.py index 4a3d3fef46..0bd954e0b5 100644 ---
a/socorro/external/pubsub/crashqueue.py +++ b/socorro/external/pubsub/crashqueue.py @@ -9,6 +9,7 @@
from google.cloud.pubsub_v1 import PublisherClient, SubscriberClient from
google.cloud.pubsub_v1.types import BatchSettings, PublisherOptions from more_itertools import
chunked +import sentry_sdk from socorro.external.crashqueue_base import CrashQueueBase @@ -260,15
+261,14 @@ def publish(self, queue, crash_ids): for i, future in enumerate(futures): try:
future.result() - except Exception: + except Exception as exc: +
sentry_sdk.capture_exception(exc) logger.exception( - "Crashid failed to publish: %s %s", +
"crashid failed to publish: %s %s", queue, batch[i], ) - failed.append(batch[i]) +
failed.append((repr(exc), batch[i])) if failed: - raise CrashIdsFailedToPublish( - f"Crashids
failed to publish: {','.join(failed)}" - ) + raise CrashIdsFailedToPublish(f"Crashids failed to
publish: {failed!r}") diff --git a/socorro/tests/external/pubsub/test_crashqueue.py
b/socorro/tests/external/pubsub/test_crashqueue.py index a25cb80411..4e8b2db9ae 100644 ---
a/socorro/tests/external/pubsub/test_crashqueue.py +++
b/socorro/tests/external/pubsub/test_crashqueue.py @@ -8,9 +8,11 @@ import pytest +from socorro
import settings +from socorro.external.pubsub.crashqueue import CrashIdsFailedToPublish from
socorro.libclass import build_instance_from_settings from socorro.lib.libooid import
create_new_ooid -from socorro import settings + # Amount of time to sleep between publish and pull
so messages are available PUBSUB_DELAY_PULL = 0.5 @@ -108,3 +110,30 @@ def test_publish_many(self,
pubsub_helper, queue): published_crash_ids = pubsub_helper.get_published_crashids(queue) assert
set(published_crash_ids) == {crash_id_1, crash_id_2, crash_id_3} + + def
test_publish_with_error(self, pubsub_helper, sentry_helper): + queue = "reprocessing" + crash_id =
create_new_ooid() + + crashqueue = build_instance_from_settings(settings.QUEUE_PUBSUB) + + # Run
teardown_queues in the helper so there's no queue. That will cause an + # error to get thrown by
PubSub.
+ pubsub_helper.teardown_queues() + + with sentry_helper.init() as sentry_client: + try: + crashqueue.publish(queue, [crash_id]) + except CrashIdsFailedToPublish as exc: + print(exc) + + # wait for published messages to become available before pulling + time.sleep(PUBSUB_DELAY_PULL) + + (envelope,) = sentry_client.envelope_payloads + errors = [ + f"{error['type']} {error['value']}" + for error in envelope["exception"]["values"] + ] + + assert "NotFound Topic not found" in errors From d6a4fbabdbfe69d7a6e3a6608da2dd0ff17f4d3b Mon Sep 17 00:00:00 2001 From: Will Kahn-Greene Date: Thu, 31 Oct 2024 10:52:18 -0400 Subject: [PATCH 02/34] bug-1901998: fix buginfo view to stop sending sentry errors (#6779) When Bugzilla is in a mood, the buginfo view queries the BugInfo API and that API throws a requests.exceptions.RetryError which the buginfo view didn't handle and thus we end up with a Sentry error. When this happens, the report view Bugzilla tab gets back an HTTP 500 from querying the buginfo view and kind of shrugs and doesn't show bug details. The user isn't really impacted--they can continue their merry day. Bugzilla seems to be down a lot, so we get these Sentry errors we're not going to do anything with. Thus I did a couple of things: 1. increase the backoff_factor for Bugzilla requests giving us a little more time for Bugzilla to find its happy place 2. change the buginfo view to handle the RetryError by returning an HTTP 500 with an error message and _not_ sending a Sentry event --- webapp/crashstats/crashstats/models.py | 9 ++++++--- .../crashstats/crashstats/tests/test_views.py | 17 +++++++++++++++++ webapp/crashstats/crashstats/views.py | 7 +++++-- 3 files changed, 28 insertions(+), 5 deletions(-) diff --git a/webapp/crashstats/crashstats/models.py b/webapp/crashstats/crashstats/models.py index 9813c83b9f..a2f0267970 100644 --- a/webapp/crashstats/crashstats/models.py +++ b/webapp/crashstats/crashstats/models.py @@ -57,7 +57,7 @@ def get_bugs_and_related_bugs(self, signatures): class BugAssociation(models.Model): - """Specifies assocations between bug ids in Bugzilla and signatures""" + """Specifies associations between bug ids in Bugzilla and signatures""" bug_id = models.IntegerField(null=False, help_text="Bugzilla bug id") signature = models.TextField( @@ -244,7 +244,7 @@ class ParameterTypeError(Exception): def _clean_path(path): - """Santize a URL path + """Sanitize a URL path :arg path: the path to sanitize @@ -608,7 +608,7 @@ class ProcessedCrash(SocorroMiddleware): HELP_TEXT = """ API for retrieving crash data generated by processing. - Requires protected data acess to view protected parts of the processed crash. + Requires protected data access to view protected parts of the processed crash. 
Params: @@ -1027,6 +1027,9 @@ def get(self, bugs, **kwargs): # 503 = Service Unavailable # 504 =
Gateway Time-out status_forcelist=(500, 502, 503, 504), + # This changes backoff to (0.2, 0.4, 0.8,
1.6, 3.2) which + # gives Bugzilla a better chance of responding helpfully + backoff_factor=0.2, )
response = session.get(url, headers=headers) if response.status_code != 200: diff --git
a/webapp/crashstats/crashstats/tests/test_views.py
b/webapp/crashstats/crashstats/tests/test_views.py index fee2a1927f..7cb802bc6d 100644 ---
a/webapp/crashstats/crashstats/tests/test_views.py +++
b/webapp/crashstats/crashstats/tests/test_views.py @@ -11,6 +11,7 @@ import requests_mock import
pyquery from markus.testing import MetricsMock +from requests.exceptions import RetryError from
django.conf import settings from django.contrib.auth import REDIRECT_FIELD_NAME @@ -358,6 +359,22
@@ def mocked_get(url, **options): cache_key = "buginfo:987" assert cache.get(cache_key) ==
struct["bugs"][0] + def test_buginfo_retry_error(self, client): + with
requests_mock.Mocker(real_http=False) as mock_requests: + mock_requests.get( + ( +
"http://bugzilla.example.com/rest/bug?" + + "id=1901998&include_fields=summary,status,id,resolution"
+ ), + exc=RetryError, + ) + + url = reverse("crashstats:buginfo") + response = client.get(url,
{"bug_ids": "1901998"}) + assert response.status_code == 500 + json_response =
json.loads(response.content) + assert json_response == {"error": "Max retries exceeded with
Bugzilla."} + class Test_quick_search: def test_quick_search(self, client): diff --git
a/webapp/crashstats/crashstats/views.py b/webapp/crashstats/crashstats/views.py index
3ce9c719be..4696020669 100644 --- a/webapp/crashstats/crashstats/views.py +++
b/webapp/crashstats/crashstats/views.py @@ -7,6 +7,7 @@ from urllib.parse import quote import glom
+from requests.exceptions import RetryError from django import http from django.conf import
settings @@ -332,8 +333,10 @@ def buginfo(request, signatures=None): bug_ids =
form.cleaned_data["bug_ids"] bzapi = models.BugzillaBugInfo() - result = bzapi.get(bug_ids) - return
result + try: + return bzapi.get(bug_ids) + except RetryError: + return {"error": "Max retries
exceeded with Bugzilla."}, 500 @pass_default_context From f60fb3622f184f254ae45a81a99df63c3a3b24e5
Mon Sep 17 00:00:00 2001 From: Will Kahn-Greene Date: Thu, 31 Oct 2024 11:06:18 -0400 Subject:
[PATCH 03/34] bug-1920739: update sentry_sdk to 2.17.0 (#6778) This updates sentry_sdk to 2.17.0.
In order to do that, we had to make two fundamental changes: 1. Switch `sentry_sdk.push_scope()`
calls to `sentry_sdk.new_scope()` which does roughly the same thing. 2. Switch from using
`scope.set_extra()` to `scope.set_context()`. `set_extra()` was nice in that it allowed us to add
additional things to the Sentry event payload that gave context to the error. We used this to
capture the processor rule, ruleset, crash_id, signature generation rule, etc. in an additive way.
`set_extra()` is deprecated and Sentry suggests switching to `set_context()`, which is not
additive. So now we have to set a complete context section everywhere we add context. That's a
little annoying, but it'll be fine. I updated the code and tests accordingly.
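For illustration, the shape of the change looks roughly like this (a simplified sketch; `rule_name`
and `exc` are placeholder names standing in for whatever rule and exception the call site has in
hand):

    import sentry_sdk

    # Before: set_extra() added individual key/value pairs to the event.
    with sentry_sdk.push_scope() as scope:
        scope.set_extra("rule", rule_name)
        sentry_sdk.capture_exception(exc)

    # After: set_context() attaches a named section to the event. Setting the
    # same section again replaces the whole section rather than merging into
    # it, so each call site sets its own complete section.
    with sentry_sdk.new_scope() as scope:
        scope.set_context("processor_pipeline", {"rule": rule_name})
        sentry_sdk.capture_exception(exc)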
--- requirements.in | 2 +- requirements.txt | 6 +++--- socorro/processor/pipeline.py | 4 ++-- socorro/processor/processor_app.py | 11 ++++++++--- socorro/processor/rules/mozilla.py | 4 ++-- socorro/stage_submitter/submitter.py | 4 ++-- socorro/tests/processor/rules/test_mozilla.py | 11 ++++++----- socorro/tests/processor/test_cache_manager.py | 3 --- socorro/tests/processor/test_pipeline.py | 6 +++--- socorro/tests/processor/test_processor_app.py | 5 ++++- 10 files changed, 31 insertions(+), 25 deletions(-) diff --git a/requirements.in b/requirements.in index 8eba4c451f..162a3f7e19 100644 --- a/requirements.in +++ b/requirements.in @@ -48,7 +48,7 @@ requests==2.32.3 requests-mock==1.12.1 ruff==0.7.1 semver==3.0.2 -sentry-sdk==2.8.0 +sentry-sdk==2.17.0 Sphinx==8.1.3 sphinx_rtd_theme==3.0.1 statsd==4.0.1 diff --git a/requirements.txt b/requirements.txt index 3aabf0656c..66bebcf863 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1236,9 +1236,9 @@ semver==3.0.2 \ --hash=sha256:6253adb39c70f6e51afed2fa7152bcd414c411286088fb4b9effb133885ab4cc \ --hash=sha256:b1ea4686fe70b981f85359eda33199d60c53964284e0cfb4977d243e37cf4bf4 # via -r requirements.in -sentry-sdk==2.8.0 \ - --hash=sha256:6051562d2cfa8087bb8b4b8b79dc44690f8a054762a29c07e22588b1f619bfb5 \ - --hash=sha256:aa4314f877d9cd9add5a0c9ba18e3f27f99f7de835ce36bd150e48a41c7c646f +sentry-sdk==2.17.0 \ + --hash=sha256:625955884b862cc58748920f9e21efdfb8e0d4f98cca4ab0d3918576d5b606ad \ + --hash=sha256:dd0a05352b78ffeacced73a94e86f38b32e2eae15fff5f30ca5abb568a72eacf # via # -r requirements.in # fillmore diff --git a/socorro/processor/pipeline.py b/socorro/processor/pipeline.py index c5379385f4..c840ba0191 100644 --- a/socorro/processor/pipeline.py +++ b/socorro/processor/pipeline.py @@ -87,8 +87,8 @@ def process_crash(self, ruleset_name, raw_crash, dumps, processed_crash, tmpdir) # Apply rules; if a rule fails, capture the error and continue onward for rule in ruleset: - with sentry_sdk.push_scope() as scope: - scope.set_extra("rule", rule.name) + with sentry_sdk.new_scope() as scope: + scope.set_context("processor_pipeline", {"rule": rule.name}) try: rule.act( diff --git a/socorro/processor/processor_app.py b/socorro/processor/processor_app.py index 8444e03ac4..ba775315a2 100755 --- a/socorro/processor/processor_app.py +++ b/socorro/processor/processor_app.py @@ -130,9 +130,14 @@ def transform(self, task, finished_func=(lambda: None)): with METRICS.timer( "processor.process_crash", tags=[f"ruleset:{ruleset_name}"] ): - with sentry_sdk.push_scope() as scope: - scope.set_extra("crash_id", crash_id) - scope.set_extra("ruleset", ruleset_name) + with sentry_sdk.new_scope() as scope: + scope.set_context( + "processor", + { + "crash_id": crash_id, + "ruleset": ruleset_name, + }, + ) # Create temporary directory context with tempfile.TemporaryDirectory(dir=self.temporary_path) as tmpdir: diff --git a/socorro/processor/rules/mozilla.py b/socorro/processor/rules/mozilla.py index a0217dc51b..938fc190ee 100644 --- a/socorro/processor/rules/mozilla.py +++ b/socorro/processor/rules/mozilla.py @@ -1041,8 +1041,8 @@ def __init__(self): def _error_handler(self, crash_data, exc_info, extra): """Captures errors from signature generation""" - with sentry_sdk.push_scope() as scope: - scope.set_extra("signature_rule", extra["rule"]) + with sentry_sdk.new_scope() as scope: + scope.set_context("signature_generator", {"signature_rule": extra["rule"]}) sentry_sdk.capture_exception(exc_info) def action(self, raw_crash, dumps, processed_crash, tmpdir, status): diff 
--git a/socorro/stage_submitter/submitter.py b/socorro/stage_submitter/submitter.py index 8f51168669..a5ba7a4229 100644 --- a/socorro/stage_submitter/submitter.py +++ b/socorro/stage_submitter/submitter.py @@ -333,11 +333,11 @@ def sample(self, destinations): def process(self, crash): with METRICS.timer("submitter.process"): - with sentry_sdk.push_scope() as scope: + with sentry_sdk.new_scope() as scope: crash_id = crash.crash_id self.logger.debug(f"processing {crash}") - scope.set_extra("crash_id", crash) + scope.set_context("submitter", {"crash_id": crash_id}) # sample and determine destinations destinations = [] diff --git a/socorro/tests/processor/rules/test_mozilla.py b/socorro/tests/processor/rules/test_mozilla.py index e589ac2eed..ac36529e99 100644 --- a/socorro/tests/processor/rules/test_mozilla.py +++ b/socorro/tests/processor/rules/test_mozilla.py @@ -2054,7 +2054,7 @@ def predicate(self, raw_crash, processed_crash): # doesn't configure Sentry the way the processor does so we shouldn't test # whether things are scrubbed correctly with sentry_helper.init() as sentry_client: - # Override the regular SigntureGenerator with one with a BadRule + # Override the regular SignatureGenerator with one with a BadRule # in the pipeline rule.generator = SignatureGenerator( ruleset=[BadRule], error_handler=rule._error_handler @@ -2073,10 +2073,11 @@ def predicate(self, raw_crash, processed_crash): assert status.notes == ["BadRule: Rule failed: Cough"] (event,) = sentry_client.envelope_payloads - # NOTE(willkg): Some of the extra bits come from the processor app and since - # we're testing SignatureGenerator in isolation, those don't get added to - # the sentry scope - assert event["extra"] == {"signature_rule": "BadRule", "sys.argv": mock.ANY} + + # Assert that the rule that threw an error is captured in the context. + assert event["contexts"]["signature_generator"] == { + "signature_rule": "BadRule" + } assert event["exception"]["values"][0]["type"] == "Exception" diff --git a/socorro/tests/processor/test_cache_manager.py b/socorro/tests/processor/test_cache_manager.py index 75c3d9ddfc..af4d958ca4 100644 --- a/socorro/tests/processor/test_cache_manager.py +++ b/socorro/tests/processor/test_cache_manager.py @@ -470,9 +470,6 @@ def mock_make_room(*args, **kwargs): (event,) = sentry_client.envelope_payloads - # Drop the "_meta" bit because we don't want to compare that. 
- del event["_meta"] - # Assert that the event is what we expected differences =
diff_structure(event, BROKEN_EVENT) assert differences == [] diff --git
a/socorro/tests/processor/test_pipeline.py b/socorro/tests/processor/test_pipeline.py index
42f714ca73..46bca9ca1e 100644 --- a/socorro/tests/processor/test_pipeline.py +++
b/socorro/tests/processor/test_pipeline.py @@ -36,6 +36,9 @@ def action(self, *args, **kwargs):
"span_id": ANY, "trace_id": ANY, }, + "processor_pipeline": { + "rule":
"socorro.tests.processor.test_pipeline.BadRule", + }, }, "environment": "production", "event_id":
ANY, @@ -86,9+89,6 @@ def action(self, *args, **kwargs): } ] }, - "extra": { - "rule":
"socorro.tests.processor.test_pipeline.BadRule", - }, "level": "error", "modules": ANY, "platform":
"python", diff --git a/socorro/tests/processor/test_processor_app.py
b/socorro/tests/processor/test_processor_app.py index 53e18296c1..704222f2e5 100644 ---
a/socorro/tests/processor/test_processor_app.py +++ b/socorro/tests/processor/test_processor_app.py
@@ -176,6 +176,10 @@ def test_transform_unexpected_exception(self, processor_settings): "span_id":
ANY, "trace_id": ANY, }, + "processor": { + "crash_id": ANY, + "ruleset": "default", + }, },
"environment": "production", "event_id": ANY, @@ -234,7 +238,6 @@ def
test_transform_unexpected_exception(self, processor_settings): } ] }, - "extra": {"crash_id":
"930b08ba-e425-49bf-adbd-7c9172220721", "ruleset": "default"}, "level": "error", "modules": ANY,
"platform": "python", From 165fd98c9e4a0cb9c512a3662086ac734ce54146 Mon Sep 17 00:00:00 2001 From:
Will Kahn-Greene Date: Thu, 31 Oct 2024 11:16:27 -0400 Subject: [PATCH 04/34] bug-1901997: handle
invalid BuildID for redacted crash reports (#6777) * Add support for --pause to process_crashes.sh
This makes it possible to adjust a crash report for testing a specific processing scenario. *
bug-1901997: handle invalid BuildID for redacted crash reports If the user viewing the crash report
does not have protected data access and the crash report has a crash annotation value that doesn't
match our schema, then the redaction process can fail. In this scenario, Crash Stats can't recover
and redact the document properly. In bug 1901997, we were seeing this scenario when BuildID=null.
Because crash annotation data comes from the crash reporter which might be running on a machine
that has bad hardware, this scenario can happen with other fields. These changes handle that
scenario and show a page indicating the crash report is malformed, with a link the user can use to
report the malformed crash report. --- bin/process_crashes.sh | 31 +++++++++++++++++--
.../report_index_malformed_raw_crash.html | 22 +++++++++++++
.../crashstats/crashstats/tests/test_views.py | 28 +++++++++++++++++
webapp/crashstats/crashstats/views.py | 14 +++++++++ 4 files changed, 92 insertions(+), 3
deletions(-) create mode 100644
webapp/crashstats/crashstats/jinja2/crashstats/report_index_malformed_raw_crash.html diff --git
a/bin/process_crashes.sh b/bin/process_crashes.sh index 92acac127f..d4e56ed428 100755 ---
a/bin/process_crashes.sh +++ b/bin/process_crashes.sh @@ -9,17 +9,23 @@ # # Usage:
./bin/process_crashes.sh # +# This should be run inside the app container. +# # You can use it with
fetch_crashids. For example: # # socorro-cmd fetch_crashids --num=1 | ./bin/process_crashes.sh # -#
Make sure to run the processor to do the actual processing. +# You can pause after fetching crash
ids and before uploading them to the cloud +# storage bucket.
This lets you adjust the crash reports. +# +# ./bin/process_crashes.sh --pause afd59adc-1cfd-4aa3-af2f-6b9ed0241029 # -# Note: This should be called from inside a container. +# Make sure to run the processor to do the actual processing. set -euo pipefail DATADIR=./crashdata_tryit_tmp +PAUSE=0 function cleanup { # Cleans up files generated by the script @@ -29,10 +35,24 @@ function cleanup { # Set up cleanup function to run on script exit trap cleanup EXIT +# If there's at least one argument, check for options +if [[ $# -ne 0 ]]; then + if [[ $1 == "--pause" ]]; then + PAUSE=1 + shift + fi +fi + +# Check for crash ids if [[ $# -eq 0 ]]; then if [ -t 0 ]; then # If stdin is a terminal, then there's no input - echo "Usage: process_crashes.sh CRASHID [CRASHID ...]" + echo "Usage: process_crashes.sh [--pause] CRASHID [CRASHID ...]" + exit 1 + fi + + if [[ "${PAUSE}" == "1" ]]; then + echo "Can't use PAUSE and process crash ids from stdin. Exiting." exit 1 fi @@ -45,6 +65,11 @@ mkdir "${DATADIR}" || echo "${DATADIR} already exists." # Pull down the data for all crashes ./socorro-cmd fetch_crash_data "${DATADIR}" $@ +# Pause (if requested) to let user make changes to crash data +if [[ "${PAUSE}" == "1" ]]; then + read -r -n 1 -p "Pausing... Any key to continue." pauseinput +fi + # Make the bucket and sync contents ./socorro-cmd gcs create "${CRASHSTORAGE_GCS_BUCKET}" ./socorro-cmd gcs upload "${DATADIR}" "${CRASHSTORAGE_GCS_BUCKET}" diff --git a/webapp/crashstats/crashstats/jinja2/crashstats/report_index_malformed_raw_crash.html b/webapp/crashstats/crashstats/jinja2/crashstats/report_index_malformed_raw_crash.html new file mode 100644 index 0000000000..58c47e8016 --- /dev/null +++ b/webapp/crashstats/crashstats/jinja2/crashstats/report_index_malformed_raw_crash.html @@ -0,0 +1,22 @@ +{% extends "crashstats_base.html" %} + +{% block content %} +
+<div id="mainbody">
+  <div class="page-heading">
+    <h2>Crash Report Malformed</h2>
+  </div>
+
+  <div class="panel">
+    <div class="body notitle">
+      <p>
+        The crash report you requested is malformed in some way such that it
+        cannot be shown.
+      </p>
+      <p>
+        If you need to see this crash report, please
+        <a href="https://bugzilla.mozilla.org/enter_bug.cgi?product=Socorro">submit a bug</a>
+        describing what happened, and please include the URL for this page.
+      </p>
+    </div>
+  </div>
+</div>
+{% endblock %} diff --git a/webapp/crashstats/crashstats/tests/test_views.py b/webapp/crashstats/crashstats/tests/test_views.py index 7cb802bc6d..5d5c68b054 100644 --- a/webapp/crashstats/crashstats/tests/test_views.py +++ b/webapp/crashstats/crashstats/tests/test_views.py @@ -1230,6 +1230,34 @@ def test_raw_crash_not_found(self, client, db, storage_helper): assert response.status_code == 404 assert "Crash Report Not Found" in smart_str(response.content) + def test_raw_crash_malformed(self, client, db, storage_helper): + crash_id, raw_crash, processed_crash = build_crash_data() + + # If the BuildID is None, that will cause the reducer to raise an + # InvalidDocumentError when someone who doesn't have protected data access + # tries to view it. Bug #1901997. + raw_crash["BuildID"] = None + + bucket = storage_helper.get_crashstorage_bucket() + raw_key = build_keys("raw_crash", crash_id)[0] + storage_helper.upload( + bucket_name=bucket, key=raw_key, data=dict_to_str(raw_crash).encode("utf-8") + ) + + validate_instance(processed_crash, PROCESSED_CRASH_SCHEMA) + processed_key = build_keys("processed_crash", crash_id)[0] + storage_helper.upload( + bucket_name=bucket, + key=processed_key, + data=dict_to_str(processed_crash).encode("utf-8"), + ) + + url = reverse("crashstats:report_index", args=[crash_id]) + response = client.get(url) + + assert response.status_code == 500 + assert "Crash Report Malformed" in smart_str(response.content) + def test_processed_crash_not_found(self, client, db, storage_helper, queue_helper): crash_id, raw_crash, processed_crash = build_crash_data() diff --git a/webapp/crashstats/crashstats/views.py b/webapp/crashstats/crashstats/views.py index 4696020669..4bf5723a12 100644 --- a/webapp/crashstats/crashstats/views.py +++ b/webapp/crashstats/crashstats/views.py @@ -25,6 +25,7 @@ from crashstats.crashstats.decorators import track_view, pass_default_context from crashstats.supersearch.models import SuperSearchFields from socorro.external.crashstorage_base import CrashIDNotFound +from socorro.lib.libsocorrodataschema import InvalidDocumentError from socorro.signature.generator import SignatureGenerator from socorro.signature.utils import convert_to_crash_data @@ -110,6 +111,19 @@ def report_index(request, crash_id, default_context=None): raw_api.api_user = request.user try: context["raw"] = raw_api.get(crash_id=crash_id) + except InvalidDocumentError: + # If the user does not have protected data access, then the document is + # redacted. However, if it's invalid because the world is a terrible place and + # things working right is not the norm, then we want to display a "This crash + # report is broken in some way. Please report a bug." message instead + # of throwing an HTTP 500 error. We definitely can't gloss over this because + # the brokenness prevents the document from being redacted correctly. + return render( + request, + "crashstats/report_index_malformed_raw_crash.html", + context, + status=500, + ) except CrashIDNotFound: # If the raw crash can't be found, we can't do much. return render( From fd89e5f93b2ef5e1b31ba064bca4e77cc31e829d Mon Sep 17 00:00:00 2001 From: Will Kahn-Greene Date: Thu, 31 Oct 2024 11:17:52 -0400 Subject: [PATCH 05/34] bug-1911367: Remove processor heartbeat and process metrics (#6776) This removes processor heartbeat (and the scaffolding for that heartbeat) which generated process metrics for the processor container. We don't need those anymore--the problem those were added to help us understand is long gone now. 
--- socorro/lib/task_manager.py | 25 +------- socorro/lib/threaded_task_manager.py | 12 +--- socorro/processor/processor_app.py | 61 ------------------- socorro/statsd_metrics.yaml | 24 -------- socorro/tests/processor/test_processor_app.py | 15 ----- 5 files changed, 4 insertions(+), 133 deletions(-) diff --git a/socorro/lib/task_manager.py b/socorro/lib/task_manager.py index 756f32006d..58c8b40a02 100644 --- a/socorro/lib/task_manager.py +++ b/socorro/lib/task_manager.py @@ -9,8 +9,6 @@ LOGGER = logging.getLogger(__name__) -HEARTBEAT_INTERVAL = 60 - def default_task_func(a_param): """Default task function. @@ -21,16 +19,6 @@ def default_task_func(a_param): """ -def default_heartbeat(): - """Runs once a second from the main thread. - - Note: If this raises an exception, it could kill the process or put it in a - weird state. - - """ - LOGGER.info("THUMP") - - def default_iterator(): """Default iterator for tasks. @@ -76,7 +64,6 @@ def __init__( idle_delay=7, quit_on_empty_queue=False, job_source_iterator=default_iterator, - heartbeat_func=default_heartbeat, task_func=default_task_func, ): """ @@ -88,14 +75,12 @@ def __init__( instantiated with a config object can be iterated. The iterator must yield a tuple consisting of a function's tuple of args and, optionally, a mapping of kwargs. Ex: (('a', 17), {'x': 23}) - :arg heartbeat_func: a function to run every second :arg task_func: a function that will accept the args and kwargs yielded by the job_source_iterator """ self.idle_delay = idle_delay self.quit_on_empty_queue = quit_on_empty_queue self.job_source_iterator = job_source_iterator - self.heartbeat_func = heartbeat_func self.task_func = task_func self._pid = os.getpid() @@ -109,7 +94,7 @@ def _get_iterator(self): job_source_iterator can be one of a few things: * a class that can be instantiated and iterated over - * a function that returns an interator + * a function that returns an iterator * an actual iterator/generator * an iterable collection @@ -124,7 +109,7 @@ def _get_iterator(self): def _responsive_sleep(self, seconds, wait_log_interval=0, wait_reason=""): """Responsive sleep that checks for quit flag - When there is litte work to do, the queuing thread sleeps a lot. It can't sleep + When there is little work to do, the queuing thread sleeps a lot. It can't sleep for too long without checking for the quit flag and/or logging about why it is sleeping. @@ -132,7 +117,7 @@ def _responsive_sleep(self, seconds, wait_log_interval=0, wait_reason=""): :arg wait_log_interval: while sleeping, it is helpful if the thread periodically announces itself so that we know that it is still alive. This number is the time in seconds between log entries. - :arg wait_reason: the is for the explaination of why the thread is + :arg wait_reason: the is for the explanation of why the thread is sleeping. This is likely to be a message like: 'there is no work to do'. 
This was also partially motivated by old versions' of Python inability to @@ -146,14 +131,10 @@ def _responsive_sleep(self, seconds, wait_log_interval=0, wait_reason=""): def blocking_start(self): """This function starts the task manager running to do tasks.""" - next_heartbeat = time.time() + HEARTBEAT_INTERVAL self.logger.debug("threadless start") try: # May never exhaust for job_params in self._get_iterator(): - if time.time() > next_heartbeat: - self.heartbeat_func() - next_heartbeat = time.time() + HEARTBEAT_INTERVAL self.logger.debug("received %r", job_params) if job_params is None: if self.quit_on_empty_queue: diff --git a/socorro/lib/threaded_task_manager.py b/socorro/lib/threaded_task_manager.py index 2b4ea90202..092a877982 100644 --- a/socorro/lib/threaded_task_manager.py +++ b/socorro/lib/threaded_task_manager.py @@ -17,7 +17,6 @@ import time from socorro.lib.task_manager import ( - default_heartbeat, default_iterator, default_task_func, TaskManager, @@ -26,8 +25,6 @@ STOP_TOKEN = (None, None) -HEARTBEAT_INTERVAL = 60 - class ThreadedTaskManager(TaskManager): """Threaded task manager.""" @@ -39,7 +36,6 @@ def __init__( number_of_threads=4, maximum_queue_size=8, job_source_iterator=default_iterator, - heartbeat_func=default_heartbeat, task_func=default_task_func, ): """ @@ -54,7 +50,6 @@ def __init__( instantiated with a config object can be iterated. The iterator must yield a tuple consisting of a function's tuple of args and, optionally, a mapping of kwargs. Ex: (('a', 17), {'x': 23}) - :arg heartbeat_func: a function to run every second :arg task_func: a function that will accept the args and kwargs yielded by the job_source_iterator """ @@ -71,7 +66,6 @@ def __init__( idle_delay=idle_delay, quit_on_empty_queue=quit_on_empty_queue, job_source_iterator=job_source_iterator, - heartbeat_func=heartbeat_func, task_func=task_func, ) self.thread_list = [] # the thread object storage @@ -107,12 +101,8 @@ def wait_for_completion(self): if self.queueing_thread is None: return - next_heartbeat = time.time() + HEARTBEAT_INTERVAL self.logger.debug("waiting to join queueing_thread") while True: - if time.time() > next_heartbeat: - self.heartbeat_func() - next_heartbeat = time.time() + HEARTBEAT_INTERVAL try: self.queueing_thread.join(1.0) if not self.queueing_thread.is_alive(): @@ -149,7 +139,7 @@ def wait_for_empty_queue(self, wait_log_interval=0, wait_reason=""): :arg wait_log_interval: While sleeping, it is helpful if the thread periodically announces itself so that we know that it is still alive. This number is the time in seconds between log entries. - :arg wait_reason: The is for the explaination of why the thread is sleeping. + :arg wait_reason: The is for the explanation of why the thread is sleeping. This is likely to be a message like: 'there is no work to do'. 
""" diff --git a/socorro/processor/processor_app.py b/socorro/processor/processor_app.py index ba775315a2..9054e25063 100755 --- a/socorro/processor/processor_app.py +++ b/socorro/processor/processor_app.py @@ -27,7 +27,6 @@ from fillmore.libsentry import set_up_sentry from fillmore.scrubber import Scrubber, SCRUB_RULES_DEFAULT -import psutil import sentry_sdk from sentry_sdk.integrations.atexit import AtexitIntegration from sentry_sdk.integrations.dedupe import DedupeIntegration @@ -275,7 +274,6 @@ def _set_up_task_manager(self): manager_settings.update( { "job_source_iterator": self.source_iterator, - "heartbeat_func": self.heartbeat, "task_func": self.transform, } ) @@ -283,65 +281,6 @@ def _set_up_task_manager(self): class_path=manager_class, kwargs=manager_settings ) - def heartbeat(self): - """Runs once a second from the main thread. - - Note: If this raises an exception, it could kill the process or put it in a - weird state. - - """ - try: - processes_by_type = {} - processes_by_status = {} - open_files = 0 - for proc in psutil.process_iter(["cmdline", "status", "open_files"]): - try: - # NOTE(willkg): This is all intertwined with exactly how we run the - # processor in a Docker container. If we ever make changes to that, this - # will change, too. However, even if we never update this, seeing - # "zombie" and "orphaned" as process statuses or seeing lots of - # processes as a type will be really fishy and suggestive that evil is a - # foot. - cmdline = proc.cmdline() or ["unknown"] - - if cmdline[0] in ["/bin/sh", "/bin/bash"]: - proc_type = "shell" - elif cmdline[0] in ["python", "/usr/local/bin/python"]: - proc_type = "python" - elif "stackwalk" in cmdline[0]: - proc_type = "stackwalker" - else: - proc_type = "other" - - open_files_count = len(proc.open_files()) - proc_status = proc.status() - - except psutil.Error: - # For any psutil error, we want to track that we saw a process, but - # the details don't matter - proc_type = "unknown" - proc_status = "unknown" - open_files_count = 0 - - processes_by_type[proc_type] = processes_by_type.get(proc_type, 0) + 1 - processes_by_status[proc_status] = ( - processes_by_status.get(proc_status, 0) + 1 - ) - open_files += open_files_count - - METRICS.gauge("processor.open_files", open_files) - for proc_type, val in processes_by_type.items(): - METRICS.gauge( - "processor.processes_by_type", val, tags=[f"proctype:{proc_type}"] - ) - for status, val in processes_by_status.items(): - METRICS.gauge( - "processor.processes_by_status", val, tags=[f"procstatus:{status}"] - ) - - except Exception as exc: - sentry_sdk.capture_exception(exc) - def close(self): """Clean up the processor on shutdown.""" with suppress(AttributeError): diff --git a/socorro/statsd_metrics.yaml b/socorro/statsd_metrics.yaml index f7418c162c..2da71ea327 100644 --- a/socorro/statsd_metrics.yaml +++ b/socorro/statsd_metrics.yaml @@ -144,30 +144,6 @@ socorro.processor.minidumpstackwalk.run: * ``outcome``: either ``success`` or ``fail`` * ``exitcode``: the exit code of the minidump stackwalk process -socorro.processor.open_files: - type: "gauge" - description: | - Gauge of currently open files for all processes running in the container. - -socorro.processor.processes_by_type: - type: "gauge" - description: | - Gauge of processes by type. - - Tags: - - * ``proctype``: one of ``shell``, ``python``, ``stackwalker``, or ``other`` - -socorro.processor.processes_by_status: - type: "gauge" - description: | - Gauge of processes by process status. 
- - Tags: - - * ``procstatus``: one of ``running``, ``sleeping``, or other process - statuses. -
socorro.processor.process_crash: type: "timing" description: | diff --git
a/socorro/tests/processor/test_processor_app.py b/socorro/tests/processor/test_processor_app.py
index 704222f2e5..dd4fa0562d 100644 --- a/socorro/tests/processor/test_processor_app.py +++
b/socorro/tests/processor/test_processor_app.py @@ -66,21 +66,6 @@ def test_source_iterator(self,
processor_settings): assert next(queue) is None assert next(queue) == ((3,), {}) - def
test_heartbeat(self, sentry_helper): - """Basic test to make sure it runs, captures metrics, and
doesn't error out""" - with sentry_helper.reuse() as sentry_client: - with MetricsMock() as
metricsmock: - app = ProcessorApp() - app.heartbeat() - - # Assert it emitted some metrics -
metricsmock.assert_gauge("socorro.processor.open_files") -
metricsmock.assert_gauge("socorro.processor.processes_by_type") -
metricsmock.assert_gauge("socorro.processor.processes_by_status") - - # Assert it didn't throw an
exception - assert len(sentry_client.envelopes) == 0 - def test_transform_success(self,
processor_settings): app = ProcessorApp() app._set_up_source_and_destination() From
1af2fcdbd05cbf5bb36b1672f1521a3558119dfd Mon Sep 17 00:00:00 2001 From: Will Kahn-Greene Date: Fri,
1 Nov 2024 15:05:37 -0400 Subject: [PATCH 06/34] bug-1851708: fix ESRVersionRewriteRule and
FenixVersionRewriteRule (#6781) This fixes these two rules to stop editing the raw crash--that's so
bad! Now they run after we copy everything from the raw crash to the processed crash (validated and
normalized), use data from the processed crash to figure out what to do, and edit data in the
processed crash if necessary. --- socorro/mozilla_rulesets.py | 5 +-
socorro/processor/rules/mozilla.py | 20 +++--- socorro/tests/processor/rules/test_mozilla.py | 69
+++++++++---------- 3 files changed, 47 insertions(+), 47 deletions(-) diff --git
a/socorro/mozilla_rulesets.py b/socorro/mozilla_rulesets.py index 2e990c910c..f8db9e73e2 100644 ---
a/socorro/mozilla_rulesets.py +++ b/socorro/mozilla_rulesets.py @@ -68,11 +68,10 @@
CollectorMetadataRule(), # fix ModuleSignatureInfo if it needs fixing
ConvertModuleSignatureInfoRule(), - # rules to change the internals of the raw crash -
FenixVersionRewriteRule(), - ESRVersionRewrite(), # rules to transform a raw crash into a processed
crash CopyFromRawCrashRule(schema=get_schema("processed_crash.schema.yaml")), +
FenixVersionRewriteRule(), + ESRVersionRewrite(), SubmittedFromRule(), IdentifierRule(),
MinidumpSha256HashRule(), diff --git a/socorro/processor/rules/mozilla.py
b/socorro/processor/rules/mozilla.py index 938fc190ee..a931b14377 100644 ---
a/socorro/processor/rules/mozilla.py +++ b/socorro/processor/rules/mozilla.py @@ -571,23 +571,25 @@
class FenixVersionRewriteRule(Rule): """ def predicate(self, raw_crash, dumps, processed_crash,
tmpdir, status): - is_nightly = (raw_crash.get("Version") or "").startswith("Nightly ") - return
raw_crash.get("ProductName") == "Fenix" and is_nightly + is_nightly =
(processed_crash.get("version") or "").startswith("Nightly ") + return
processed_crash.get("product_name") == "Fenix" and is_nightly def action(self, raw_crash, dumps,
processed_crash, tmpdir, status): - status.add_note("Changed version from %r to 0.0a1" %
raw_crash.get("Version")) - raw_crash["Version"] = "0.0a1" + if "version" in processed_crash: +
version = processed_crash["version"] + status.add_note(f"Changed version from {version!r} to
0.0a1") +
processed_crash["version"] = "0.0a1" class ESRVersionRewrite(Rule): def predicate(self, raw_crash, dumps, processed_crash, tmpdir, status): - return raw_crash.get("ReleaseChannel", "") == "esr" + return processed_crash.get("release_channel", "") == "esr" def action(self, raw_crash, dumps, processed_crash, tmpdir, status): - try: - raw_crash["Version"] += "esr" - except KeyError: - status.add_note('"Version" missing from esr release raw_crash') + if "version" in processed_crash: + processed_crash["version"] = processed_crash["version"] + "esr" + else: + status.add_note("'version' missing from esr release processed_crash") class TopMostFilesRule(Rule): diff --git a/socorro/tests/processor/rules/test_mozilla.py b/socorro/tests/processor/rules/test_mozilla.py index ac36529e99..85701b7063 100644 --- a/socorro/tests/processor/rules/test_mozilla.py +++ b/socorro/tests/processor/rules/test_mozilla.py @@ -1273,12 +1273,12 @@ class TestFenixVersionRewriteRule: ], ) def test_predicate(self, tmp_path, product, version, expected): - raw_crash = { - "ProductName": product, - "Version": version, - } + raw_crash = {} dumps = {} - processed_crash = {} + processed_crash = { + "product_name": product, + "version": version, + } status = Status() rule = FenixVersionRewriteRule() @@ -1286,67 +1286,66 @@ def test_predicate(self, tmp_path, product, version, expected): assert ret == expected def test_act(self, tmp_path): - raw_crash = { - "ProductName": "Fenix", - "Version": "Nightly 200315 05:05", - } + raw_crash = {} dumps = {} - processed_crash = {} + processed_crash = { + "product_name": "Fenix", + "version": "Nightly 200315 05:05", + } status = Status() rule = FenixVersionRewriteRule() rule.act(raw_crash, dumps, processed_crash, str(tmp_path), status) - assert raw_crash["Version"] == "0.0a1" + assert processed_crash["version"] == "0.0a1" assert status.notes == ["Changed version from 'Nightly 200315 05:05' to 0.0a1"] class TestESRVersionRewrite: def test_everything_we_hoped_for(self, tmp_path): - raw_crash = copy.deepcopy(canonical_standard_raw_crash) - raw_crash["ReleaseChannel"] = "esr" + raw_crash = {} dumps = {} - processed_crash = {} + processed_crash = { + "release_channel": "esr", + "version": "120.0", + } status = Status() rule = ESRVersionRewrite() rule.act(raw_crash, dumps, processed_crash, str(tmp_path), status) - assert raw_crash["Version"] == "12.0esr" - - # processed_crash should be unchanged - assert processed_crash == {} + assert raw_crash == {} + assert processed_crash["version"] == "120.0esr" def test_this_is_not_the_crash_you_are_looking_for(self, tmp_path): - raw_crash = copy.deepcopy(canonical_standard_raw_crash) - raw_crash["ReleaseChannel"] = "not_esr" + raw_crash = {} dumps = {} - processed_crash = {} + processed_crash = { + "release_channel": "release", + "version": "120.0", + } status = Status() rule = ESRVersionRewrite() rule.act(raw_crash, dumps, processed_crash, str(tmp_path), status) - assert raw_crash["Version"] == "12.0" - - # processed_crash should be unchanged - assert processed_crash == {} + assert raw_crash == {} + assert processed_crash["version"] == "120.0" - def test_this_is_really_broken(self, tmp_path): - raw_crash = copy.deepcopy(canonical_standard_raw_crash) - raw_crash["ReleaseChannel"] = "esr" - del raw_crash["Version"] + def test_no_version(self, tmp_path): + raw_crash = {} dumps = {} - processed_crash = {} + processed_crash = { + "release_channel": "esr", + # no "version" + } status = Status() rule = ESRVersionRewrite() rule.act(raw_crash, dumps, processed_crash, 
str(tmp_path), status) - assert "Version" not in raw_crash - assert status.notes == ['"Version" missing from esr release raw_crash'] - - # processed_crash should be unchanged - assert processed_crash == {} + assert raw_crash == {} + assert "version" not in processed_crash + assert status.notes == ["'version' missing from esr release processed_crash"] class TestTopMostFilesRule: From cce2032119790410e4e36aafa7fed33da9910da3 Mon Sep 17 00:00:00 2001 From: Will Kahn-Greene Date: Fri, 1 Nov 2024 15:06:32 -0400 Subject: [PATCH 07/34] bug-1863963: fix site title to Mozilla Crash Stats (#6780) "crash reports" is the name of the collector, so it's always been a dissonance that the header of the Crash Stats site said Crash Reports. While doing this, I also fixed the hierarchy of error templates and removed an unneeded header_title.html template. --- webapp/crashstats/crashstats/jinja2/400.html | 2 +- webapp/crashstats/crashstats/jinja2/404.html | 2 +- webapp/crashstats/crashstats/jinja2/500.html | 2 +- webapp/crashstats/crashstats/jinja2/crashstats_base.html | 2 +- .../crashstats/jinja2/{error.html => error_base.html} | 2 +- webapp/crashstats/crashstats/jinja2/footer_nav.html | 2 +- webapp/crashstats/crashstats/jinja2/header_title.html | 3 --- .../signature/jinja2/signature/signature_report.html | 2 +- webapp/crashstats/supersearch/jinja2/supersearch/search.html | 2 +- .../supersearch/jinja2/supersearch/search_custom.html | 2 +- 10 files changed, 9 insertions(+), 12 deletions(-) rename webapp/crashstats/crashstats/jinja2/{error.html => error_base.html} (88%) delete mode 100644 webapp/crashstats/crashstats/jinja2/header_title.html diff --git a/webapp/crashstats/crashstats/jinja2/400.html b/webapp/crashstats/crashstats/jinja2/400.html index 1d005ebdd7..a6046b6f0c 100644 --- a/webapp/crashstats/crashstats/jinja2/400.html +++ b/webapp/crashstats/crashstats/jinja2/400.html @@ -1,4 +1,4 @@ -{% extends "error.html" %} +{% extends "error_base.html" %} {% block page_title %}Bad Request{% endblock %} diff --git a/webapp/crashstats/crashstats/jinja2/404.html b/webapp/crashstats/crashstats/jinja2/404.html index 69805db8a4..2cabd49a9b 100644 --- a/webapp/crashstats/crashstats/jinja2/404.html +++ b/webapp/crashstats/crashstats/jinja2/404.html @@ -1,4 +1,4 @@ -{% extends "error.html" %} +{% extends "error_base.html" %} {% block page_title %}Page Not Found{% endblock %} diff --git a/webapp/crashstats/crashstats/jinja2/500.html b/webapp/crashstats/crashstats/jinja2/500.html index 704628dcca..a1e39d6712 100644 --- a/webapp/crashstats/crashstats/jinja2/500.html +++ b/webapp/crashstats/crashstats/jinja2/500.html @@ -1,4 +1,4 @@ -{% extends "error.html" %} +{% extends "error_base.html" %} {% block page_title %}Internal Server Error{% endblock %} diff --git a/webapp/crashstats/crashstats/jinja2/crashstats_base.html b/webapp/crashstats/crashstats/jinja2/crashstats_base.html index 51dca6f84d..77bdc41ab1 100644 --- a/webapp/crashstats/crashstats/jinja2/crashstats_base.html +++ b/webapp/crashstats/crashstats/jinja2/crashstats_base.html @@ -24,7 +24,7 @@