diff --git a/crawler.py b/crawler.py
index b1815eec6..5ab497122 100644
--- a/crawler.py
+++ b/crawler.py
@@ -10,8 +10,6 @@
 from types import FrameType
 from typing import Any, Callable, List, Literal, Optional
 
-import sentry_sdk
-
 from openwpm import mp_logger
 from openwpm.command_sequence import CommandSequence
 from openwpm.config import BrowserParams, ManagerParams
@@ -52,8 +50,6 @@
 SAVE_CONTENT = os.getenv("SAVE_CONTENT", "")
 PREFS = os.getenv("PREFS", None)
 
-
-SENTRY_DSN = os.getenv("SENTRY_DSN", None)
 LOGGER_SETTINGS = mp_logger.parse_config_from_env()
 
 if CALLSTACK_INSTRUMENT is True:
@@ -114,38 +110,6 @@
     logger_kwargs=LOGGER_SETTINGS,
 )
 
-# At this point, Sentry should be initiated
-if SENTRY_DSN:
-    # Add crawler.py-specific context
-    with sentry_sdk.configure_scope() as scope:
-        # tags generate breakdown charts and search filters
-        scope.set_tag("CRAWL_DIRECTORY", CRAWL_DIRECTORY)
-        scope.set_tag("GCS_BUCKET", GCS_BUCKET)
-        scope.set_tag("DISPLAY_MODE", DISPLAY_MODE)
-        scope.set_tag("HTTP_INSTRUMENT", HTTP_INSTRUMENT)
-        scope.set_tag("COOKIE_INSTRUMENT", COOKIE_INSTRUMENT)
-        scope.set_tag("NAVIGATION_INSTRUMENT", NAVIGATION_INSTRUMENT)
-        scope.set_tag("JS_INSTRUMENT", JS_INSTRUMENT)
-        scope.set_tag("JS_INSTRUMENT_SETTINGS", JS_INSTRUMENT_SETTINGS)
-        scope.set_tag("CALLSTACK_INSTRUMENT", CALLSTACK_INSTRUMENT)
-        scope.set_tag("SAVE_CONTENT", SAVE_CONTENT)
-        scope.set_tag("DWELL_TIME", DWELL_TIME)
-        scope.set_tag("TIMEOUT", TIMEOUT)
-        scope.set_tag("MAX_JOB_RETRIES", MAX_JOB_RETRIES)
-        scope.set_tag("CRAWL_REFERENCE", "%s/%s" % (GCS_BUCKET, CRAWL_DIRECTORY))
-        # context adds addition information that may be of interest
-        if PREFS:
-            scope.set_context("PREFS", json.loads(PREFS))
-        scope.set_context(
-            "crawl_config",
-            {
-                "REDIS_QUEUE_NAME": REDIS_QUEUE_NAME,
-            },
-        )
-    # Send a sentry error message (temporarily - to easily be able
-    # to compare error frequencies to crawl worker instance count)
-    sentry_sdk.capture_message("Crawl worker started")
-
 # Connect to job queue
 job_queue = rediswq.RedisWQ(
     name=REDIS_QUEUE_NAME, host=REDIS_HOST, max_retries=MAX_JOB_RETRIES
@@ -234,6 +198,3 @@ def callback(success: bool) -> None:
 else:
     manager.logger.info("Job queue finished, exiting.")
 manager.close()
-
-if SENTRY_DSN:
-    sentry_sdk.capture_message("Crawl worker finished")
diff --git a/environment.yaml b/environment.yaml
index 556c5f50c..659aab555 100644
--- a/environment.yaml
+++ b/environment.yaml
@@ -15,7 +15,7 @@ dependencies:
 - leveldb=1.23
 - multiprocess=0.70.16
 - mypy=1.10.1
-- nodejs=22.5.0
+- nodejs=22.5.1
 - pandas=2.2.2
 - pillow=10.4.0
 - pip=24.0
@@ -25,14 +25,13 @@
 - pyarrow=16.1.0
 - pytest-asyncio=0.23.8
 - pytest-cov=5.0.0
-- pytest=8.2.2
+- pytest=8.3.1
 - python=3.11.9
 - pyvirtualdisplay=3.0
 - recommonmark=0.7.1
 - redis-py=5.0.7
 - s3fs=2024.6.1
 - selenium=4.22.0
-- sentry-sdk=2.10.0
 - sphinx-markdown-tables=0.0.17
 - sphinx=7.4.7
 - tabulate=0.9.0
diff --git a/openwpm/browser_manager.py b/openwpm/browser_manager.py
index cd1754106..39b605072 100644
--- a/openwpm/browser_manager.py
+++ b/openwpm/browser_manager.py
@@ -12,11 +12,12 @@
 import traceback
 from pathlib import Path
 from queue import Empty as EmptyQueue
-from typing import TYPE_CHECKING, Any, Dict, Optional, Tuple, Type, Union
+from typing import TYPE_CHECKING, Any, Dict, Optional, Tuple, Type, Union, cast
 
 import psutil
 from multiprocess import Queue
 from opentelemetry import trace
+from opentelemetry.util.types import AttributeValue
 from selenium.common.exceptions import WebDriverException
 from tblib import Traceback, pickling_support
 
@@ -469,6 +470,7 @@ def execute_command_sequence(
             for k in list(table_entry.keys()):
                 if table_entry[k] is None:
                     del table_entry[k]
+            table_entry = cast(dict[str, AttributeValue], table_entry)
             trace.get_current_span().set_attributes(table_entry)
 
             if command_status == "critical":
diff --git a/openwpm/mp_logger.py b/openwpm/mp_logger.py
index a03cf56b5..d91f90f59 100644
--- a/openwpm/mp_logger.py
+++ b/openwpm/mp_logger.py
@@ -13,9 +13,7 @@
 from typing import Optional
 
 import dill
-import sentry_sdk
 from multiprocess import JoinableQueue
-from sentry_sdk.integrations.logging import BreadcrumbHandler, EventHandler
 from tblib import pickling_support
 
 from .commands.utils.webdriver_utils import parse_neterror
@@ -29,8 +27,6 @@
 ENV_CONFIG_VARS = [
     "LOG_LEVEL_CONSOLE",
     "LOG_LEVEL_FILE",
-    "LOG_LEVEL_SENTRY_BREADCRUMB",
-    "LOG_LEVEL_SENTRY_EVENT",
 ]
 
 
@@ -104,30 +100,21 @@ def __init__(
         crawl_reference: Optional[str] = None,
         log_level_console=logging.INFO,
         log_level_file=logging.DEBUG,
-        log_level_sentry_breadcrumb=logging.DEBUG,
-        log_level_sentry_event=logging.ERROR,
     ) -> None:
         self._crawl_reference = crawl_reference
         self._log_level_console = log_level_console
         self._log_level_file = log_level_file
-        self._log_level_sentry_breadcrumb = log_level_sentry_breadcrumb
-        self._log_level_sentry_event = log_level_sentry_event
 
         # Configure log handlers
         self._status_queue = JoinableQueue()
         self._log_file = os.path.expanduser(log_file)
         self._initialize_loggers()
 
-        # Configure sentry (if available)
-        self._sentry_dsn = os.getenv("SENTRY_DSN", None)
-        if self._sentry_dsn:
-            self._initialize_sentry()
-
     def _initialize_loggers(self):
         """Set up console logging and serialized file logging.
 
         The logger and socket handler are set to log at the logging.DEBUG level
-        and filtering happens at the outputs (console, file, and sentry)."""
+        and filtering happens at the outputs (console, file)."""
 
         logger = logging.getLogger("openwpm")
         logger.setLevel(logging.DEBUG)
@@ -163,57 +150,6 @@
         socketHandler.setLevel(logging.DEBUG)
         logger.addHandler(socketHandler)
 
-    def _sentry_before_send(self, event, hint):
-        """Update sentry events before they are sent
-
-        Note: we want to be very conservative in handling errors here. If this
-        method throws an error, Sentry silently discards it and no record is
-        sent. It's much better to have Sentry send an unparsed error then no
-        error.
-        """
-
-        # Strip "BROWSER X: " and `Extension-X: ` prefix to clean up logs
-        if "logentry" in event and "message" in event["logentry"]:
-            if re.match(BROWSER_PREFIX, event["logentry"]["message"]):
-                event["logentry"]["message"] = re.sub(
-                    BROWSER_PREFIX, "", event["logentry"]["message"]
-                )
-            if re.match(EXTENSION_PREFIX, event["logentry"]["message"]):
-                event["logentry"]["message"] = re.sub(
-                    EXTENSION_PREFIX, "", event["logentry"]["message"]
-                )
-
-        # Add traceback info to fingerprint for logs that contain a traceback
-        try:
-            event["logentry"]["message"] = event["extra"]["exception"].strip()
-        except KeyError:
-            pass
-
-        # Combine neterrors of the same type
-        try:
-            if "about:neterror" in event["extra"]["exception"]:
-                error_text = parse_neterror(event["extra"]["exception"])
-                event["fingerprint"] = ["neterror-%s" % error_text]
-        except Exception:
-            pass
-
-        return event
-
-    def _initialize_sentry(self):
-        """If running a cloud crawl, we can pull the sentry endpoint
-        and related config varibles from the environment"""
-        self._breadcrumb_handler = BreadcrumbHandler(
-            level=self._log_level_sentry_breadcrumb
-        )
-        self._event_handler = EventHandler(level=self._log_level_sentry_event)
-        sentry_sdk.init(dsn=self._sentry_dsn, before_send=self._sentry_before_send)
-        with sentry_sdk.configure_scope() as scope:
-            if self._crawl_reference:
-                scope.set_tag(
-                    "CRAWL_REFERENCE",
-                    self._crawl_reference,
-                )
-
     def _start_listener(self):
         """Start listening socket for remote logs from extension"""
         socket = ServerSocket(name="loggingserver")
@@ -266,7 +202,6 @@ def _handle_serialized_writes(self, obj):
         """Handle records that must be serialized to the main process
 
         This is currently records that are written to a file on disk
-        and those sent to Sentry.
         """
         if obj["exc_info"]:
             obj["exc_info"] = dill.loads(obj["exc_info"])
@@ -274,11 +209,6 @@
             obj["args"] = dill.loads(obj["args"])
         record = logging.makeLogRecord(obj)
         self._file_handler.emit(record)
-        if self._sentry_dsn:
-            if record.levelno >= self._breadcrumb_handler.level:
-                self._breadcrumb_handler.handle(record)
-            if record.levelno >= self._event_handler.level:
-                self._event_handler.handle(record)
 
     def close(self):
         self._status_queue.put("SHUTDOWN")
diff --git a/pyproject.toml b/pyproject.toml
index ec673b532..079b9f50b 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -47,7 +47,7 @@ disallow_incomplete_defs = false
 
 [[tool.mypy.overrides]]
 module = "test.*"
-allow_untyped_defs = true
+disallow_untyped_defs = false
 
 [tool.coverage.run]
 concurrency = ["multiprocessing"]
diff --git a/scripts/environment-unpinned.yaml b/scripts/environment-unpinned.yaml
index 07dbdf64b..5688e0cf4 100644
--- a/scripts/environment-unpinned.yaml
+++ b/scripts/environment-unpinned.yaml
@@ -24,7 +24,6 @@ dependencies:
   - redis-py
   - s3fs
   - selenium
-  - sentry-sdk
   - tabulate
   - tblib
   - wget