Skip to content

Commit

Permalink
feat(otel): remove sentry
Browse files Browse the repository at this point in the history
  • Loading branch information
vringar committed Jul 21, 2024
1 parent 8a4dc8d commit 076f053
Show file tree
Hide file tree
Showing 6 changed files with 7 additions and 116 deletions.
39 changes: 0 additions & 39 deletions crawler.py
Original file line number Diff line number Diff line change
Expand Up @@ -10,8 +10,6 @@
from types import FrameType
from typing import Any, Callable, List, Literal, Optional

import sentry_sdk

from openwpm import mp_logger
from openwpm.command_sequence import CommandSequence
from openwpm.config import BrowserParams, ManagerParams
Expand Down Expand Up @@ -52,8 +50,6 @@
SAVE_CONTENT = os.getenv("SAVE_CONTENT", "")
PREFS = os.getenv("PREFS", None)


SENTRY_DSN = os.getenv("SENTRY_DSN", None)
LOGGER_SETTINGS = mp_logger.parse_config_from_env()

if CALLSTACK_INSTRUMENT is True:
Expand Down Expand Up @@ -114,38 +110,6 @@
logger_kwargs=LOGGER_SETTINGS,
)

# At this point, Sentry should be initiated
if SENTRY_DSN:
# Add crawler.py-specific context
with sentry_sdk.configure_scope() as scope:
# tags generate breakdown charts and search filters
scope.set_tag("CRAWL_DIRECTORY", CRAWL_DIRECTORY)
scope.set_tag("GCS_BUCKET", GCS_BUCKET)
scope.set_tag("DISPLAY_MODE", DISPLAY_MODE)
scope.set_tag("HTTP_INSTRUMENT", HTTP_INSTRUMENT)
scope.set_tag("COOKIE_INSTRUMENT", COOKIE_INSTRUMENT)
scope.set_tag("NAVIGATION_INSTRUMENT", NAVIGATION_INSTRUMENT)
scope.set_tag("JS_INSTRUMENT", JS_INSTRUMENT)
scope.set_tag("JS_INSTRUMENT_SETTINGS", JS_INSTRUMENT_SETTINGS)
scope.set_tag("CALLSTACK_INSTRUMENT", CALLSTACK_INSTRUMENT)
scope.set_tag("SAVE_CONTENT", SAVE_CONTENT)
scope.set_tag("DWELL_TIME", DWELL_TIME)
scope.set_tag("TIMEOUT", TIMEOUT)
scope.set_tag("MAX_JOB_RETRIES", MAX_JOB_RETRIES)
scope.set_tag("CRAWL_REFERENCE", "%s/%s" % (GCS_BUCKET, CRAWL_DIRECTORY))
# context adds additional information that may be of interest
if PREFS:
scope.set_context("PREFS", json.loads(PREFS))
scope.set_context(
"crawl_config",
{
"REDIS_QUEUE_NAME": REDIS_QUEUE_NAME,
},
)
# Send a sentry error message (temporarily - to easily be able
# to compare error frequencies to crawl worker instance count)
sentry_sdk.capture_message("Crawl worker started")

# Connect to job queue
job_queue = rediswq.RedisWQ(
name=REDIS_QUEUE_NAME, host=REDIS_HOST, max_retries=MAX_JOB_RETRIES
Expand Down Expand Up @@ -234,6 +198,3 @@ def callback(success: bool) -> None:
else:
manager.logger.info("Job queue finished, exiting.")
manager.close()

if SENTRY_DSN:
sentry_sdk.capture_message("Crawl worker finished")
5 changes: 2 additions & 3 deletions environment.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -15,7 +15,7 @@ dependencies:
- leveldb=1.23
- multiprocess=0.70.16
- mypy=1.10.1
- nodejs=22.5.0
- nodejs=22.5.1
- pandas=2.2.2
- pillow=10.4.0
- pip=24.0
Expand All @@ -25,14 +25,13 @@ dependencies:
- pyarrow=16.1.0
- pytest-asyncio=0.23.8
- pytest-cov=5.0.0
- pytest=8.2.2
- pytest=8.3.1
- python=3.11.9
- pyvirtualdisplay=3.0
- recommonmark=0.7.1
- redis-py=5.0.7
- s3fs=2024.6.1
- selenium=4.22.0
- sentry-sdk=2.10.0
- sphinx-markdown-tables=0.0.17
- sphinx=7.4.7
- tabulate=0.9.0
Expand Down
4 changes: 3 additions & 1 deletion openwpm/browser_manager.py
Original file line number Diff line number Diff line change
Expand Up @@ -12,11 +12,12 @@
import traceback
from pathlib import Path
from queue import Empty as EmptyQueue
from typing import TYPE_CHECKING, Any, Dict, Optional, Tuple, Type, Union
from typing import TYPE_CHECKING, Any, Dict, Optional, Tuple, Type, Union, cast

import psutil
from multiprocess import Queue
from opentelemetry import trace
from opentelemetry.util.types import AttributeValue
from selenium.common.exceptions import WebDriverException
from tblib import Traceback, pickling_support

Expand Down Expand Up @@ -469,6 +470,7 @@ def execute_command_sequence(
for k in list(table_entry.keys()):
if table_entry[k] is None:
del table_entry[k]
table_entry = cast(dict[str, AttributeValue], table_entry)
trace.get_current_span().set_attributes(table_entry)

if command_status == "critical":
Expand Down
72 changes: 1 addition & 71 deletions openwpm/mp_logger.py
Original file line number Diff line number Diff line change
Expand Up @@ -13,9 +13,7 @@
from typing import Optional

import dill
import sentry_sdk
from multiprocess import JoinableQueue
from sentry_sdk.integrations.logging import BreadcrumbHandler, EventHandler
from tblib import pickling_support

from .commands.utils.webdriver_utils import parse_neterror
Expand All @@ -29,8 +27,6 @@
ENV_CONFIG_VARS = [
"LOG_LEVEL_CONSOLE",
"LOG_LEVEL_FILE",
"LOG_LEVEL_SENTRY_BREADCRUMB",
"LOG_LEVEL_SENTRY_EVENT",
]


Expand Down Expand Up @@ -104,30 +100,21 @@ def __init__(
crawl_reference: Optional[str] = None,
log_level_console=logging.INFO,
log_level_file=logging.DEBUG,
log_level_sentry_breadcrumb=logging.DEBUG,
log_level_sentry_event=logging.ERROR,
) -> None:
self._crawl_reference = crawl_reference
self._log_level_console = log_level_console
self._log_level_file = log_level_file
self._log_level_sentry_breadcrumb = log_level_sentry_breadcrumb
self._log_level_sentry_event = log_level_sentry_event
# Configure log handlers
self._status_queue = JoinableQueue()
self._log_file = os.path.expanduser(log_file)

self._initialize_loggers()

# Configure sentry (if available)
self._sentry_dsn = os.getenv("SENTRY_DSN", None)
if self._sentry_dsn:
self._initialize_sentry()

def _initialize_loggers(self):
"""Set up console logging and serialized file logging.
The logger and socket handler are set to log at the logging.DEBUG level
and filtering happens at the outputs (console, file, and sentry)."""
and filtering happens at the outputs (console, file)."""
logger = logging.getLogger("openwpm")
logger.setLevel(logging.DEBUG)

Expand Down Expand Up @@ -163,57 +150,6 @@ def _initialize_loggers(self):
socketHandler.setLevel(logging.DEBUG)
logger.addHandler(socketHandler)

def _sentry_before_send(self, event, hint):
"""Update sentry events before they are sent
Note: we want to be very conservative in handling errors here. If this
method throws an error, Sentry silently discards it and no record is
sent. It's much better to have Sentry send an unparsed error than no
error.
"""

# Strip "BROWSER X: " and `Extension-X: ` prefix to clean up logs
if "logentry" in event and "message" in event["logentry"]:
if re.match(BROWSER_PREFIX, event["logentry"]["message"]):
event["logentry"]["message"] = re.sub(
BROWSER_PREFIX, "", event["logentry"]["message"]
)
if re.match(EXTENSION_PREFIX, event["logentry"]["message"]):
event["logentry"]["message"] = re.sub(
EXTENSION_PREFIX, "", event["logentry"]["message"]
)

# Add traceback info to fingerprint for logs that contain a traceback
try:
event["logentry"]["message"] = event["extra"]["exception"].strip()
except KeyError:
pass

# Combine neterrors of the same type
try:
if "about:neterror" in event["extra"]["exception"]:
error_text = parse_neterror(event["extra"]["exception"])
event["fingerprint"] = ["neterror-%s" % error_text]
except Exception:
pass

return event

def _initialize_sentry(self):
"""If running a cloud crawl, we can pull the sentry endpoint
and related config variables from the environment"""
self._breadcrumb_handler = BreadcrumbHandler(
level=self._log_level_sentry_breadcrumb
)
self._event_handler = EventHandler(level=self._log_level_sentry_event)
sentry_sdk.init(dsn=self._sentry_dsn, before_send=self._sentry_before_send)
with sentry_sdk.configure_scope() as scope:
if self._crawl_reference:
scope.set_tag(
"CRAWL_REFERENCE",
self._crawl_reference,
)

def _start_listener(self):
"""Start listening socket for remote logs from extension"""
socket = ServerSocket(name="loggingserver")
Expand Down Expand Up @@ -266,19 +202,13 @@ def _handle_serialized_writes(self, obj):
"""Handle records that must be serialized to the main process
Currently these are the records that are written to a file on disk.
"""
if obj["exc_info"]:
obj["exc_info"] = dill.loads(obj["exc_info"])
if obj["args"]:
obj["args"] = dill.loads(obj["args"])
record = logging.makeLogRecord(obj)
self._file_handler.emit(record)
if self._sentry_dsn:
if record.levelno >= self._breadcrumb_handler.level:
self._breadcrumb_handler.handle(record)
if record.levelno >= self._event_handler.level:
self._event_handler.handle(record)

def close(self):
self._status_queue.put("SHUTDOWN")
Expand Down
2 changes: 1 addition & 1 deletion pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -47,7 +47,7 @@ disallow_incomplete_defs = false

[[tool.mypy.overrides]]
module = "test.*"
allow_untyped_defs = true
disallow_untyped_defs = false

[tool.coverage.run]
concurrency = ["multiprocessing"]
Expand Down
1 change: 0 additions & 1 deletion scripts/environment-unpinned.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -24,7 +24,6 @@ dependencies:
- redis-py
- s3fs
- selenium
- sentry-sdk
- tabulate
- tblib
- wget
Expand Down

0 comments on commit 076f053

Please sign in to comment.