Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

OpenWPM StorageWatchdog complete #1039

Merged
merged 10 commits into from
Oct 11, 2023
98 changes: 98 additions & 0 deletions demo_watchdog.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,98 @@
import argparse
from pathlib import Path

import tranco

from custom_command import LinkCountingCommand
from openwpm.command_sequence import CommandSequence
from openwpm.commands.browser_commands import GetCommand
from openwpm.config import BrowserParams, ManagerParams
from openwpm.storage.sql_provider import SQLiteStorageProvider
from openwpm.task_manager import TaskManager

# Command-line interface for the demo. Passing --tranco makes the script crawl
# entries from the Tranco list instead of the built-in sample sites.
parser = argparse.ArgumentParser()
parser.add_argument(
    "--tranco",
    action="store_true",
    default=False,
    help="crawl the top sites of the latest Tranco list instead of the sample sites",
)
# NOTE: the original add_argument line ended with a stray comma, which turned the
# statement into a throw-away one-element tuple; it has been removed.
args = parser.parse_args()

# Pick the crawl targets: the built-in sample sites unless --tranco was given,
# in which case the first 10 entries of the latest Tranco list are used.
if not args.tranco:
    sites = [
        "http://www.example.com",
        "http://www.princeton.edu",
        "http://citp.princeton.edu/",
        "https://www.google.com",
        "https://www.minecraft.net",
        "https://www.nytimes.com",
        "https://www.github.com",
    ]
else:
    # Load the latest tranco list. See https://tranco-list.eu/
    print("Loading tranco top sites list...")
    t = tranco.Tranco(cache=True, cache_dir=".tranco")
    latest_list = t.list()
    # The list yields bare domains, so a scheme is prepended to each one.
    sites = [f"http://{domain}" for domain in latest_list.top(10)]

# Number of browser instances configured for this crawl.
NUM_BROWSERS = 2

# Default ManagerParams, told how many browsers will be used.
manager_params = ManagerParams(num_browsers=NUM_BROWSERS)

# Build one headless BrowserParams per browser and switch on the instruments
# this demo records (edit inside the loop for per-browser settings).
browser_params = []
for _ in range(NUM_BROWSERS):
    browser_param = BrowserParams(display_mode="headless")
    # Record HTTP Requests and Responses
    browser_param.http_instrument = True
    # Record cookie changes
    browser_param.cookie_instrument = True
    # Record Navigations
    browser_param.navigation_instrument = True
    # Record JS Web API calls
    browser_param.js_instrument = True
    # Recording the callstack of all WebRequests is left disabled: according to
    # error messages, callstack_instrument is currently broken.
    # browser_param.callstack_instrument = True
    # Record DNS resolution
    browser_param.dns_instrument = True
    # Optionally specify the location of temporary files. Ensure the directory
    # exists when specifying.
    # browser_param.tmp_profile_dir = "/"
    browser_params.append(browser_param)

# Crawl-wide TaskManager settings: where crawl data and the log file are written.
manager_params.data_directory = Path("./datadir/")
manager_params.log_path = Path("./datadir/openwpm.log")

# memory_watchdog, process_watchdog and storage_watchdog are useful for large
# scale cloud crawls. Please refer to
# docs/Configuration.md#platform-configuration-options for more information.
# manager_params.memory_watchdog = True
# manager_params.process_watchdog = True

# Enable the storage watchdog with a threshold of 50 MiB, expressed in bytes.
STORAGE_WATCHDOG_THRESHOLD_BYTES = 50 * 1024 * 1024
manager_params.storage_watchdog_enable = STORAGE_WATCHDOG_THRESHOLD_BYTES

# Run the crawl: every site gets its own CommandSequence, which is handed to the
# TaskManager. Commands time out by default after 60 seconds.
storage_provider = SQLiteStorageProvider(Path("./datadir/crawl-data.sqlite"))
# NOTE(review): the trailing None is passed positionally, as in the original;
# presumably it is the unstructured storage provider - confirm against TaskManager.
with TaskManager(manager_params, browser_params, storage_provider, None) as manager:
    for rank, site in enumerate(sites):

        # `site` is bound as a default argument so each callback reports its own
        # URL rather than whatever the loop variable holds when it is invoked.
        def callback(success: bool, val: str = site) -> None:
            outcome = "successfully" if success else "unsuccessfully"
            print(f"CommandSequence for {val} ran {outcome}")

        # Parallelize sites over all number of browsers set above.
        command_sequence = CommandSequence(site, site_rank=rank, callback=callback)

        # Start by visiting the page
        command_sequence.append_command(GetCommand(url=site, sleep=3), timeout=60)
        # Have a look at custom_command.py to see how to implement your own command
        command_sequence.append_command(LinkCountingCommand())

        # Run commands across all browsers (simple parallelization)
        manager.execute_command_sequence(command_sequence)
22 changes: 12 additions & 10 deletions environment.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -3,39 +3,40 @@ channels:
- main
dependencies:
- beautifulsoup4=4.12.2
- black=23.7.0
- black=23.9.1
- click=8.1.7
- codecov=2.1.13
- dill=0.3.7
- dill=0.3.7
- easyprocess=1.1
- gcsfs=2023.9.0
- gcsfs=2023.9.1
- geckodriver=0.33.0
- ipython=8.15.0
- isort=5.12.0
- leveldb=1.23
- multiprocess=0.70.15
- mypy=1.5.1
- nodejs=20.6.0
- nodejs=20.7.0
- pandas=2.1.0
- pillow=10.0.0
- pillow=10.0.1
- pip=23.2.1
- pre-commit=3.4.0
- psutil=5.9.5
- pyarrow=13.0.0
- pytest-asyncio=0.21.1
- pytest-cov=4.1.0
- pytest=7.4.1
- pytest=7.4.2
- python=3.11.5
- pyvirtualdisplay=3.0
- recommonmark=0.7.1
- redis-py=5.0.0
- s3fs=2023.9.0
- s3fs=2023.9.1
- selenium=4.12.0
- sentry-sdk=1.30.0
- sentry-sdk=1.31.0
- sphinx-markdown-tables=0.0.17
- sphinx=7.2.5
- sphinx=7.2.6
- tabulate=0.9.0
- tblib=1.7.0
- tblib=2.0.0
- wget=1.20.3
- pip:
- dataclasses-json==0.6.0
Expand All @@ -44,6 +45,7 @@ dependencies:
- plyvel==1.5.0
- tranco==0.6
- types-pyyaml==6.0.12.11
- types-redis==4.6.0.5
- types-redis==4.6.0.6
- types-tabulate==0.9.0.3
- watchdog==3.0.0
gridl0ck marked this conversation as resolved.
Show resolved Hide resolved
name: openwpm
28 changes: 27 additions & 1 deletion openwpm/browser_manager.py
Original file line number Diff line number Diff line change
Expand Up @@ -34,6 +34,7 @@
kill_process_and_children,
parse_traceback_for_sentry,
)
from .utilities.storage_watchdog import StorageWatchdogThread

pickling_support.install()

Expand Down Expand Up @@ -501,6 +502,29 @@ def execute_command_sequence(
if task_manager.closing:
return

# Allow StorageWatchdog to utilize built-in browser reset functionality
# which results in a graceful restart of the browser instance
if self.manager_params.storage_watchdog_enable:

# storage_checker = threading.Thread(target=self.manager_params.storage_watchdog_obj.periodic_check, args=([self.current_profile_path, self]))
# storage_checker.daemon = True
# storage_checker.name = f"OpenWPM-storage-checker-{self.browser_id}"
storage_checker = StorageWatchdogThread(self.manager_params.storage_watchdog_obj, [
self.current_profile_path,
self
])
storage_checker.daemon = True
storage_checker.name = ""
storage_checker.start()
storage_checker.join()

# storage_checker.start()
# storage_checker.join()

# reset = self.manager_params.storage_watchdog_obj.periodic_check(self.current_profile_path, self)
reset = storage_checker.ret_value


if self.restart_required or reset:
success = self.restart_browser_manager(clear_profile=reset)
if not success:
Expand Down Expand Up @@ -564,7 +588,9 @@ def kill_browser_manager(self):
"type %s" % (self.browser_id, str(self.display_pid))
)
if self.display_port is not None: # xvfb display lock
lockfile = "/tmp/.X%s-lock" % self.display_port
# lockfile = "/tmp/.X%s-lock" % self.display_port
lockfile = os.path.join(self.browser_params.tmp_profile_dir, f".X{self.display_port}-lock")

try:
os.remove(lockfile)
except OSError:
Expand Down
31 changes: 31 additions & 0 deletions openwpm/config.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,7 @@
from json import JSONEncoder
from pathlib import Path
from typing import Any, Dict, List, Literal, Optional, Tuple, Union
import tempfile

from dataclasses_json import DataClassJsonMixin
from dataclasses_json import config as DCJConfig
Expand Down Expand Up @@ -99,6 +100,14 @@ class BrowserParams(DataClassJsonMixin):
profile_archive_dir: Optional[Path] = field(
default=None, metadata=DCJConfig(encoder=path_to_str, decoder=str_to_path)
)

tmp_profile_dir: str = tempfile.gettempdir()
"""
The tmp_profile_dir defaults to the OS's temporary file folder (typically /tmp) and is where the generated
browser profiles and residual files are stored.
"""


recovery_tar: Optional[Path] = None
donottrack: bool = False
tracking_protection: bool = False
Expand Down Expand Up @@ -133,6 +142,28 @@ class ManagerParams(DataClassJsonMixin):
"""A watchdog that tries to ensure that no Firefox instance takes up too much memory.
It is mostly useful for long running cloud crawls"""
process_watchdog: bool = False


storage_watchdog_enable: Optional[int] = None
"""A watchdog that serves as a happy medium between killing a browser after each
crawl and allowing the application to still perform quickly. Used as a way to save space
in a limited environment with minimal detriment to speed. This Optional[int] should be the threshold
size of the folder in bytes.
```
# Sample values:
1073741824: 1GB
20971520: 20MB - for testing purposes
52428800: 50MB
73400320: 70MB
104857600: 100MB - IDEAL for 10+ browsers
```
"""

storage_watchdog_obj = None # DO NOT EDIT THIS LINE
"""Stores a handle to the actual watchdog object."""

"""- It is used to create another thread that kills off `GeckoDriver` (or `Xvfb`) instances that haven't been spawned by OpenWPM. (GeckoDriver is used by
=======
"""It is used to create another thread that kills off `GeckoDriver` (or `Xvfb`) instances that haven't been spawned by OpenWPM. (GeckoDriver is used by
Selenium to control Firefox and Xvfb a "virtual display" so we simulate having graphics when running on a server)."""
num_browsers: int = 1
Expand Down
2 changes: 1 addition & 1 deletion openwpm/deploy_browsers/deploy_firefox.py
Original file line number Diff line number Diff line change
Expand Up @@ -35,7 +35,7 @@ def deploy_firefox(

root_dir = os.path.dirname(__file__) # directory of this file

browser_profile_path = Path(tempfile.mkdtemp(prefix="firefox_profile_"))
browser_profile_path = Path(tempfile.mkdtemp(prefix="firefox_profile_", dir=browser_params.tmp_profile_dir))
status_queue.put(("STATUS", "Profile Created", browser_profile_path))

# Use Options instead of FirefoxProfile to set preferences since the
Expand Down
13 changes: 13 additions & 0 deletions openwpm/task_manager.py
Original file line number Diff line number Diff line change
Expand Up @@ -29,6 +29,7 @@
)
from .utilities.multiprocess_utils import kill_process_and_children
from .utilities.platform_utils import get_configuration_string, get_version
from .utilities.storage_watchdog import StorageWatchdog

tblib.pickling_support.install()

Expand Down Expand Up @@ -83,6 +84,8 @@ def __init__(
self.browser_params = browser_params
self._logger_kwargs = logger_kwargs



# Create data directories if they do not exist
if not os.path.exists(manager_params.screenshot_path):
os.makedirs(manager_params.screenshot_path)
Expand Down Expand Up @@ -128,6 +131,16 @@ def __init__(
thread.name = "OpenWPM-watchdog"
thread.start()

# Start the StorageWatchdog
if self.manager_params.storage_watchdog_enable:

storage_watchdog = StorageWatchdog(self.browser_params[0].tmp_profile_dir ,self.manager_params.storage_watchdog_enable)
self.manager_params.storage_watchdog_obj = storage_watchdog
storage_watchdog_thread = threading.Thread(target=storage_watchdog.run, args=())
storage_watchdog_thread.daemon = True
storage_watchdog_thread.name = "OpenWPM-storage-watchdog"

storage_watchdog_thread.start()
# Save crawl config information to database
openwpm_v, browser_v = get_version()
self.storage_controller_handle.save_configuration(
Expand Down
Loading
Loading