From 59ad8df45db363fe2e76a527540f45b8f3ed6789 Mon Sep 17 00:00:00 2001 From: Jalen Morgan Date: Thu, 15 Jun 2023 15:23:32 -0400 Subject: [PATCH 1/4] OpenWPM StorageWatchdog complete --- demo_watchdog.py | 94 ++++++++++++++++++++++ environment.yaml | 1 + openwpm/browser_manager.py | 10 ++- openwpm/config.py | 9 +++ openwpm/deploy_browsers/deploy_firefox.py | 2 +- openwpm/task_manager.py | 9 +++ openwpm/utilities/storage_watchdog.py | 97 +++++++++++++++++++++++ 7 files changed, 220 insertions(+), 2 deletions(-) create mode 100644 demo_watchdog.py create mode 100644 openwpm/utilities/storage_watchdog.py diff --git a/demo_watchdog.py b/demo_watchdog.py new file mode 100644 index 000000000..c01ebd8ce --- /dev/null +++ b/demo_watchdog.py @@ -0,0 +1,94 @@ +import argparse +from pathlib import Path + +import tranco + +from custom_command import LinkCountingCommand +from openwpm.command_sequence import CommandSequence +from openwpm.commands.browser_commands import GetCommand +from openwpm.config import BrowserParams, ManagerParams +from openwpm.storage.sql_provider import SQLiteStorageProvider +from openwpm.task_manager import TaskManager + +parser = argparse.ArgumentParser() +parser.add_argument("--tranco", action="store_true", default=False), +args = parser.parse_args() + +if args.tranco: + # Load the latest tranco list. See https://tranco-list.eu/ + print("Loading tranco top sites list...") + t = tranco.Tranco(cache=True, cache_dir=".tranco") + latest_list = t.list() + sites = ["http://" + x for x in latest_list.top(10)] +else: + sites = [ + "http://www.example.com", + "http://www.princeton.edu", + "http://citp.princeton.edu/", + ] + +# Loads the default ManagerParams +# and NUM_BROWSERS copies of the default BrowserParams +NUM_BROWSERS = 2 +manager_params = ManagerParams(num_browsers=NUM_BROWSERS) +browser_params = [BrowserParams(display_mode="native") for _ in range(NUM_BROWSERS)] + +# Update browser configuration (use this for per-browser settings) +for browser_param in browser_params: + # Record HTTP Requests and Responses + browser_param.http_instrument = True + # Record cookie changes + browser_param.cookie_instrument = True + # Record Navigations + browser_param.navigation_instrument = True + # Record JS Web API calls + browser_param.js_instrument = True + # Record the callstack of all WebRequests made + browser_param.callstack_instrument = True + # Record DNS resolution + browser_param.dns_instrument = True + # Specify the location of temporary files. Ensure directory exists when specifying. + # browser_param.tmp_profile_dir = "/" + +# Update TaskManager configuration (use this for crawl-wide settings) +manager_params.data_directory = Path("./datadir/") +manager_params.log_path = Path("./datadir/openwpm.log") + + +# memory_watchdog, process_watchdog, storage_watchdog are useful for large scale cloud crawls. +# Please refer to docs/Configuration.md#platform-configuration-options for more information +# manager_params.memory_watchdog = True +# manager_params.process_watchdog = True +manager_params.storage_watchdog_enable = True + + +# Commands time out by default after 60 seconds +with TaskManager( + manager_params, + browser_params, + SQLiteStorageProvider(Path("./datadir/crawl-data.sqlite")), + None, +) as manager: + # Visits the sites + + for index, site in enumerate(sites): + + def callback(success: bool, val: str = site) -> None: + print( + f"CommandSequence for {val} ran {'successfully' if success else 'unsuccessfully'}" + ) + + # Parallelize sites over all number of browsers set above. + command_sequence = CommandSequence( + site, + site_rank=index, + callback=callback, + ) + + # Start by visiting the page + command_sequence.append_command(GetCommand(url=site, sleep=3), timeout=60) + # Have a look at custom_command.py to see how to implement your own command + command_sequence.append_command(LinkCountingCommand()) + + # Run commands across all browsers (simple parallelization) + manager.execute_command_sequence(command_sequence) diff --git a/environment.yaml b/environment.yaml index d744ad762..31abb5a29 100644 --- a/environment.yaml +++ b/environment.yaml @@ -46,4 +46,5 @@ dependencies: - types-pyyaml==6.0.12.3 - types-redis==4.4.0.2 - types-tabulate==0.9.0.0 + - watchdog==3.0.0 name: openwpm diff --git a/openwpm/browser_manager.py b/openwpm/browser_manager.py index af7685d17..4a2c513da 100644 --- a/openwpm/browser_manager.py +++ b/openwpm/browser_manager.py @@ -34,6 +34,7 @@ kill_process_and_children, parse_traceback_for_sentry, ) +from .utilities.storage_watchdog import periodic_check pickling_support.install() @@ -501,6 +502,11 @@ def execute_command_sequence( if task_manager.closing: return + # Allow StorageWatchdog to utilize built-in browser reset functionality + # which results in a graceful restart of the browser instance + if self.manager_params.storage_watchdog_enable: + reset = periodic_check(self.current_profile_path, self) + if self.restart_required or reset: success = self.restart_browser_manager(clear_profile=reset) if not success: @@ -564,7 +570,9 @@ def kill_browser_manager(self): "type %s" % (self.browser_id, str(self.display_pid)) ) if self.display_port is not None: # xvfb display lock - lockfile = "/tmp/.X%s-lock" % self.display_port + # lockfile = "/tmp/.X%s-lock" % self.display_port + lockfile = os.path.join(self.browser_params.tmp_profile_dir, f".X{self.display_port}-lock") + try: os.remove(lockfile) except OSError: diff --git a/openwpm/config.py b/openwpm/config.py index 249ef45ef..5ddb4f88e 100644 --- a/openwpm/config.py +++ b/openwpm/config.py @@ -99,6 +99,9 @@ class BrowserParams(DataClassJsonMixin): profile_archive_dir: Optional[Path] = field( default=None, metadata=DCJConfig(encoder=path_to_str, decoder=str_to_path) ) + + tmp_profile_dir: str = "/tmp" + recovery_tar: Optional[Path] = None donottrack: bool = False tracking_protection: bool = False @@ -133,6 +136,12 @@ class ManagerParams(DataClassJsonMixin): """A watchdog that tries to ensure that no Firefox instance takes up too much memory. It is mostly useful for long running cloud crawls""" process_watchdog: bool = False + + """A watchdog that serves as a happy medium between killing a browser after each + crawl and allowing the application to still perform quickly. Used as a way to save space + in a limited environment with minimal detriment to speed.""" + storage_watchdog_enable: bool = False + """- It is used to create another thread that kills off `GeckoDriver` (or `Xvfb`) instances that haven't been spawned by OpenWPM. (GeckoDriver is used by Selenium to control Firefox and Xvfb a "virtual display" so we simulate having graphics when running on a server).""" num_browsers: int = 1 diff --git a/openwpm/deploy_browsers/deploy_firefox.py b/openwpm/deploy_browsers/deploy_firefox.py index 0af24686f..5b8784c77 100755 --- a/openwpm/deploy_browsers/deploy_firefox.py +++ b/openwpm/deploy_browsers/deploy_firefox.py @@ -33,7 +33,7 @@ def deploy_firefox( root_dir = os.path.dirname(__file__) # directory of this file - browser_profile_path = Path(tempfile.mkdtemp(prefix="firefox_profile_")) + browser_profile_path = Path(tempfile.mkdtemp(prefix="firefox_profile_", dir=browser_params.tmp_profile_dir)) status_queue.put(("STATUS", "Profile Created", browser_profile_path)) # Use Options instead of FirefoxProfile to set preferences since the diff --git a/openwpm/task_manager.py b/openwpm/task_manager.py index c3d63a150..6d8d83304 100644 --- a/openwpm/task_manager.py +++ b/openwpm/task_manager.py @@ -29,6 +29,7 @@ ) from .utilities.multiprocess_utils import kill_process_and_children from .utilities.platform_utils import get_configuration_string, get_version +from .utilities.storage_watchdog import StorageWatchdog tblib.pickling_support.install() @@ -128,6 +129,14 @@ def __init__( thread.name = "OpenWPM-watchdog" thread.start() + # Start the StorageWatchdog + if self.manager_params.storage_watchdog_enable: + storage_watchdog = StorageWatchdog() + storage_watchdog_thread = threading.Thread(target=storage_watchdog.run, args=()) + storage_watchdog_thread.daemon = True + thread.name = "OpenWPM-storage-watchdog" + + storage_watchdog_thread.start() # Save crawl config information to database openwpm_v, browser_v = get_version() self.storage_controller_handle.save_configuration( diff --git a/openwpm/utilities/storage_watchdog.py b/openwpm/utilities/storage_watchdog.py new file mode 100644 index 000000000..3351cdb5b --- /dev/null +++ b/openwpm/utilities/storage_watchdog.py @@ -0,0 +1,97 @@ +import time +import logging, math, time +import subprocess, os +from watchdog.observers import Observer + + +# Nifty little function to prettyfi the output. Takes in a number of bytes and spits out the +# corresponding size in the largest unit it is able to convert to. +def convert_size(size_bytes): + if size_bytes == 0: + return "0B" + size_name = ("B", "KB", "MB", "GB", "TB", "PB", "EB", "ZB", "YB") + i = int(math.floor(math.log(size_bytes, 1024))) + p = math.pow(1024, i) + s = round(size_bytes / p, 2) + return "%s %s" % (s, size_name[i]) + +def total_folder_size(startup=False, debug=False, root_dir="/tmp"): + + running_total = 0 + if not startup: + for file in os.listdir(root_dir): + if "firefox" in file or ".xpi" in file or "owpm" in file or "Temp" in file: + path = os.path.join(root_dir,file) + try: + running_total += int(subprocess.check_output(['du','-s', '-b', path]).split()[0].decode('utf-8')) + except: + pass + if debug: + print(f"Currently using: {convert_size(running_total)} of storage on disk...") + return + return (f"Currently using: {convert_size(running_total)} of storage on disk...") + else: + for file in os.listdir(root_dir): + path = os.path.join(root_dir,file) + try: + running_total += int(subprocess.check_output(['du','-s', '-b', path], stderr=subprocess.DEVNULL).split()[0].decode('utf-8')) + except: + pass + if debug: + print(f"Readable files in {root_dir} folder take up {convert_size(running_total)} of storage on disk at start time...") + return + return (f"Readable files in {root_dir} folder take up {convert_size(running_total)} of storage on disk at start time...") + +class StorageWatchdog(): + # DIRECTORY_TO_WATCH = "/mnt/04dc803b-5e97-4b16-bdaf-80845c61942d" + + def __init__(self): + self.observer = Observer() + + def run(self) -> None: + logger = logging.getLogger("openwpm") + logger.info("Starting the StorageWatchdog...") + logger.info(total_folder_size(startup=True)) + try: + while True: + time.sleep(300) # Give storage updates every 5 minutes + logger.info(total_folder_size()) + + except: + self.observer.stop() + print("Error") + + self.observer.join() + + +def start_storage_watchdog(): + w = StorageWatchdog() + w.run() + +def periodic_check(profile_path, obj): + logger = logging.getLogger("openwpm") + # 1073741824: # 1GB + # 20971520: # 20MB - for testing purposes + # 52428800: # 50MB + # 73400320: # 70MB + # 104857600: 100MB - IDEAL for 10+ browsers + + # Max Size before a restart expressed in bytes + MAX_DIRSIZE = 104857600 + READABLE_MAX_DIRSIZE = convert_size(MAX_DIRSIZE) + + dirsize = int(subprocess.check_output(['du','-s', '-b', profile_path]).split()[0].decode('utf-8')) + readable_dirsize = convert_size(dirsize) + + if dirsize < MAX_DIRSIZE: # 100MB + + logger.info(f"Current browser profile directory {profile_path} size is less than {READABLE_MAX_DIRSIZE}: {readable_dirsize}") + return False + else: + obj.restart_required = True + logger.info(f"{profile_path}: Folder scheduled to be deleted and recovered {readable_dirsize} of storage.") + return True + +if __name__ == '__main__': + + total_folder_size(startup=True, debug=True) From ad7aa3f121a105bac318450b87620ff0315aeb2e Mon Sep 17 00:00:00 2001 From: Jalen Morgan Date: Mon, 3 Jul 2023 16:09:02 -0400 Subject: [PATCH 2/4] Revised tmp_profile_dir member to use the tempfile.gettempdir function for increased compatibility --- openwpm/config.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/openwpm/config.py b/openwpm/config.py index 5ddb4f88e..34906f5d6 100644 --- a/openwpm/config.py +++ b/openwpm/config.py @@ -3,6 +3,7 @@ from json import JSONEncoder from pathlib import Path from typing import Any, Dict, List, Literal, Optional, Tuple, Union +import tempfile from dataclasses_json import DataClassJsonMixin from dataclasses_json import config as DCJConfig @@ -100,7 +101,8 @@ class BrowserParams(DataClassJsonMixin): default=None, metadata=DCJConfig(encoder=path_to_str, decoder=str_to_path) ) - tmp_profile_dir: str = "/tmp" + # tmp_profile_dir: str = "/tmp" + tmp_profile_dir: str = tempfile.gettempdir() recovery_tar: Optional[Path] = None donottrack: bool = False From cedd0b21a6addb86e419a628a5860a0612b81836 Mon Sep 17 00:00:00 2001 From: Jalen Morgan Date: Mon, 3 Jul 2023 16:29:10 -0400 Subject: [PATCH 3/4] Restored version changes added watchdog 3.0.0 as a requirement. --- environment.yaml | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/environment.yaml b/environment.yaml index 5eeef8114..0f53769f6 100644 --- a/environment.yaml +++ b/environment.yaml @@ -43,7 +43,8 @@ dependencies: - jsonschema==4.17.3 - plyvel==1.5.0 - tranco==0.6 - - types-pyyaml==6.0.12.3 - - types-redis==4.4.0.2 - - types-tabulate==0.9.0.0 + - types-pyyaml==6.0.12.10 + - types-redis==4.5.5.2 + - types-tabulate==0.9.0.2 + - watchdog==3.0.0 name: openwpm From 58b28c26d3cca6a59f4d174f7b1fca74279d3e39 Mon Sep 17 00:00:00 2001 From: Jalen Morgan Date: Wed, 20 Sep 2023 18:53:49 -0400 Subject: [PATCH 4/4] Implemented changes as requested, with significant alteration to the StorageWatchdog backend. --- demo_watchdog.py | 12 ++- environment.yaml | 54 ++++++------- openwpm/browser_manager.py | 22 +++++- openwpm/config.py | 25 +++++- openwpm/task_manager.py | 8 +- openwpm/utilities/storage_watchdog.py | 106 ++++++++++++++++++-------- scripts/environment-unpinned.yaml | 1 + 7 files changed, 158 insertions(+), 70 deletions(-) diff --git a/demo_watchdog.py b/demo_watchdog.py index c01ebd8ce..1cfd3d9e5 100644 --- a/demo_watchdog.py +++ b/demo_watchdog.py @@ -25,13 +25,17 @@ "http://www.example.com", "http://www.princeton.edu", "http://citp.princeton.edu/", + "https://www.google.com", + "https://www.minecraft.net", + "https://www.nytimes.com", + "https://www.github.com" ] # Loads the default ManagerParams # and NUM_BROWSERS copies of the default BrowserParams NUM_BROWSERS = 2 manager_params = ManagerParams(num_browsers=NUM_BROWSERS) -browser_params = [BrowserParams(display_mode="native") for _ in range(NUM_BROWSERS)] +browser_params = [BrowserParams(display_mode="headless") for _ in range(NUM_BROWSERS)] # Update browser configuration (use this for per-browser settings) for browser_param in browser_params: @@ -44,7 +48,8 @@ # Record JS Web API calls browser_param.js_instrument = True # Record the callstack of all WebRequests made - browser_param.callstack_instrument = True + # browser_param.callstack_instrument = True # According to error messages, callstack_instrument is now + # a broken function # Record DNS resolution browser_param.dns_instrument = True # Specify the location of temporary files. Ensure directory exists when specifying. @@ -59,8 +64,7 @@ # Please refer to docs/Configuration.md#platform-configuration-options for more information # manager_params.memory_watchdog = True # manager_params.process_watchdog = True -manager_params.storage_watchdog_enable = True - +manager_params.storage_watchdog_enable = 52428800 # Commands time out by default after 60 seconds with TaskManager( diff --git a/environment.yaml b/environment.yaml index 0f53769f6..cebd6ba94 100644 --- a/environment.yaml +++ b/environment.yaml @@ -3,48 +3,48 @@ channels: - main dependencies: - beautifulsoup4=4.12.2 -- black=23.3.0 -- click=8.1.3 +- black=23.9.1 +- click=8.1.7 - codecov=2.1.13 -- dill=0.3.6 +- dill=0.3.7 - easyprocess=1.1 -- gcsfs=2023.6.0 +- gcsfs=2023.9.1 - geckodriver=0.33.0 -- ipython=8.14.0 +- ipython=8.15.0 - isort=5.12.0 - leveldb=1.23 -- multiprocess=0.70.14 -- mypy=1.3.0 -- nodejs=18.15.0 -- pandas=2.0.2 -- pillow=9.5.0 -- pip=23.1.2 -- pre-commit=3.3.3 +- multiprocess=0.70.15 +- mypy=1.5.1 +- nodejs=20.7.0 +- pandas=2.1.0 +- pillow=10.0.1 +- pip=23.2.1 +- pre-commit=3.4.0 - psutil=5.9.5 -- pyarrow=12.0.0 -- pytest-asyncio=0.21.0 +- pyarrow=13.0.0 +- pytest-asyncio=0.21.1 - pytest-cov=4.1.0 -- pytest=7.3.2 -- python=3.11.4 +- pytest=7.4.2 +- python=3.11.5 - pyvirtualdisplay=3.0 - recommonmark=0.7.1 -- redis-py=4.5.5 -- s3fs=2023.6.0 -- selenium=4.10.0 -- sentry-sdk=1.21.1 +- redis-py=5.0.0 +- s3fs=2023.9.1 +- selenium=4.12.0 +- sentry-sdk=1.31.0 - sphinx-markdown-tables=0.0.17 -- sphinx=7.0.1 +- sphinx=7.2.6 - tabulate=0.9.0 -- tblib=1.7.0 +- tblib=2.0.0 - wget=1.20.3 - pip: - - dataclasses-json==0.5.8 + - dataclasses-json==0.6.0 - domain-utils==0.7.1 - - jsonschema==4.17.3 + - jsonschema==4.19.0 - plyvel==1.5.0 - tranco==0.6 - - types-pyyaml==6.0.12.10 - - types-redis==4.5.5.2 - - types-tabulate==0.9.0.2 + - types-pyyaml==6.0.12.11 + - types-redis==4.6.0.6 + - types-tabulate==0.9.0.3 - watchdog==3.0.0 name: openwpm diff --git a/openwpm/browser_manager.py b/openwpm/browser_manager.py index 4a2c513da..29f282150 100644 --- a/openwpm/browser_manager.py +++ b/openwpm/browser_manager.py @@ -34,7 +34,7 @@ kill_process_and_children, parse_traceback_for_sentry, ) -from .utilities.storage_watchdog import periodic_check +from .utilities.storage_watchdog import StorageWatchdogThread pickling_support.install() @@ -505,7 +505,25 @@ def execute_command_sequence( # Allow StorageWatchdog to utilize built-in browser reset functionality # which results in a graceful restart of the browser instance if self.manager_params.storage_watchdog_enable: - reset = periodic_check(self.current_profile_path, self) + + # storage_checker = threading.Thread(target=self.manager_params.storage_watchdog_obj.periodic_check, args=([self.current_profile_path, self])) + # storage_checker.daemon = True + # storage_checker.name = f"OpenWPM-storage-checker-{self.browser_id}" + storage_checker = StorageWatchdogThread(self.manager_params.storage_watchdog_obj, [ + self.current_profile_path, + self + ]) + storage_checker.daemon = True + storage_checker.name = "" + storage_checker.start() + storage_checker.join() + + # storage_checker.start() + # storage_checker.join() + + # reset = self.manager_params.storage_watchdog_obj.periodic_check(self.current_profile_path, self) + reset = storage_checker.ret_value + if self.restart_required or reset: success = self.restart_browser_manager(clear_profile=reset) diff --git a/openwpm/config.py b/openwpm/config.py index 34ba78735..19586a1e8 100644 --- a/openwpm/config.py +++ b/openwpm/config.py @@ -101,8 +101,12 @@ class BrowserParams(DataClassJsonMixin): default=None, metadata=DCJConfig(encoder=path_to_str, decoder=str_to_path) ) - # tmp_profile_dir: str = "/tmp" tmp_profile_dir: str = tempfile.gettempdir() + """ + The tmp_profile_dir defaults to the OS's temporary file folder (typically /tmp) and is where the generated + browser profiles and residual files are stored. + """ + recovery_tar: Optional[Path] = None donottrack: bool = False @@ -138,11 +142,24 @@ class ManagerParams(DataClassJsonMixin): """A watchdog that tries to ensure that no Firefox instance takes up too much memory. It is mostly useful for long running cloud crawls""" process_watchdog: bool = False - + + storage_watchdog_enable: Optional[int] = None """A watchdog that serves as a happy medium between killing a browser after each crawl and allowing the application to still perform quickly. Used as a way to save space - in a limited environment with minimal detriment to speed.""" - storage_watchdog_enable: bool = False + in a limited environment with minimal detriment to speed. This Optional[int] should be the threshold + size of the folder in bytes. + ``` + # Sample values: + 1073741824: 1GB + 20971520: 20MB - for testing purposes + 52428800: 50MB + 73400320: 70MB + 104857600: 100MB - IDEAL for 10+ browsers + ``` + """ + + storage_watchdog_obj = None # DO NOT EDIT THIS LINE + """Stores a handle to the actual watchdog object.""" """- It is used to create another thread that kills off `GeckoDriver` (or `Xvfb`) instances that haven't been spawned by OpenWPM. (GeckoDriver is used by Selenium to control Firefox and Xvfb a "virtual display" so we simulate having graphics when running on a server).""" diff --git a/openwpm/task_manager.py b/openwpm/task_manager.py index 6d8d83304..b5cd601fa 100644 --- a/openwpm/task_manager.py +++ b/openwpm/task_manager.py @@ -84,6 +84,8 @@ def __init__( self.browser_params = browser_params self._logger_kwargs = logger_kwargs + + # Create data directories if they do not exist if not os.path.exists(manager_params.screenshot_path): os.makedirs(manager_params.screenshot_path) @@ -131,10 +133,12 @@ def __init__( # Start the StorageWatchdog if self.manager_params.storage_watchdog_enable: - storage_watchdog = StorageWatchdog() + + storage_watchdog = StorageWatchdog(self.browser_params[0].tmp_profile_dir ,self.manager_params.storage_watchdog_enable) + self.manager_params.storage_watchdog_obj = storage_watchdog storage_watchdog_thread = threading.Thread(target=storage_watchdog.run, args=()) storage_watchdog_thread.daemon = True - thread.name = "OpenWPM-storage-watchdog" + storage_watchdog_thread.name = "OpenWPM-storage-watchdog" storage_watchdog_thread.start() # Save crawl config information to database diff --git a/openwpm/utilities/storage_watchdog.py b/openwpm/utilities/storage_watchdog.py index 3351cdb5b..cfaa0f1d0 100644 --- a/openwpm/utilities/storage_watchdog.py +++ b/openwpm/utilities/storage_watchdog.py @@ -1,4 +1,5 @@ import time +from threading import Thread import logging, math, time import subprocess, os from watchdog.observers import Observer @@ -6,18 +7,30 @@ # Nifty little function to prettyfi the output. Takes in a number of bytes and spits out the # corresponding size in the largest unit it is able to convert to. + def convert_size(size_bytes): if size_bytes == 0: return "0B" size_name = ("B", "KB", "MB", "GB", "TB", "PB", "EB", "ZB", "YB") - i = int(math.floor(math.log(size_bytes, 1024))) - p = math.pow(1024, i) - s = round(size_bytes / p, 2) + i: int = int(math.floor(math.log(size_bytes, 1024))) + p: float = math.pow(1024, i) + s: float = round(size_bytes / p, 2) return "%s %s" % (s, size_name[i]) def total_folder_size(startup=False, debug=False, root_dir="/tmp"): + """_summary_ + + Args: + startup (bool, optional): Runs the function on the total supplied folder. Defaults to False. + debug (bool, optional): Useful for debugging functionality locally. Defaults to False. + root_dir (str, optional): The root directory to check. Defaults to "/tmp". + + Returns: + _type_: _description_ + """ - running_total = 0 + + running_total: int = 0 if not startup: for file in os.listdir(root_dir): if "firefox" in file or ".xpi" in file or "owpm" in file or "Temp" in file: @@ -32,7 +45,7 @@ def total_folder_size(startup=False, debug=False, root_dir="/tmp"): return (f"Currently using: {convert_size(running_total)} of storage on disk...") else: for file in os.listdir(root_dir): - path = os.path.join(root_dir,file) + path: str = os.path.join(root_dir,file) try: running_total += int(subprocess.check_output(['du','-s', '-b', path], stderr=subprocess.DEVNULL).split()[0].decode('utf-8')) except: @@ -45,10 +58,17 @@ def total_folder_size(startup=False, debug=False, root_dir="/tmp"): class StorageWatchdog(): # DIRECTORY_TO_WATCH = "/mnt/04dc803b-5e97-4b16-bdaf-80845c61942d" - def __init__(self): + def __init__(self, supplied_dir=None, dirsize=0): + self.MAX_DIRSIZE = dirsize self.observer = Observer() + self.dir_to_watch = supplied_dir def run(self) -> None: + # Checks if the default dirsize and directory to watch were configured. If they are still the default, it exits because + # it would essentially work identically to setting the "reset" flag in the command sequence + if self.MAX_DIRSIZE == 0 or self.dir_to_watch is None: + return + logger = logging.getLogger("openwpm") logger.info("Starting the StorageWatchdog...") logger.info(total_folder_size(startup=True)) @@ -62,36 +82,60 @@ def run(self) -> None: print("Error") self.observer.join() + + def periodic_check(self, profile_path, obj): + logger = logging.getLogger("openwpm") + # 1073741824: # 1GB + # 20971520: # 20MB - for testing purposes + # 52428800: # 50MB + # 73400320: # 70MB + # 104857600: 100MB - IDEAL for 10+ browsers + + # Max Size before a restart expressed in bytes + if self.MAX_DIRSIZE == 0: + pass + + READABLE_MAX_DIRSIZE = convert_size(self.MAX_DIRSIZE) + + dirsize = int(subprocess.check_output(['du','-s', '-b', profile_path]).split()[0].decode('utf-8')) + readable_dirsize = convert_size(dirsize) + if dirsize < self.MAX_DIRSIZE: + + logger.info(f"Current browser profile directory {profile_path} size is less than {READABLE_MAX_DIRSIZE}: {readable_dirsize}") + return False + else: + obj.restart_required = True + logger.info(f"{profile_path}: Folder scheduled to be deleted and recover {readable_dirsize} of storage.") + return True -def start_storage_watchdog(): - w = StorageWatchdog() - w.run() -def periodic_check(profile_path, obj): - logger = logging.getLogger("openwpm") - # 1073741824: # 1GB - # 20971520: # 20MB - for testing purposes - # 52428800: # 50MB - # 73400320: # 70MB - # 104857600: 100MB - IDEAL for 10+ browsers - - # Max Size before a restart expressed in bytes - MAX_DIRSIZE = 104857600 - READABLE_MAX_DIRSIZE = convert_size(MAX_DIRSIZE) - - dirsize = int(subprocess.check_output(['du','-s', '-b', profile_path]).split()[0].decode('utf-8')) - readable_dirsize = convert_size(dirsize) +class StorageWatchdogThread(Thread): + """ + This is a custom implementation of the Thread subclass from the threading module + that allows for collection of the return value. This was necessary to prevent the main + StorageWatchdog thread from being hemmed up running each browser profile check + in its main thread and instead, spawning separate instances and blocking each browser thread until + the check is complete, ensuring asynchio doesnt get upset. + """ + + def __init__(self, watchdog: StorageWatchdog, argList: list[str]): + """_summary_ - if dirsize < MAX_DIRSIZE: # 100MB + Args: + watchdog (StorageWatchdog): The main StorageWatchdog Object, running the main thread + argList (list[str]): + argList[0]: The profile_dir + argList[1]: The BrowserManager instance + """ + Thread.__init__(self) + self.ret_value = None + self.watchdog = watchdog + self.argList = argList - logger.info(f"Current browser profile directory {profile_path} size is less than {READABLE_MAX_DIRSIZE}: {readable_dirsize}") - return False - else: - obj.restart_required = True - logger.info(f"{profile_path}: Folder scheduled to be deleted and recovered {readable_dirsize} of storage.") - return True + def run(self): + self.ret_value = self.watchdog.periodic_check(self.argList[0], self.argList[1]) if __name__ == '__main__': - + print("---Testing the StorageWatchdog folder size function---") total_folder_size(startup=True, debug=True) diff --git a/scripts/environment-unpinned.yaml b/scripts/environment-unpinned.yaml index 25934b66a..79deadfcf 100644 --- a/scripts/environment-unpinned.yaml +++ b/scripts/environment-unpinned.yaml @@ -33,3 +33,4 @@ dependencies: - domain-utils - dataclasses-json - tranco + - watchdog