Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Release #1057

Merged
merged 12 commits into from
Oct 21, 2023
Merged

Release #1057

Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
6 changes: 6 additions & 0 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
@@ -1,5 +1,11 @@
# Changelog

## v0.25.0 - 2023-10-13

Bump to Firefox 118.0.2
Introduce StorageWatchdog #1056 (Thanks @gridl0ck for contributing this)
Upgrade Docker image to Ubuntu 22.04 #1055

## v0.24.0 - 2023-09-05

Bump to Firefox 117
Expand Down
931 changes: 577 additions & 354 deletions Extension/package-lock.json

Large diffs are not rendered by default.

22 changes: 11 additions & 11 deletions Extension/package.json
Original file line number Diff line number Diff line change
Expand Up @@ -9,34 +9,34 @@
"start": "Start is required for the manual_test.py to run"
},
"devDependencies": {
"@babel/cli": "^7.22.15",
"@babel/core": "^7.22.15",
"@babel/cli": "^7.23.0",
"@babel/core": "^7.23.0",
"@babel/eslint-parser": "^7.22.15",
"@babel/preset-env": "^7.22.15",
"@babel/preset-env": "^7.23.0",
"@types/download": "^8.0.2",
"@types/firefox-webext-browser": "^111.0.1",
"@typescript-eslint/eslint-plugin": "^6.6.0",
"@typescript-eslint/parser": "^6.6.0",
"@types/firefox-webext-browser": "^111.0.2",
"@typescript-eslint/eslint-plugin": "^6.7.5",
"@typescript-eslint/parser": "^6.7.5",
"ajv": "^8.12.0",
"body-parser": "^1.20.2",
"download": "^8.0.0",
"eslint": "^8.48.0",
"eslint": "^8.51.0",
"eslint-config-prettier": "^8.0.0",
"eslint-plugin-html": "^7.1.0",
"eslint-plugin-import": "^2.28.1",
"eslint-plugin-jsdoc": "^46.5.1",
"eslint-plugin-jsdoc": "^46.8.2",
"eslint-plugin-json": "^3.1.0",
"eslint-plugin-mozilla": "^3.1.0",
"eslint-plugin-mozilla": "^3.2.0",
"eslint-plugin-no-unsanitized": "^4.0.2",
"eslint-plugin-prettier": "^5.0.0",
"eslint-plugin-unicorn": "^48.0.1",
"express": "^4.18.2",
"prettier": "^3.0.3",
"safe-compare": "^1.1.4",
"ts-loader": "^9.4.4",
"ts-loader": "^9.5.0",
"typedoc": "^0.25.1",
"typescript": "^5.2.2",
"web-ext": "^7.6.2",
"web-ext": "^7.8.0",
"webpack": "^5.88.2",
"webpack-cli": "^5.1.4"
},
Expand Down
2 changes: 1 addition & 1 deletion Extension/src/loggingdb.ts
Original file line number Diff line number Diff line change
Expand Up @@ -76,7 +76,7 @@ export const open = async function (
);
console.log("StorageController started?", rv);
}

storageController.send(JSON.stringify(`Browser-${crawlID}`));
// Listen for incoming urls as visit ids
listeningSocket = new socket.ListeningSocket(listeningSocketCallback);
console.log("Starting socket listening for incoming connections.");
Expand Down
2 changes: 1 addition & 1 deletion VERSION
Original file line number Diff line number Diff line change
@@ -1 +1 @@
0.24.0
0.25.0
32 changes: 16 additions & 16 deletions environment.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -9,42 +9,42 @@ dependencies:
- dill=0.3.7
- dill=0.3.7
- easyprocess=1.1
- gcsfs=2023.9.1
- gcsfs=2023.9.2
- geckodriver=0.33.0
- ipython=8.15.0
- ipython=8.16.1
- isort=5.12.0
- leveldb=1.23
- multiprocess=0.70.15
- mypy=1.5.1
- nodejs=20.7.0
- pandas=2.1.0
- mypy=1.6.0
- nodejs=20.8.0
- pandas=2.1.1
- pillow=10.0.1
- pip=23.2.1
- plyvel=1.5.0
- pre-commit=3.4.0
- psutil=5.9.5
- pyarrow=13.0.0
- pytest-asyncio=0.21.1
- pytest-cov=4.1.0
- pytest=7.4.2
- python=3.11.5
- pyvirtualdisplay=3.0
- python=3.12.0
- pyvirtualdisplay=2.2
- recommonmark=0.7.1
- redis-py=5.0.0
- s3fs=2023.9.1
- selenium=4.12.0
- sentry-sdk=1.31.0
- redis-py=5.0.1
- s3fs=2023.9.2
- selenium=4.13.0
- sentry-sdk=1.32.0
- sphinx-markdown-tables=0.0.17
- sphinx=7.2.6
- tabulate=0.9.0
- tblib=2.0.0
- wget=1.20.3
- pip:
- dataclasses-json==0.6.0
- dataclasses-json==0.6.1
- domain-utils==0.7.1
- jsonschema==4.19.0
- plyvel==1.5.0
- jsonschema==4.19.1
- tranco==0.6
- types-pyyaml==6.0.12.11
- types-redis==4.6.0.6
- types-pyyaml==6.0.12.12
- types-redis==4.6.0.7
- types-tabulate==0.9.0.3
name: openwpm
22 changes: 13 additions & 9 deletions openwpm/browser_manager.py
Original file line number Diff line number Diff line change
Expand Up @@ -549,15 +549,17 @@ def kill_browser_manager(self):
if self.browser_manager is not None and self.browser_manager.pid is not None:
self.logger.debug(
"BROWSER %i: Attempting to kill BrowserManager with pid %i. "
"Browser PID: %s"
% (self.browser_id, self.browser_manager.pid, self.geckodriver_pid)
"Browser PID: %s",
self.browser_id,
self.browser_manager.pid,
self.geckodriver_pid,
)
try:
os.kill(self.browser_manager.pid, signal.SIGKILL)
except OSError:
self.logger.debug(
"BROWSER %i: Browser manager process does "
"not exist" % self.browser_id
"BROWSER %i: Browser manager process does not exist",
self.browser_id,
)
pass

Expand All @@ -566,13 +568,14 @@ def kill_browser_manager(self):
os.kill(self.display_pid, signal.SIGKILL)
except OSError:
self.logger.debug(
"BROWSER %i: Display process does not exit" % self.browser_id
"BROWSER %i: Display process does not exit", self.browser_id
)
pass
except TypeError:
self.logger.error(
"BROWSER %i: PID may not be the correct "
"type %s" % (self.browser_id, str(self.display_pid))
"BROWSER %i: PID may not be the correct " "type %s",
self.browser_id,
str(self.display_pid),
)
if self.display_port is not None: # xvfb display lock
# lockfile = "/tmp/.X%s-lock" % self.display_port
Expand All @@ -584,8 +587,9 @@ def kill_browser_manager(self):
os.remove(lockfile)
except OSError:
self.logger.debug(
"BROWSER %i: Screen lockfile (%s) already "
"removed" % (self.browser_id, lockfile)
"BROWSER %i: Screen lockfile (%s) already removed",
self.browser_id,
lockfile,
)
pass

Expand Down
2 changes: 1 addition & 1 deletion openwpm/deploy_browsers/deploy_firefox.py
Original file line number Diff line number Diff line change
Expand Up @@ -78,7 +78,7 @@ def deploy_firefox(
display_port = None
display = None
if display_mode == "headless":
fo.headless = True
fo.add_argument("--headless")
fo.add_argument("--width={}".format(DEFAULT_SCREEN_RES[0]))
fo.add_argument("--height={}".format(DEFAULT_SCREEN_RES[1]))
if display_mode == "xvfb":
Expand Down
31 changes: 23 additions & 8 deletions openwpm/storage/storage_controller.py
Original file line number Diff line number Diff line change
Expand Up @@ -72,11 +72,11 @@ def __init__(
self._shutdown_flag = False
self._relaxed = False
self.logger = logging.getLogger("openwpm")
self.store_record_tasks: DefaultDict[VisitId, List[Task[None]]] = defaultdict(
self.store_record_tasks: DefaultDict[VisitId, list[Task[None]]] = defaultdict(
list
)
"""Contains all store_record tasks for a given visit_id"""
self.finalize_tasks: List[Tuple[VisitId, Optional[Task[None]], bool]] = []
self.finalize_tasks: list[tuple[VisitId, Optional[Task[None]], bool]] = []
"""Contains all information required for update_completion_queue to work
Tuple structure is: VisitId, optional completion token, success
"""
Expand All @@ -97,18 +97,21 @@ async def _handler(
self.logger.error(
"An exception occurred while processing records", exc_info=e
)
writer.close()
await writer.wait_closed()

async def handler(
self, reader: asyncio.StreamReader, _: asyncio.StreamWriter
) -> None:
"""Created for every new connection to the Server"""
self.logger.debug("Initializing new handler")
client_name = await get_message_from_reader(reader)
self.logger.info(f"Initializing new handler for {client_name}")
while True:
try:
record: Tuple[str, Any] = await get_message_from_reader(reader)
except IncompleteReadError:
self.logger.info(
"Terminating handler, because the underlying socket closed"
f"Terminating handler for {client_name}, because the underlying socket closed"
)
break
if len(record) != 2:
Expand Down Expand Up @@ -248,6 +251,7 @@ async def update_status_queue(self) -> NoReturn:
)

async def shutdown(self, completion_queue_task: Task[None]) -> None:
self.logger.info("Entering self.shutdown")
completion_tokens = {}
visit_ids = list(self.store_record_tasks.keys())
for visit_id in visit_ids:
Expand All @@ -261,6 +265,7 @@ async def shutdown(self, completion_queue_task: Task[None]) -> None:
self.completion_queue.put((visit_id, False))

await self.structured_storage.shutdown()
self.logger.info("structured_storage is shut down")

if self.unstructured_storage is not None:
await self.unstructured_storage.flush_cache()
Expand Down Expand Up @@ -342,13 +347,21 @@ async def _run(self) -> None:
update_completion_queue = asyncio.create_task(
self.update_completion_queue(), name="CompletionQueueFeeder"
)
# Blocks until we should shutdown
# Blocks until we should shut down
await self.should_shutdown()

self.logger.info(f"Closing Server")
server.close()
self.logger.info("Closed Server")
self.logger.info("Cancelling status_queue_update")
status_queue_update.cancel()
self.logger.info("Cancelled status_queue_update")
self.logger.info("Cancelling timeout_check")
timeout_check.cancel()
self.logger.info("Cancelled timeout_check")
self.logger.info("Starting wait_closed")
await server.wait_closed()
self.logger.info("Completed wait_closed")

await self.shutdown(update_completion_queue)

def run(self) -> None:
Expand All @@ -359,10 +372,11 @@ def run(self) -> None:
class DataSocket:
"""Wrapper around ClientSocket to make sending records to the StorageController more convenient"""

def __init__(self, listener_address: Tuple[str, int]) -> None:
def __init__(self, listener_address: Tuple[str, int], client_name: str) -> None:
self.socket = ClientSocket(serialization="dill")
self.socket.connect(*listener_address)
self.logger = logging.getLogger("openwpm")
self.socket.send(client_name)

def store_record(
self, table_name: TableName, visit_id: VisitId, data: Dict[str, Any]
Expand Down Expand Up @@ -443,7 +457,7 @@ def save_configuration(
browser_version: str,
) -> None:
assert self.listener_address is not None
sock = DataSocket(self.listener_address)
sock = DataSocket(self.listener_address, "StorageControllerHandle")
task_id = random.getrandbits(32)
sock.store_record(
TableName("task"),
Expand All @@ -467,6 +481,7 @@ def save_configuration(
},
)
sock.finalize_visit_id(INVALID_VISIT_ID, success=True)
sock.close()

def launch(self) -> None:
"""Starts the storage controller"""
Expand Down
4 changes: 3 additions & 1 deletion openwpm/task_manager.py
Original file line number Diff line number Diff line change
Expand Up @@ -291,7 +291,9 @@ def _launch_storage_controller(
)
assert self.manager_params.storage_controller_address is not None
# open connection to storage controller for saving crawl details
self.sock = DataSocket(self.manager_params.storage_controller_address)
self.sock = DataSocket(
self.manager_params.storage_controller_address, "TaskManager"
)

def _shutdown_manager(
self, during_init: bool = False, relaxed: bool = True
Expand Down
Loading
Loading