From b21b9145d0200696bded496d7955185da511c4c7 Mon Sep 17 00:00:00 2001 From: Stefan Zabka Date: Tue, 19 Sep 2023 23:55:07 +0200 Subject: [PATCH] Docs fixing (#1049) * docs(papers): adding papers list * docs(results): reference WebTAP data * docs(results): add footnotes * docs(README): redirect to correct link --- Extension/.prettierignore | 1 + Extension/package.json | 2 +- README.md | 2 +- docs/Papers.rst | 259 +++++++++++++++++++++++++++ docs/index.rst | 2 + openwpm/command_sequence.py | 28 +-- openwpm/commands/browser_commands.py | 16 +- openwpm/commands/types.py | 24 ++- openwpm/config.py | 6 +- openwpm/socket_interface.py | 13 +- openwpm/storage/arrow_storage.py | 6 +- 11 files changed, 310 insertions(+), 49 deletions(-) create mode 100644 docs/Papers.rst diff --git a/Extension/.prettierignore b/Extension/.prettierignore index bef68808a..9f43aeec8 100644 --- a/Extension/.prettierignore +++ b/Extension/.prettierignore @@ -9,6 +9,7 @@ coverage *.log yarn.lock +package-lock.json # built extension artifacts dist diff --git a/Extension/package.json b/Extension/package.json index c021cbd76..b9a59ac13 100644 --- a/Extension/package.json +++ b/Extension/package.json @@ -82,4 +82,4 @@ "singleQuote": false, "trailingComma": "all" } -} \ No newline at end of file +} diff --git a/README.md b/README.md index 37467bdf7..e85f558be 100644 --- a/README.md +++ b/README.md @@ -162,7 +162,7 @@ Further information is available at [OPENWPM's Documentation Page](https://openw ## Advice for Measurement Researchers -OpenWPM is [often used](https://webtap.princeton.edu/software/) for web +OpenWPM is [often used](https://openwpm.readthedocs.io/Papers.html) for web measurement research. 
We recommend the following for researchers using the tool: **Use a versioned [release](https://github.com/openwpm/OpenWPM/releases).** We diff --git a/docs/Papers.rst b/docs/Papers.rst new file mode 100644 index 000000000..264038f21 --- /dev/null +++ b/docs/Papers.rst @@ -0,0 +1,259 @@ +Studies using OpenWPM +====================== + +Data collected by WebTAP +------------------------ + +Since 2015, WebTAP has conducted a web census to study third-party online tracking. +Each month between 2015 and 2018, they visited the web’s 1 million most popular sites using +OpenWPM and recorded data pertaining +to user privacy, including cookies, fingerprinting scripts, the effect of browser privacy tools, +and the exchange of tracking data between different sites (“cookie syncing”). + +WebTAP has `released `_ +the entire Princeton Web Census data — about 15 terabytes — containing +privacy measurements of 1 million sites conducted each month from December 2015 to June 2018. + +List of Studies that have used OpenWPM +--------------------------------------- + +.. 
list-table:: + :widths: 5 25 70 + :header-rows: 1 + + * - Year + - Venue + - Study Name + * - 2014 + - ACM CCS + - `The Web Never Forgets: Persistent Tracking Mechanisms in the Wild `_ + * - 2014 + - ACM CoSN + - `Cognitive disconnect: Understanding Facebook Connect login permissions `_ + * - 2015 + - WWW + - `Cookies that give you away: The surveillance implications of web tracking `_ + * - 2015 + - NDSS + - `Upgrading HTTPS in midair: HSTS and key pinning in practice `_ + * - 2015 + - Tech Science + - `Web privacy census `_ + * - 2015 + - W2SP + - `Variations in tracking in relation to geographic location `_ + * - 2016 + - IFIP AICT + - `Evaluating Websites and Their Adherence to Data Protection Principles `_ + * - 2016 + - ACM CCS + - `Online Tracking: A 1-million-site Measurement and Analysis `_ + * - 2016 + - WWW + - `No honor among thieves: A large-scale analysis of malicious web shells `_ + * - 2017 + - NDSS + - `Dial One for Scam: A Large-Scale Analysis of Technical Support Scams `_ + * - 2017 + - PETS + - `Cross-Device Tracking: Measurement and Disclosures `_ + * - 2017 + - CODASPY + - `Identifying HTTPS-Protected Netflix Videos in Real-Time `_ + * - 2017 + - WWW + - `De-anonymizing Web Browsing Data with Social Networks `_ [#f1]_ + * - 2017 + - IWPE + - `Battery Status Not Included: Assessing Privacy in Web Standards `_ + * - 2017 + - Annual Privacy Forum + - `PrivacyScore: Improving Privacy and Security via Crowd-Sourced Benchmarks of Websites `_ + * - 2017 + - arXiv + - `Horcrux: A Password Manager for Paranoids `_ + * - 2017 + - USENIX Security + - `Measuring the Insecurity of Mobile Deep Links of Android `_ + * - 2017 + - Applied Economics Letters + - `Online advertising networks and consumer perceptions of privacy `_ + * - 2018 + - PETS + - `When the cookie meets the blockchain: Privacy risks of web payments via cryptocurrencies `_ + * - 2018 + - PETS + - `I never signed up for this! 
Privacy implications of email tracking `_ + * - 2018 + - ACM TOIT + - `Measuring third party tracker power across web and mobile `_ + * - 2018 + - CALIcon + - `Third Party Trackers on Law School Library Websites `_ + * - 2018 + - Master Thesis, Delft University of Technology + - `Tracking Cookies in the European Union, an Empirical Analysis of the Current Situation `_ + * - 2018 + - ACM CCS + - `The Web’s Sixth Sense: A Study of Scripts Accessing Smartphone Sensors `_ + * - 2018 + - ACSAC + - `Raising the Bar: Evaluating Origin-wide Security Manifests `_ + * - 2018 + - arXiv + - `The Unwanted Sharing Economy: An Analysis of Cookie Syncing and User Transparency under GDPR `_ + * - 2018 + - PhD thesis, Princeton University + - `Automated discovery of privacy violations on the web `_ + * - 2018 + - AINTEC’18 + - `Understanding abusive web resources: characteristics and counter-measures of malicious web resources and cryptocurrency mining `_ + * - 2018 + - ACSAC + - `Raising the Bar: Evaluating Origin-wide Security Manifests `_ + * - 2018 + - SSRN + - `Acquisitions in the Third Party Tracking Industry: Competition and Data Protection Aspects `_ + * - 2019 + - Communications in Computer and Information Science + - `Transparency in Keyword Faceted Search: An Investigation on Google Shopping `_ + * - 2019 + - arXiv + - `The Price of Free Illegal Live Streaming Services `_ + * - 2019 + - Advances in Intelligent Systems and Computing + - `Usage of HTTPS by Municipal Websites in Portugal `_ + * - 2019 + - ConPro + - `The Impact of User Location on Cookie Notices (Inside and Outside of the European Union) `_ + * - 2019 + - WWW + - `Before and After GDPR: The Changes in Third Party Presence at Public and Private European Websites `_ + * - 2019 + - IEEE EuroS&P + - `TraffickStop: Detecting and Measuring Illicit Traffic Monetization Through Large-Scale DNS Analysis `_ + * - 2019 + - SSRN + - `The Market for Data Privacy `_ + * - 2019 + - ACM CSCW + - `Dark Patterns at Scale: 
Findings from a Crawl of 11K Shopping Websites `_ + * - 2019 + - Computer Communications + - `A comparison of web privacy protection techniques `_ + * - 2019 + - DPM + - `On Privacy Risks of Public WiFi Captive Portals `_ + * - 2019 + - Computers & Security + - `Towards a global perspective on web tracking `_ + * - 2019 + - APF + - `Towards Transparency in Email Tracking `_ + * - 2019 + - RAID + - `Talon: An Automated Framework for Cross-Device Tracking Detection `_ + * - 2019 + - ACM CCS + - `Watching You Watch: The Tracking Ecosystem of Over-the-Top TV Streaming Devices `_ + * - 2019 + - ACM IMC + - `Tales from the Porn: A Comprehensive Privacy Analysis of the Web Porn Ecosystem `_ + * - 2019 + - IEEE EuroS&P + - `TraffickStop: Detecting and Measuring Illicit Traffic Monetization Through Large-scale DNS Analysis `_ + * - 2019 + - The New York Times + - `I Visited 47 Sites. Hundreds of Trackers Followed Me. `_ + * - 2019 + - The Washington Post + - `Think you’re anonymous online? A third of popular websites are ‘fingerprinting’ you. `_ + * - 2019 + - ESORICS + - `Fingerprint surface-based detection of web bot detectors `_ + * - 2019 + - DPM + - `A Study on Subject Data Access in Online Advertising after the GDPR `_ + * - 2019 + - IEEE SPW + - `After GDPR, Still Tracking or Not? 
Understanding Opt-Out States for Online Behavioral Advertising `_ + * - 2020 + - PETS + - `Missed by Filter Lists: Detecting Unknown Third-Party Trackers with Invisible Pixels `_ + * - 2020 + - PETS + - `Inferring Tracker-Advertiser Relationships in the Online Advertising Ecosystem using Header Bidding `_ + * - 2020 + - PETS + - `A Comparative Measurement Study of Web Tracking on Mobile and Desktop Environments `_ + * - 2020 + - PETS + - `No boundaries: data exfiltration by third parties embedded on web pages `_ + * - 2020 + - PETS + - `In-Depth Evaluation of Redirect Tracking and Link Usage `_ + * - 2020 + - The Web Conference + - `The Representativeness of Automated Web Crawls as a Surrogate for Human Browsing `_ [#f2]_ + * - 2020 + - The Web Conference + - `Apophanies or Epiphanies? How Crawlers Impact Our Understanding of the Web `_ + * - 2020 + - The Web Conference + - `Stop Tracking me Bro! Differential Tracking of User Demographics on Hyper-partisan Websites `_ [#f2]_ + * - 2020 + - The Web Conference + - `Beyond the Front Page: Measuring Third Party Dynamics in the Field `_ + * - 2020 + - ACM ASIACCS + - `Measuring the Impact of the GDPR on Data Sharing in Ad Networks `_ + * - 2020 + - arXiv + - `Actions speak louder than words: Semi-supervised learning for browser fingerprinting detection `_ + * - 2020 + - PAM + - `Extortion or Expansion? 
An investigation into the costs and consequences of ICANN’s gTLD experiments `_ + * - 2020 + - Bachelor Thesis, Radboud University + - `Design and implementation of a stealthy OpenWPM web scraper `_ + * - 2020 + - IWPE + - `On Compliance of Cookie Purposes with the Purpose Specification Principle `_ + * - 2020 + - FTC PrivacyCon + - `Unaccounted Privacy Violation: A Comparative Analysis of Persistent Identification of Users Across Social Contexts `_ + * - 2020 + - IEEE EuroS&P + - `Multi-country Study of Third Party Trackers from Real Browser Histories `_ + * - 2020 + - TMA + - `Characterizing CNAME Cloaking-Based Tracking on the Web `_ + * - 2020 + - TMA + - `Clash of the Trackers: Measuring the Evolution of the Online Tracking Ecosystem `_ + * - 2020 + - WEIS + - `The Impact of the GDPR on Content Providers `_ + * - 2020 + - PhD Thesis, University of Michigan + - `Enhancing System Transparency, Trust, and Privacy with Internet Measurement `_ + * - 2020 + - Masters Thesis, Concordia University + - `A Large-Scale Evaluation of Privacy Practices of Public WiFi Captive Portals `_ + * - 2020 + - IEEE Globecom + - `A machine learning approach for detecting CNAME cloaking-based tracking on the Web `_ + * - 2021 + - NDSS + - `Reining in the Web’s Inconsistencies with Site Policy `_ + * - 2021 + - PETS + - `Unveiling Web Fingerprinting in the Wild Via Code Mining and Machine Learning `_ + * - 2021 + - IEEE S&P + - `Fingerprinting the Fingerprinters: Learning to Detect Browser Fingerprinting Behaviors `_ + +.. rubric:: Footnotes + +.. [#f1] Uses data released by us. +.. [#f2] Studies OpenWPM’s behavior. diff --git a/docs/index.rst b/docs/index.rst index 258554c0e..2a737f960 100644 --- a/docs/index.rst +++ b/docs/index.rst @@ -23,6 +23,8 @@ We're hoping to improve this setup in the future. Configuration + Papers + .. 
toctree:: :maxdepth: 4 :caption: Developer documentation diff --git a/openwpm/command_sequence.py b/openwpm/command_sequence.py index 2c9564a80..1c54aa790 100644 --- a/openwpm/command_sequence.py +++ b/openwpm/command_sequence.py @@ -156,19 +156,23 @@ def recursive_dump_page_source(self, suffix="", timeout=30): stored in `manager_params.source_dump_path` and is keyed by the current `visit_id` and top-level url. The source dump is a gzipped json file with the following structure: - { - 'document_url': "http://example.com", - 'source': " ... ", - 'iframes': { - 'frame_1': {'document_url': ..., - 'source': ..., - 'iframes: { ... }}, - 'frame_2': {'document_url': ..., - 'source': ..., - 'iframes: { ... }}, - 'frame_3': { ... } + + .. code-block:: JSON + :linenos: + + { + "document_url": "http://example.com", + "source": " ... ", + "iframes": { + "frame_1": {"document_url": "...", + "source": "...", + "iframes": "{ ... }"}, + "frame_2": {"document_url": "...", + "source": "...", + "iframes": "{ ... }"}, + "frame_3": "{ ... 
}" + } } - } """ self.total_timeout += timeout if not self.contains_get_or_browse: diff --git a/openwpm/commands/browser_commands.py b/openwpm/commands/browser_commands.py index a1dcebf57..45aee69e0 100644 --- a/openwpm/commands/browser_commands.py +++ b/openwpm/commands/browser_commands.py @@ -40,8 +40,7 @@ def bot_mitigation(webdriver): - """performs three optional commands for bot-detection - mitigation when getting a site""" + """Performs three optional commands for bot-detection mitigation when getting a site""" # bot mitigation 1: move the randomly around a number of times window_size = webdriver.get_window_size() @@ -86,9 +85,7 @@ def close_other_windows(webdriver): def tab_restart_browser(webdriver): - """ - kills the current tab and creates a new one to stop traffic - """ + """kills the current tab and creates a new one to stop traffic""" # note: this technically uses windows, not tabs, due to problems with # chrome-targeted keyboard commands in Selenium 3 (intermittent # nonsense WebDriverExceptions are thrown). 
windows can be reliably @@ -114,9 +111,7 @@ def tab_restart_browser(webdriver): class GetCommand(BaseCommand): - """ - goes to <url> using the given <webdriver> instance - """ + """goes to <url> using the given <webdriver> instance""" def __init__(self, url, sleep): self.url = url @@ -467,6 +462,7 @@ def collect_source(webdriver, frame_stack, rv={}): class FinalizeCommand(BaseCommand): """This command is automatically appended to the end of a CommandSequence + It's apperance means there won't be any more commands for this visit_id """ @@ -494,8 +490,8 @@ def execute( class InitializeCommand(BaseCommand): - """The command is automatically prepended to the beginning of a - CommandSequence + """The command is automatically prepended to the beginning of a CommandSequence + It initializes state both in the extensions as well in as the StorageController """ diff --git a/openwpm/commands/types.py b/openwpm/commands/types.py index 2b4fcf307..878e83118 100644 --- a/openwpm/commands/types.py +++ b/openwpm/commands/types.py @@ -30,22 +30,18 @@ def execute( manager_params: ManagerParamsInternal, extension_socket: ClientSocket, ) -> None: - """ - This method gets called in the Browser process - :parameter webdriver: - WebDriver is a Selenium class used to control - browser. - You can simulate arbitrary interactions and extract almost all browser state - with the tools that Selenium gives you - :parameter browser_params: - Contains the per browser configuration + """This method gets called in the Browser process + + :parameter webdriver: WebDriver is a Selenium class used to control + browser. You can simulate arbitrary interactions and extract almost + all browser state with the tools that Selenium gives you + :parameter browser_params: Contains the per browser configuration E.g. which instruments are enabled - :parameter manager_params: - Per crawl parameters - E.g. 
where to store files :parameter extension_socket: Communication channel to the storage provider - TODO: Further document this once the StorageProvider PR has landed - This allows you to send data to be persisted to storage. + + TODO: Further document this once the StorageProvider PR has landed + This allows you to send data to be persisted to storage. """ pass diff --git a/openwpm/config.py b/openwpm/config.py index cc0773df6..5c0b811f6 100644 --- a/openwpm/config.py +++ b/openwpm/config.py @@ -133,15 +133,15 @@ class ManagerParams(DataClassJsonMixin): """A watchdog that tries to ensure that no Firefox instance takes up too much memory. It is mostly useful for long running cloud crawls""" process_watchdog: bool = False - """- It is used to create another thread that kills off `GeckoDriver` (or `Xvfb`) instances that haven't been spawned by OpenWPM. (GeckoDriver is used by + """It is used to create another thread that kills off `GeckoDriver` (or `Xvfb`) instances that haven't been spawned by OpenWPM. (GeckoDriver is used by Selenium to control Firefox and Xvfb a "virtual display" so we simulate having graphics when running on a server).""" num_browsers: int = 1 _failure_limit: Optional[int] = None - """- The number of command failures the platform will tolerate before raising a + """The number of command failures the platform will tolerate before raising a `CommandExecutionError` exception. Otherwise the default is set to 2 x the number of browsers plus 10. The failure counter is reset at the end of each successfully completed command sequence. 
- - For non-blocking command sequences that cause the number of failures to + For non-blocking command sequences that cause the number of failures to exceed `failure_limit` the `CommandExecutionError` is raised when attempting to execute the next command sequence.""" diff --git a/openwpm/socket_interface.py b/openwpm/socket_interface.py index 47fcf15fc..a4f8941ca 100644 --- a/openwpm/socket_interface.py +++ b/openwpm/socket_interface.py @@ -164,18 +164,19 @@ def close(self): async def get_message_from_reader(reader: asyncio.StreamReader) -> Any: - """ - Reads a message from the StreamReader - :exception IncompleteReadError if the underlying socket is closed - + """Reads a message from the StreamReader To safely use this method, you should guard against the exception like this: - ``` + + .. code-block:: Python + try: record: Tuple[str, Any] = await get_message_from_reader(reader) except IncompleteReadError as e: print("The underlying socket closed", repr(e)) - ``` + + :raises: + IncompleteReadError: If the underlying socket is closed """ msg = await reader.readexactly(5) msglen, serialization = struct.unpack(">Lc", msg) diff --git a/openwpm/storage/arrow_storage.py b/openwpm/storage/arrow_storage.py index f474f84e9..a0e436683 100644 --- a/openwpm/storage/arrow_storage.py +++ b/openwpm/storage/arrow_storage.py @@ -100,11 +100,12 @@ async def finalize_visit_id( """This method is the reason the finalize_visit_id interface returns a task. This was necessary as we needed to enable the following pattern. - ``` + .. code-block:: Python + token = await structured_storage.finalize_visit_id(1) structured_storage.flush_cache() await token - ``` + If there was no task returned and the method would just block/yield after turning the record into a batch, there would be no way to know, when it's safe to flush_cache as I couldn't find a way to run a coroutine until it yields and then run a different one. 
@@ -135,6 +136,7 @@ async def wait_on_condition(e: asyncio.Event) -> None: @abstractmethod async def write_table(self, table_name: TableName, table: Table) -> None: """Write out the table to persistent storage + This should only return once it's actually saved out """