Merge pull request #26 from EvilDrPurple/tag-collector-fixes
Overhaul tag collector
mbodeantor authored Dec 11, 2023
2 parents de9de0a + f0696fb commit 4ac625c
Showing 3 changed files with 160 additions and 80 deletions.
9 changes: 7 additions & 2 deletions html_tag_collector/README.md
@@ -6,12 +6,17 @@ This script adds HTML properties to a JSON file of existing URLs.
1. If running from the command line, pass the name of the file you want to run as an argument and make sure the file is in the same directory. It should be populated with URLs and properties as in the example provided. If importing `collector_main` instead, it expects a polars dataframe as input (a usage sketch follows this list).
2. Optionally, create a virtual environment. This is especially useful if you don't already have `beautifulsoup4`, `requests`, and `polars` installed. In your terminal:

```
```commandline
python -m venv collector-environment
source collector-environment/bin/activate
```

3. Run `pip install beautifulsoup4` and `pip install requests` and `pip install polars`, if they are not installed already.
3. Now install the required python libraries:

```commandline
pip install -r requirements.txt
```

4. Run `python3 collector.py`.
5. If running from the command line, check the directory: you should now have a `urls_and_headers.csv` file. Invalid URLs are removed. Otherwise, the function returns a processed polars dataframe.

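For reference, a minimal sketch of the import-based usage described in step 1 (assuming the script is importable as `collector` and the dataframe has a `url` column, as in the example file):

```python
import polars as pl
from collector import collector_main  # module name assumed from collector.py

# Build a small dataframe of URLs; scheme-less entries are handled by the collector.
df = pl.DataFrame({"url": ["example.com", "https://example.org"]})

header_tags_df = collector_main(df)
header_tags_df.write_csv("urls_and_headers.csv")
```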
225 changes: 147 additions & 78 deletions html_tag_collector/collector.py
@@ -6,161 +6,231 @@
import traceback

import requests
from requests_html import AsyncHTMLSession
import asyncio
import pyppeteer
from tqdm import tqdm
from tqdm.asyncio import tqdm
import bs4
from bs4 import BeautifulSoup
import sys
import polars as pl

# Define the list of header tags we want to extract
header_tags = ['h1', 'h2', 'h3', 'h4', 'h5', 'h6']
DEBUG = False # Set to True to enable debug output

# Define a function to process a URL and update the JSON object
def process_urls(urls):
def process_urls(urls, render_javascript=False):
"""Process a list of urls and retrieve their HTML tags.
Args:
urls (list): List of urls.
render_javascript (bool): Whether to render each page's JavaScript before parsing. Defaults to False.
Returns:
polars.DataFrame: DataFrame of HTML tags for each url.
"""
results = []
new_urls = ["https://" + url[0] if not url[0].startswith("http") else url[0] for url in urls]

with ThreadPoolExecutor(max_workers=100) as executor:
print("Retrieving HTML tags...")
future_to_url = [executor.submit(get_response, url, i) for i, url in enumerate(new_urls)]

for future in tqdm(as_completed(future_to_url), total=len(future_to_url)):
data = future.result()
results.append(data)

loop = asyncio.get_event_loop()
loop.set_exception_handler(exception_handler)
future = asyncio.ensure_future(run_get_response(new_urls))
loop.run_until_complete(future)
results = future.result()

results.sort(key=lambda d: d["index"])
urls_and_responses = [{"url": urls[i], "response": result["response"]} for i, result in enumerate(results)]
urls_and_responses = [{"index": result["index"], "url": urls[i], "response": result["response"]} for i, result in enumerate(results)]

if render_javascript:
future = asyncio.ensure_future(render_js(urls_and_responses))
loop.run_until_complete(future)
results = future.result()

urls_and_headers = []
tags = None
parsed_data = []
with ThreadPoolExecutor(max_workers=100) as executor:
print("Parsing responses...")
future_to_tags = [executor.submit(parse_response, url_response) for url_response in urls_and_responses]

print("Parsing responses...")
# TODO: May want to parallelize this as well, it tends to take a while
for row in tqdm(urls_and_responses):
if tags is not None:
urls_and_headers.append(tags)
for future in tqdm(as_completed(future_to_tags), total=len(future_to_tags)):
data = future.result()
parsed_data.append(data)

tags = {}
res = row["response"]
tags["url"] = row["url"][0]
urls_and_headers = sorted(parsed_data, key=lambda d: d["index"])
[url_headers.pop("index") for url_headers in urls_and_headers]
header_tags_df = pl.DataFrame(urls_and_headers)
clean_header_tags_df = header_tags_df.with_columns(pl.col(["html_title", "meta_description"]).fill_null(""))

if res is None:
tags["http_response"] = "Request failed"
continue
return clean_header_tags_df

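As an aside, the reordering-and-DataFrame step above can be hard to follow in diff form. A minimal standalone sketch of the same idea, with dummy rows (not part of the commit):

```python
import polars as pl

# Parsed rows may arrive out of order, so each carries the index of its source url.
rows = [
    {"index": 1, "url": "https://example.org", "html_title": None, "meta_description": "b"},
    {"index": 0, "url": "https://example.com", "html_title": "a", "meta_description": None},
]
rows.sort(key=lambda d: d["index"])
for row in rows:
    row.pop("index")  # drop the ordering key before building the DataFrame

df = pl.DataFrame(rows).with_columns(pl.col(["html_title", "meta_description"]).fill_null(""))
print(df)
```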
tags["http_response"] = res.status_code
if not res.ok:
continue

try:
soup = BeautifulSoup(res.content, "html.parser", from_encoding="iso-8859-1")
except (bs4.builder.ParserRejectedMarkup, AssertionError):
continue
def exception_handler(loop, context):
if DEBUG:
msg = context.get("exception", context["message"])
print(msg)

tags["html_title"] = soup.title.string if soup.title is not None else ""

meta_tag = soup.find("meta", attrs={"name": "description"})
try:
tags["meta_description"] = meta_tag["content"] if meta_tag is not None else ""
except KeyError:
tags["meta_description"] = ""
async def run_get_response(urls):
"""Asynchronously retrieves responses from a list of urls.
for header_tag in header_tags:
headers = soup.find_all(header_tag)
header_content = [header.text for header in headers]
tags[header_tag] = json.dumps(header_content)
Args:
urls (list): List of urls.
urls_and_headers.append(tags)
header_tags_df = pl.DataFrame(urls_and_headers)
clean_header_tags_df = header_tags_df.with_columns(pl.col(["html_title", "meta_description"]).fill_null(""))
Returns:
list: List of dicts, each holding a url's index and its Response object.
"""
tasks = []
urllib3.disable_warnings()
session = AsyncHTMLSession(workers=100)

print("Retrieving HTML tags...")
for i, url in enumerate(urls):
task = asyncio.ensure_future(get_response(session, url, i))
tasks.append(task)

results = await tqdm.gather(*tasks)

return clean_header_tags_df
await session.close()
return results


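The ensure_future + `tqdm.gather` pattern in `run_get_response` can be tried in isolation. A small sketch with a dummy coroutine standing in for the HTTP call (names here are illustrative, not from the commit):

```python
import asyncio
from tqdm.asyncio import tqdm

async def fake_fetch(i):
    await asyncio.sleep(0.01)  # stand-in for an HTTP request
    return {"index": i, "response": None}

async def demo():
    tasks = [asyncio.ensure_future(fake_fetch(i)) for i in range(10)]
    return await tqdm.gather(*tasks)  # progress bar advances as each task completes

results = asyncio.run(demo())
```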
def get_response(url, index):
async def get_response(session, url, index):
"""Retrieves GET response for given url.
Args:
session (AsyncHTMLSession): Browser session used to retrieve responses.
url (str): Url to request.
index (int): Index of the url to keep results in the same order.
Returns:
dict: Dictionary holding the url's index value and its Response object (None if an error occurred).
"""

headers = {
# Some websites refuse connections from automated requests; setting the User-Agent will circumvent that
"User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/118.0.0.0 Safari/537.36",
# Make sure there's no premature closing of responses before a redirect completes
"accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.9",
}
response = None
debug = True # Set to True to enable debug output
url = url.removesuffix(".json")

try:
response = requests.get(url, headers=headers, timeout=60)
response = await session.get(url, headers=headers, timeout=60)
except requests.exceptions.SSLError:
# This error is raised when the website uses a legacy SSL version, which is not supported by requests
if debug:
if DEBUG:
print("SSLError:", url)
# Retry using legacy SSL session
response = get_legacy_session().get(url, headers=headers, timeout=60)

# Retry without SSL verification
response = await session.get(url, headers=headers, timeout=60, verify=False)
except requests.exceptions.ConnectionError:
# Sometimes this error is raised because the provided url uses http when it should be https and the website does not handle it properly
if debug:
if DEBUG:
print("MaxRetryError:", url)

if not url[4] == "s":
url = url[:4] + "s" + url[4:]
# Retry with https
response = requests.get(url, headers=headers, timeout=60)
response = await session.get(url, headers=headers, timeout=60)
except (urllib3.exceptions.LocationParseError, requests.exceptions.ReadTimeout) as e:
if debug:
if DEBUG:
print(f"{type(e).__name__}: {url}")
except Exception as e:
if debug:
if DEBUG:
print("Exception:", url)
print(traceback.format_exc())
print(str(e))
finally:
if debug:
if DEBUG:
print(url, response)

return {"index": index, "response": response}

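One detail worth calling out from the retry logic above is the http-to-https fallback on connection errors. A standalone sketch of that idea using plain `requests` (the helper name is illustrative, not from the commit):

```python
import requests

def fetch_with_https_fallback(url, headers=None, timeout=60):
    """Try the url as given; on a connection error, retry once over https."""
    try:
        return requests.get(url, headers=headers, timeout=timeout)
    except requests.exceptions.ConnectionError:
        if url.startswith("http://"):
            return requests.get("https://" + url[len("http://"):], headers=headers, timeout=timeout)
        raise
```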

# The following adapter code was shamelessly stolen from Harry Mallon on Stack Overflow:
# https://stackoverflow.com/a/71646353/14045691
class CustomHttpAdapter(requests.adapters.HTTPAdapter):
# "Transport adapter" that allows us to use custom ssl_context.
def __init__(self, ssl_context=None, **kwargs):
self.ssl_context = ssl_context
super().__init__(**kwargs)
async def render_js(urls_responses):
"""Renders JavaScript from a list of urls.
def init_poolmanager(self, connections, maxsize, block=False):
self.poolmanager = urllib3.poolmanager.PoolManager(
num_pools=connections, maxsize=maxsize, block=block, ssl_context=self.ssl_context
)
Args:
urls_responses (list[dict]): List of dictionaries, each containing a url and its response.
"""
print("Rendering JavaScript...")
for url_response in tqdm(urls_responses):
res = url_response["response"]

if res is not None and res.ok:
if DEBUG:
print("Rendering", url_response["url"][0])
task = asyncio.create_task(res.html.arender())

# Some websites will cause the rendering to hang indefinitely so we cancel the task if more than 15 seconds have elapsed
time_elapsed = 0
while not task.done():
time_elapsed += 1
await asyncio.sleep(0.1)

if time_elapsed > 150:
task.cancel()
break

try:
await task
except (pyppeteer.errors.PageError, pyppeteer.errors.NetworkError) as e:
if DEBUG:
print(f"{type(e).__name__}")
except Exception as e:
if DEBUG:
print(traceback.format_exc())
print(str(e))
except asyncio.CancelledError:
if DEBUG:
print("Rendering cancelled")

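The polling loop above acts as a 15-second watchdog around `arender()`. The same effect can be sketched with `asyncio.wait_for`, shown here against a dummy coroutine (illustrative only, not the commit's implementation):

```python
import asyncio

async def slow_render():
    await asyncio.sleep(30)  # stand-in for a page render that hangs

async def render_with_deadline():
    try:
        await asyncio.wait_for(slow_render(), timeout=15)  # cancels the task after 15 seconds
    except asyncio.TimeoutError:
        print("Rendering cancelled")

asyncio.run(render_with_deadline())
```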

def parse_response(url_response):
"""Parses relevant HTML tags from a Response object into a dictionary.
Args:
url_response (dict): Dictionary containing a url and its response.
Returns:
dict: Dictionary of the url and its relevant HTML tags.
"""
tags = {}
res = url_response["response"]
tags["index"] = url_response["index"]
tags["url"] = url_response["url"][0]

def get_legacy_session():
ctx = ssl.create_default_context(ssl.Purpose.SERVER_AUTH)
ctx.options |= 0x4 # OP_LEGACY_SERVER_CONNECT
session = requests.session()
session.mount("https://", CustomHttpAdapter(ctx))
return session
if res is None:
tags["http_response"] = "Request failed"
return tags

tags["http_response"] = res.status_code
if not res.ok:
return tags

def collector_main(df):
header_tags_df = process_urls(df.select(pl.col("url")).rows())
try:
soup = BeautifulSoup(res.html.html, "html.parser")
except (bs4.builder.ParserRejectedMarkup, AssertionError):
return tags

tags["html_title"] = soup.title.string if soup.title is not None else ""

meta_tag = soup.find("meta", attrs={"name": "description"})
try:
tags["meta_description"] = meta_tag["content"] if meta_tag is not None else ""
except KeyError:
tags["meta_description"] = ""

for header_tag in header_tags:
headers = soup.find_all(header_tag)
header_content = [header.text for header in headers]
tags[header_tag] = json.dumps(header_content)

return tags

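To see what `parse_response` extracts, here is a small self-contained sketch of the same BeautifulSoup calls run against an inline HTML string (the sample markup is made up):

```python
import json
from bs4 import BeautifulSoup

html = (
    "<html><head><title>Example</title>"
    '<meta name="description" content="A sample page"></head>'
    "<body><h1>Heading one</h1><h1>Another h1</h1><h2>Sub</h2></body></html>"
)

soup = BeautifulSoup(html, "html.parser")
title = soup.title.string if soup.title is not None else ""
meta_tag = soup.find("meta", attrs={"name": "description"})
description = meta_tag["content"] if meta_tag is not None else ""
h1_content = json.dumps([h.text for h in soup.find_all("h1")])

print(title, description, h1_content)  # Example A sample page ["Heading one", "Another h1"]
```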

def collector_main(df, render_javascript=False):
header_tags_df = process_urls(df.select(pl.col("url")).rows(), render_javascript=render_javascript)

return header_tags_df

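A note on the `url[0]` indexing seen throughout: `collector_main` passes `df.select(pl.col("url")).rows()`, and polars' `rows()` yields tuples, so each url arrives as a 1-tuple. A quick sketch:

```python
import polars as pl

df = pl.DataFrame({"url": ["example.com", "https://example.org"]})
urls = df.select(pl.col("url")).rows()
print(urls)        # [('example.com',), ('https://example.org',)]
print(urls[0][0])  # 'example.com' -- hence the url[0] indexing above
```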
@@ -174,5 +244,4 @@ def collector_main(df):
header_tags_df = collector_main(df)

# Write the updated JSON data to a new file
header_tags_df.write_csv('urls_and_headers.csv', index=False)

header_tags_df.write_csv('urls_and_headers.csv')
6 changes: 6 additions & 0 deletions html_tag_collector/requirements.txt
@@ -0,0 +1,6 @@
requests
requests_html
polars
tqdm
pyppeteer
beautifulsoup4
