Merge pull request #26 from EvilDrPurple/tag-collector-fixes
Overhaul tag collector
mbodeantor authored Dec 11, 2023
2 parents de9de0a + f0696fb commit 4ac625c
Showing 3 changed files with 160 additions and 80 deletions.
9 changes: 7 additions & 2 deletions html_tag_collector/README.md
@@ -6,12 +6,17 @@ This script adds HTML properties to a JSON file of existing URLs.
1. If running from the command line, pass the name of the file you want to run as an argument and make sure the file is in the same directory. It should be populated with URLs and properties as in the example provided. If importing `collector_main` instead, it expects a polars dataframe as input (a usage sketch follows this list).
2. Optionally, create a virtual environment. This is especially useful if you don't already have `beautifulsoup4`, `requests`, and `polars` installed. In your terminal:

```
```commandline
python -m venv collector-environment
source collector-environment/bin/activate
```

3. Run `pip install beautifulsoup4` and `pip install requests` and `pip install polars`, if they are not installed already.
3. Now install the required python libraries:

```commandline
pip install -r requirements.txt
```

4. Run `python3 collector.py`.
5. If running from the command line, check the directory: you should now have a `urls_and_headers.csv` file. Invalid URLs are removed. Otherwise, the function returns a processed polars dataframe.

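For reference, a minimal sketch of the import-based usage described in step 1 (assuming the script is importable as `collector` and the dataframe has a `url` column, as in the example file):

```python
import polars as pl
from collector import collector_main  # module name assumed from collector.py

# Build a small dataframe of URLs; scheme-less entries are handled by the collector.
df = pl.DataFrame({"url": ["example.com", "https://example.org"]})

header_tags_df = collector_main(df)
header_tags_df.write_csv("urls_and_headers.csv")
```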
225 changes: 147 additions & 78 deletions html_tag_collector/collector.py
@@ -6,161 +6,231 @@
import traceback

import requests
from requests_html import AsyncHTMLSession
import asyncio
import pyppeteer
from tqdm import tqdm
from tqdm.asyncio import tqdm
import bs4
from bs4 import BeautifulSoup
import sys
import polars as pl

# Define the list of header tags we want to extract
header_tags = ['h1', 'h2', 'h3', 'h4', 'h5', 'h6']
DEBUG = False # Set to True to enable debug output

# Define a function to process a URL and update the JSON object
def process_urls(urls):
def process_urls(urls, render_javascript=False):
"""Process a list of urls and retrieve their HTML tags.
Args:
urls (list): List of urls.
render_javascript (bool): Whether to render each page's JavaScript before parsing. Defaults to False.
Returns:
polars.DataFrame: DataFrame of HTML tags for each url.
"""
results = []
new_urls = ["https://" + url[0] if not url[0].startswith("http") else url[0] for url in urls]

with ThreadPoolExecutor(max_workers=100) as executor:
print("Retrieving HTML tags...")
future_to_url = [executor.submit(get_response, url, i) for i, url in enumerate(new_urls)]

for future in tqdm(as_completed(future_to_url), total=len(future_to_url)):
data = future.result()
results.append(data)

loop = asyncio.get_event_loop()
loop.set_exception_handler(exception_handler)
future = asyncio.ensure_future(run_get_response(new_urls))
loop.run_until_complete(future)
results = future.result()

results.sort(key=lambda d: d["index"])
urls_and_responses = [{"url": urls[i], "response": result["response"]} for i, result in enumerate(results)]
urls_and_responses = [{"index": result["index"], "url": urls[i], "response": result["response"]} for i, result in enumerate(results)]

if render_javascript:
future = asyncio.ensure_future(render_js(urls_and_responses))
loop.run_until_complete(future)
results = future.result()

urls_and_headers = []
tags = None
parsed_data = []
with ThreadPoolExecutor(max_workers=100) as executor:
print("Parsing responses...")
future_to_tags = [executor.submit(parse_response, url_response) for url_response in urls_and_responses]

print("Parsing responses...")
# TODO: May want to parallelize this as well, it tends to take a while
for row in tqdm(urls_and_responses):
if tags is not None:
urls_and_headers.append(tags)
for future in tqdm(as_completed(future_to_tags), total=len(future_to_tags)):
data = future.result()
parsed_data.append(data)

tags = {}
res = row["response"]
tags["url"] = row["url"][0]
urls_and_headers = sorted(parsed_data, key=lambda d: d["index"])
[url_headers.pop("index") for url_headers in urls_and_headers]
header_tags_df = pl.DataFrame(urls_and_headers)
clean_header_tags_df = header_tags_df.with_columns(pl.col(["html_title", "meta_description"]).fill_null(""))

if res is None:
tags["http_response"] = "Request failed"
continue
return clean_header_tags_df

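As an aside, the reordering-and-DataFrame step above can be hard to follow in diff form. A minimal standalone sketch of the same idea, with dummy rows (not part of the commit):

```python
import polars as pl

# Parsed rows may arrive out of order, so each carries the index of its source url.
rows = [
    {"index": 1, "url": "https://example.org", "html_title": None, "meta_description": "b"},
    {"index": 0, "url": "https://example.com", "html_title": "a", "meta_description": None},
]
rows.sort(key=lambda d: d["index"])
for row in rows:
    row.pop("index")  # drop the ordering key before building the DataFrame

df = pl.DataFrame(rows).with_columns(pl.col(["html_title", "meta_description"]).fill_null(""))
print(df)
```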
tags["http_response"] = res.status_code
if not res.ok:
continue

try:
soup = BeautifulSoup(res.content, "html.parser", from_encoding="iso-8859-1")
except (bs4.builder.ParserRejectedMarkup, AssertionError):
continue
def exception_handler(loop, context):
if DEBUG:
msg = context.get("exception", context["message"])
print(msg)

tags["html_title"] = soup.title.string if soup.title is not None else ""

meta_tag = soup.find("meta", attrs={"name": "description"})
try:
tags["meta_description"] = meta_tag["content"] if meta_tag is not None else ""
except KeyError:
tags["meta_description"] = ""
async def run_get_response(urls):
"""Asynchronously retrieves responses from a list of urls.
for header_tag in header_tags:
headers = soup.find_all(header_tag)
header_content = [header.text for header in headers]
tags[header_tag] = json.dumps(header_content)
Args:
urls (list): List of urls.
urls_and_headers.append(tags)
header_tags_df = pl.DataFrame(urls_and_headers)
clean_header_tags_df = header_tags_df.with_columns(pl.col(["html_title", "meta_description"]).fill_null(""))
Returns:
list: List of dicts, each holding a url's index and its Response object.
"""
tasks = []
urllib3.disable_warnings()
session = AsyncHTMLSession(workers=100)

print("Retrieving HTML tags...")
for i, url in enumerate(urls):
task = asyncio.ensure_future(get_response(session, url, i))
tasks.append(task)

results = await tqdm.gather(*tasks)

return clean_header_tags_df
await session.close()
return results


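The ensure_future + `tqdm.gather` pattern in `run_get_response` can be tried in isolation. A small sketch with a dummy coroutine standing in for the HTTP call (names here are illustrative, not from the commit):

```python
import asyncio
from tqdm.asyncio import tqdm

async def fake_fetch(i):
    await asyncio.sleep(0.01)  # stand-in for an HTTP request
    return {"index": i, "response": None}

async def demo():
    tasks = [asyncio.ensure_future(fake_fetch(i)) for i in range(10)]
    return await tqdm.gather(*tasks)  # progress bar advances as each task completes

results = asyncio.run(demo())
```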
def get_response(url, index):
async def get_response(session, url, index):
"""Retrieves GET response for given url.
Args:
session (AsyncHTMLSession): Browser session used to retrieve responses.
url (str): Url to request.
index (int): Index of the url to keep results in the same order.
Returns:
dict: Dictionary holding the url's index value and its Response object (None if an error occurred).
"""

headers = {
# Some websites refuse connections from automated requests; setting the User-Agent will circumvent that
"User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/118.0.0.0 Safari/537.36",
# Make sure there's no premature closing of responses before a redirect completes
"accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.9",
}
response = None
debug = True # Set to True to enable debug output
url = url.removesuffix(".json")

try:
response = requests.get(url, headers=headers, timeout=60)
response = await session.get(url, headers=headers, timeout=60)
except requests.exceptions.SSLError:
# This error is raised when the website uses a legacy SSL version, which is not supported by requests
if debug:
if DEBUG:
print("SSLError:", url)
# Retry using legacy SSL session
response = get_legacy_session().get(url, headers=headers, timeout=60)

# Retry without SSL verification
response = await session.get(url, headers=headers, timeout=60, verify=False)
except requests.exceptions.ConnectionError:
# Sometimes this error is raised because the provided url uses http when it should be https and the website does not handle it properly
if debug:
if DEBUG:
print("MaxRetryError:", url)

if not url[4] == "s":
url = url[:4] + "s" + url[4:]
# Retry with https
response = requests.get(url, headers=headers, timeout=60)
response = await session.get(url, headers=headers, timeout=60)
except (urllib3.exceptions.LocationParseError, requests.exceptions.ReadTimeout) as e:
if debug:
if DEBUG:
print(f"{type(e).__name__}: {url}")
except Exception as e:
if debug:
if DEBUG:
print("Exception:", url)
print(traceback.format_exc())
print(str(e))
finally:
if debug:
if DEBUG:
print(url, response)

return {"index": index, "response": response}

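One detail worth calling out from the retry logic above is the http-to-https fallback on connection errors. A standalone sketch of that idea using plain `requests` (the helper name is illustrative, not from the commit):

```python
import requests

def fetch_with_https_fallback(url, headers=None, timeout=60):
    """Try the url as given; on a connection error, retry once over https."""
    try:
        return requests.get(url, headers=headers, timeout=timeout)
    except requests.exceptions.ConnectionError:
        if url.startswith("http://"):
            return requests.get("https://" + url[len("http://"):], headers=headers, timeout=timeout)
        raise
```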

# The following adapter code was shamelessly stolen from Harry Mallon on Stack Overflow:
# https://stackoverflow.com/a/71646353/14045691
class CustomHttpAdapter(requests.adapters.HTTPAdapter):
# "Transport adapter" that allows us to use custom ssl_context.
def __init__(self, ssl_context=None, **kwargs):
self.ssl_context = ssl_context
super().__init__(**kwargs)
async def render_js(urls_responses):
"""Renders JavaScript from a list of urls.
def init_poolmanager(self, connections, maxsize, block=False):
self.poolmanager = urllib3.poolmanager.PoolManager(
num_pools=connections, maxsize=maxsize, block=block, ssl_context=self.ssl_context
)
Args:
urls_responses (list[dict]): List of dictionaries, each containing a url and its response.
"""
print("Rendering JavaScript...")
for url_response in tqdm(urls_responses):
res = url_response["response"]

if res is not None and res.ok:
if DEBUG:
print("Rendering", url_response["url"][0])
task = asyncio.create_task(res.html.arender())

# Some websites will cause the rendering to hang indefinitely so we cancel the task if more than 15 seconds have elapsed
time_elapsed = 0
while not task.done():
time_elapsed += 1
await asyncio.sleep(0.1)

if time_elapsed > 150:
task.cancel()
break

try:
await task
except (pyppeteer.errors.PageError, pyppeteer.errors.NetworkError) as e:
if DEBUG:
print(f"{type(e).__name__}")
except Exception as e:
if DEBUG:
print(traceback.format_exc())
print(str(e))
except asyncio.CancelledError:
if DEBUG:
print("Rendering cancelled")

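The polling loop above acts as a 15-second watchdog around `arender()`. The same effect can be sketched with `asyncio.wait_for`, shown here against a dummy coroutine (illustrative only, not the commit's implementation):

```python
import asyncio

async def slow_render():
    await asyncio.sleep(30)  # stand-in for a page render that hangs

async def render_with_deadline():
    try:
        await asyncio.wait_for(slow_render(), timeout=15)  # cancels the task after 15 seconds
    except asyncio.TimeoutError:
        print("Rendering cancelled")

asyncio.run(render_with_deadline())
```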

def parse_response(url_response):
"""Parses relevant HTML tags from a Response object into a dictionary.
Args:
url_response (dict): Dictionary containing a url and its response.
Returns:
dict: Dictionary of the url and its relevant HTML tags.
"""
tags = {}
res = url_response["response"]
tags["index"] = url_response["index"]
tags["url"] = url_response["url"][0]

def get_legacy_session():
ctx = ssl.create_default_context(ssl.Purpose.SERVER_AUTH)
ctx.options |= 0x4 # OP_LEGACY_SERVER_CONNECT
session = requests.session()
session.mount("https://", CustomHttpAdapter(ctx))
return session
if res is None:
tags["http_response"] = "Request failed"
return tags

tags["http_response"] = res.status_code
if not res.ok:
return tags

def collector_main(df):
header_tags_df = process_urls(df.select(pl.col("url")).rows())
try:
soup = BeautifulSoup(res.html.html, "html.parser")
except (bs4.builder.ParserRejectedMarkup, AssertionError):
return tags

tags["html_title"] = soup.title.string if soup.title is not None else ""

meta_tag = soup.find("meta", attrs={"name": "description"})
try:
tags["meta_description"] = meta_tag["content"] if meta_tag is not None else ""
except KeyError:
tags["meta_description"] = ""

for header_tag in header_tags:
headers = soup.find_all(header_tag)
header_content = [header.text for header in headers]
tags[header_tag] = json.dumps(header_content)

return tags

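To see what `parse_response` extracts, here is a small self-contained sketch of the same BeautifulSoup calls run against an inline HTML string (the sample markup is made up):

```python
import json
from bs4 import BeautifulSoup

html = (
    "<html><head><title>Example</title>"
    '<meta name="description" content="A sample page"></head>'
    "<body><h1>Heading one</h1><h1>Another h1</h1><h2>Sub</h2></body></html>"
)

soup = BeautifulSoup(html, "html.parser")
title = soup.title.string if soup.title is not None else ""
meta_tag = soup.find("meta", attrs={"name": "description"})
description = meta_tag["content"] if meta_tag is not None else ""
h1_content = json.dumps([h.text for h in soup.find_all("h1")])

print(title, description, h1_content)  # Example A sample page ["Heading one", "Another h1"]
```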

def collector_main(df, render_javascript=False):
header_tags_df = process_urls(df.select(pl.col("url")).rows(), render_javascript=render_javascript)

return header_tags_df

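A note on the `url[0]` indexing seen throughout: `collector_main` passes `df.select(pl.col("url")).rows()`, and polars' `rows()` yields tuples, so each url arrives as a 1-tuple. A quick sketch:

```python
import polars as pl

df = pl.DataFrame({"url": ["example.com", "https://example.org"]})
urls = df.select(pl.col("url")).rows()
print(urls)        # [('example.com',), ('https://example.org',)]
print(urls[0][0])  # 'example.com' -- hence the url[0] indexing above
```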
@@ -174,5 +244,4 @@ def collector_main(df):
header_tags_df = collector_main(df)

# Write the updated JSON data to a new file
header_tags_df.write_csv('urls_and_headers.csv', index=False)

header_tags_df.write_csv('urls_and_headers.csv')
6 changes: 6 additions & 0 deletions html_tag_collector/requirements.txt
@@ -0,0 +1,6 @@
requests
requests_html
polars
tqdm
pyppeteer
beautifulsoup4
