From 158901a3db4cc15473c6ac7192739bc6ab6f1d99 Mon Sep 17 00:00:00 2001 From: Emmie Maeda Date: Fri, 23 Feb 2024 00:21:11 -0500 Subject: [PATCH] Initial work to extract forum posts. --- yellowstone/request/forum_posts.py | 5 ++- yellowstone/request/forum_threads.py | 4 +-- yellowstone/scraper.py | 52 ++++++++++++++++++++++++++-- yellowstone/types.py | 26 +++++++++++++- 4 files changed, 77 insertions(+), 10 deletions(-) diff --git a/yellowstone/request/forum_posts.py b/yellowstone/request/forum_posts.py index 7bb8cd7..99b2a1d 100644 --- a/yellowstone/request/forum_posts.py +++ b/yellowstone/request/forum_posts.py @@ -109,10 +109,9 @@ def process_post(source: str, post: Tag) -> ForumPostData: ) created_by = get_entity_user( source, - select_element(source, started, ".printuser a"), + find_element(source, started, class_="printuser"), ) - created_by = 0 - html = str(find_element(source, post, class_="content")) + html = find_element(source, post, class_="content").decode_contents().strip() # NOTE: basic list of revisions can be seen in # changes = post.find(class_="changes") diff --git a/yellowstone/request/forum_threads.py b/yellowstone/request/forum_threads.py index 2462f12..7e8901b 100644 --- a/yellowstone/request/forum_threads.py +++ b/yellowstone/request/forum_threads.py @@ -103,10 +103,8 @@ def process_row(source: str, row: Tag) -> ForumThreadData: ) created_by = get_entity_user( source, - select_element(source, started, ".printuser a"), + find_element(source, started, class_="printuser"), ) - # TODO support created by: Wikidot - # find("span", class_="printuser") -> ~"Wikidot" # Thread's last post last_post = extract_last_forum_post(source, row) diff --git a/yellowstone/scraper.py b/yellowstone/scraper.py index 6e9587c..2f98102 100644 --- a/yellowstone/scraper.py +++ b/yellowstone/scraper.py @@ -11,12 +11,13 @@ from bs4 import BeautifulSoup, PageElement, Tag from .exception import ScrapingError -from .types import ForumLastPostData, UserModuleData +from .types 
import AnonymousUserData, CustomUserData, DeletedUserData, ForumLastPostData, UserModuleData
 
 LAST_THREAD_AND_POST_ID = re.compile(r"/forum/t-(\d+)(?:/[^/]*)?#post-(\d+)")
 TIMESTAMP_REGEX = re.compile(r"time_(\d+)")
 USER_ID_REGEX = re.compile(r"WIKIDOT\.page\.listeners\.userInfo\((\d+)\).*")
 USER_SLUG_REGEX = re.compile(r"https?://www\.wikidot\.com/user:info/([^/]+)")
+USER_GUEST_REGEX = re.compile(r"\s*(.+?) \(.+\)\s*")
 
 logger = logging.getLogger(__name__)
 
@@ -90,9 +91,63 @@ def get_entity_date(source: str, tag: Tag) -> datetime:
     raise ScrapingError(f"Could not find date timestamp from {source}")
 
 
-def get_entity_user(source: str, tag: Tag) -> UserModuleData:
+def get_entity_user(
+    source: str, tag: Tag
+) -> Union[UserModuleData, DeletedUserData, AnonymousUserData, CustomUserData]:
     """
-    Parses out a user module entity.
+    Parses out a user module entity, including unusual cases.
+    Requires being focused on .printuser
+
+    It can output one of:
+    * Regular user (current)
+    * Deleted user (ID only)
+    * Anonymous user (IP)
+    * Guest user (name only, no profile link)
+    * Created by Wikidot (for forum threads)
+
+    Raises ScrapingError if the element is not a .printuser element,
+    or if it does not match any of the cases above.
+    """
+
+    classes = tag.attrs.get("class", [])
+    if "printuser" not in classes:
+        raise ScrapingError(f"Element is not a .printuser in {source}")
+
+    # If this has the "deleted" class, it's a deleted user
+    if "deleted" in classes:
+        return DeletedUserData(int(tag.attrs["data-id"]))
+
+    # If there is a ".printuser a", it's either a regular user or a guest
+    entity = tag.find("a")
+    if entity is not None:
+        # Anonymous users have an IP address
+        ip_entity = entity.find(class_="ip")
+        if ip_entity is not None:
+            ip = ip_entity.text
+            if ip.startswith("(") and ip.endswith(")"):
+                ip = ip[1:-1]
+            return AnonymousUserData(ip)
+
+        # Guests don't have profile links
+        if entity.attrs.get("href") == "javascript:;":
+            guest_name = regex_extract_str(source, entity.text.strip(), USER_GUEST_REGEX)
+            return CustomUserData(guest_name)
+
+        # Regular users
+        return get_entity_user_exists(source, entity)
+
+    # Created by Wikidot
+    if tag.text.strip() == "Wikidot":
+        return CustomUserData("wikidot")
+
+    # Fail loudly on unknown markup instead of dropping into a debugger
+    raise ScrapingError(f"Unrecognized user element in {source}")
+
+
+def 
get_entity_user_exists(source: str, tag: Tag) -> UserModuleData: + """ + Parses out a user module entity, when it is known to be "real" (e.g. not anonymous, deleted, etc). + Requires being focused on the ".printuser a" element. Example ```html diff --git a/yellowstone/types.py b/yellowstone/types.py index 70017b3..d531d49 100644 --- a/yellowstone/types.py +++ b/yellowstone/types.py @@ -4,7 +4,7 @@ from dataclasses import dataclass from datetime import datetime -from typing import Union +from typing import Optional, Union Json = Union[None, int, float, str, list["Json"], dict[str, "Json"]] @@ -16,6 +16,30 @@ class UserModuleData: name: str +@dataclass +class DeletedUserData: + id: int + + +@dataclass +class AnonymousUserData: + ip: str + + +@dataclass +class CustomUserData: + name: str + + @property + def is_system(self) -> bool: + """ + The 'Wikidot' user is special, and not a regular user. + It designates the system user taking an action. + """ + + return self.name.casefold() == "wikidot" + + @dataclass class ForumLastPostData: posted_time: datetime