Skip to content

Commit

Permalink
Initial work to extract forum posts.
Browse files Browse the repository at this point in the history
  • Loading branch information
emmiegit committed Feb 23, 2024
1 parent 6dbf22e commit 158901a
Show file tree
Hide file tree
Showing 4 changed files with 77 additions and 10 deletions.
5 changes: 2 additions & 3 deletions yellowstone/request/forum_posts.py
Original file line number Diff line number Diff line change
Expand Up @@ -109,10 +109,9 @@ def process_post(source: str, post: Tag) -> ForumPostData:
)
created_by = get_entity_user(
source,
select_element(source, started, ".printuser a"),
find_element(source, started, class_="printuser"),
)
created_by = 0
html = str(find_element(source, post, class_="content"))
html = find_element(source, post, class_="content").decode_contents().strip()

# NOTE: basic list of revisions can be seen in
# changes = post.find(class_="changes")
Expand Down
4 changes: 1 addition & 3 deletions yellowstone/request/forum_threads.py
Original file line number Diff line number Diff line change
Expand Up @@ -103,10 +103,8 @@ def process_row(source: str, row: Tag) -> ForumThreadData:
)
created_by = get_entity_user(
source,
select_element(source, started, ".printuser a"),
find_element(source, started, class_="printuser"),
)
# TODO support created by: Wikidot
# find("span", class_="printuser") -> ~"Wikidot"

# Thread's last post
last_post = extract_last_forum_post(source, row)
Expand Down
52 changes: 49 additions & 3 deletions yellowstone/scraper.py
Original file line number Diff line number Diff line change
Expand Up @@ -11,12 +11,13 @@
from bs4 import BeautifulSoup, PageElement, Tag

from .exception import ScrapingError
from .types import (
    AnonymousUserData,
    CustomUserData,
    DeletedUserData,
    ForumLastPostData,
    UserModuleData,
)

# Captures the thread ID and the post ID from a forum link such as
# "/forum/t-123/some-title#post-456" (the path segment after the thread ID is optional).
LAST_THREAD_AND_POST_ID = re.compile(r"/forum/t-(\d+)(?:/[^/]*)?#post-(\d+)")
# Captures the digits that follow a "time_" prefix.
# NOTE(review): presumably matched against an element's CSS class to get a timestamp -- confirm.
TIMESTAMP_REGEX = re.compile(r"time_(\d+)")
# Captures the numeric argument of a "WIKIDOT.page.listeners.userInfo(<id>)" call.
USER_ID_REGEX = re.compile(r"WIKIDOT\.page\.listeners\.userInfo\((\d+)\).*")
# Captures the path segment after "user:info/" in an http(s) www.wikidot.com URL.
USER_SLUG_REGEX = re.compile(r"https?://www\.wikidot\.com/user:info/([^/]+)")
# Captures the text preceding a space and a parenthesised suffix, e.g. "name (extra)".
# get_entity_user() applies this to a guest's link text to extract the guest name.
USER_GUEST_REGEX = re.compile(r"\s*(.+?) \(.+\)\s*")

# Module-level logger, named after this module.
logger = logging.getLogger(__name__)

Expand Down Expand Up @@ -90,9 +91,54 @@ def get_entity_date(source: str, tag: Tag) -> datetime:
raise ScrapingError(f"Could not find date timestamp from {source}")


def get_entity_user(
    source: str,
    tag: Tag,
) -> Union[UserModuleData, DeletedUserData, AnonymousUserData, CustomUserData]:
    """
    Parses out a user module entity, including unusual cases.

    Requires being focused on the ``.printuser`` element itself
    (not the ``a`` element inside of it).

    It can output one of:
    * Regular user (current)
    * Deleted user (ID only)
    * Anonymous user (IP)
    * Guest user (name only, no profile link)
    * Created by Wikidot (for forum threads)

    Args:
        source: Description of what is being scraped, used in error messages.
        tag: The element carrying the ``printuser`` class.

    Raises:
        ScrapingError: If ``tag`` is not a ``printuser`` element, if a deleted
            user has no usable ``data-id``, or if the element does not match
            any of the known user representations.
    """

    # Validate with a real exception rather than `assert`, which is
    # stripped when Python runs with -O.
    classes = tag.attrs.get("class") or ()
    if "printuser" not in classes:
        raise ScrapingError(f"Element is not a .printuser entity in {source}")

    # If this has the "deleted" class, it's a deleted user
    if "deleted" in classes:
        try:
            return DeletedUserData(int(tag.attrs["data-id"]))
        except (KeyError, ValueError) as error:
            raise ScrapingError(
                f"Deleted user is missing a valid data-id in {source}",
            ) from error

    # If there is a ".printuser a", it's a regular user, a guest, or anonymous
    entity = tag.find("a")
    if entity is not None:
        # Anonymous users have an IP address, shown wrapped in parentheses
        ip_entity = entity.find(class_="ip")
        if ip_entity is not None:
            ip = ip_entity.text.strip()
            if ip.startswith("(") and ip.endswith(")"):
                ip = ip[1:-1]
            return AnonymousUserData(ip)

        # Guests don't have profile links
        if entity.attrs.get("href") == "javascript:;":
            guest_name = regex_extract_str(
                source,
                entity.text.strip(),
                USER_GUEST_REGEX,
            )
            return CustomUserData(guest_name)

        # Regular users
        return get_entity_user_exists(source, entity)

    # Created by Wikidot
    if tag.text.strip() == "Wikidot":
        return CustomUserData("wikidot")

    # Unknown representation: fail loudly instead of dropping into a debugger
    # (which hangs unattended runs) or implicitly returning None.
    raise ScrapingError(f"Could not determine user data from {source}")


def get_entity_user_exists(source: str, tag: Tag) -> UserModuleData:
"""
Parses out a user module entity, when it is known to be "real" (e.g. not anonymous, deleted, etc).
Requires being focused on the ".printuser a" element.
Example
```html
Expand Down
26 changes: 25 additions & 1 deletion yellowstone/types.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,7 @@

from dataclasses import dataclass
from datetime import datetime
from typing import Union
from typing import Optional, Union

Json = Union[None, int, float, str, list["Json"], dict[str, "Json"]]

Expand All @@ -16,6 +16,30 @@ class UserModuleData:
name: str


@dataclass
class DeletedUserData:
    """
    Reference to a user account that has been deleted.
    Only the numeric user ID is retained.
    """

    id: int


@dataclass
class AnonymousUserData:
    """
    An anonymous (not logged in) user, identified only by IP address.
    """

    ip: str


@dataclass
class CustomUserData:
    """
    A user known only by a display name, with no associated account data.
    """

    name: str

    @property
    def is_system(self) -> bool:
        """
        Whether this entry refers to the special 'Wikidot' user.

        That user is not a regular account; it designates the system
        itself taking an action. The comparison ignores letter case.
        """

        folded_name = self.name.casefold()
        return folded_name == "wikidot"


@dataclass
class ForumLastPostData:
posted_time: datetime
Expand Down

0 comments on commit 158901a

Please sign in to comment.