Output ForumPostData from partial chunks.
emmiegit committed Mar 24, 2024
1 parent 4d113fb commit 7251ab9
Showing 1 changed file with 31 additions and 31 deletions.
62 changes: 31 additions & 31 deletions yellowstone/request/forum_posts.py
@@ -13,11 +13,9 @@

from ..scraper import (
    find_element,
    get_entity_date,
    get_entity_user,
    make_soup,
    regex_extract_int,
    regex_extract_str,
)
from ..types import ForumPostUser, assert_is_tag
from ..utils import chunks
Expand Down Expand Up @@ -80,13 +78,6 @@ def get(
    )
    assert category_id == category_id_ex, "category ID in scraped thread doesn't match"

    thread_title_child = tuple(breadcrumbs.children)[-1]
    assert isinstance(
        thread_title_child,
        str,
    ), "last element in forum breadcrumbs is not text"
    thread_title = regex_extract_str(source, thread_title_child, THREAD_TITLE)

    # Here, we could get the forum thread's creator, created time, and description.
    # However we already got that above so it's not necessary here.

@@ -95,35 +86,44 @@
        soup.find(id="thread-container-posts"),
        "thread container",
    )
    posts = list(
    partial_posts = tuple(
        map(
            lambda post: process_post(source, post),
            lambda post: process_post_partial(source, post),
            container.find_all(class_="post"),
        )
    )

    # Then do batch post fetches until everything is populated.
    # Wikidot allows at most 10 posts to be fetched per request.
    for chunk in chunks(posts, 10):
        results = wikidot.proxy.posts.get({"site": site_slug, "posts": [str(post.id) for post in chunk]})
        for post in chunk:
            assert isinstance(post, dict)
            data = results[str(post.id)]
            assert post.id == data["id"]
            post.parent = data["reply_to"]
            post.title = data["title"]
            post.wikitext = data["content"]
            post.html = data["html"].strip()
            post.created_at = datetime.fromisoformat(data["created_at"])

    # TODO queue per-post jobs into batches of at most 10
    # we don't need to be *that* efficient, just group
    # by thread, so if a thread has < 10 posts just put
    # them in one request and that's that
    _ = thread_title


def process_post(source: str, post: Tag) -> ForumPostDataPartial:
    posts = []
    for chunk in chunks(partial_posts, 10):
        results = wikidot.proxy.posts.get(
            {
                "site": site_slug,
                "posts": [str(post.id) for post in chunk],
            }
        )
        for partial in chunk:
            assert isinstance(partial, dict)
            data = results[str(partial.id)]

            assert partial.id == data["id"]
            posts.append(
                ForumPostData(
                    id=partial.id,
                    parent=data["reply_to"],
                    title=data["title"],
                    created_by=partial.created_by,
                    created_at=datetime.fromisoformat(data["created_at"]),
                    wikitext=data["content"],
                    html=data["html"].strip(),
                )
            )

    return posts


def process_post_partial(source: str, post: Tag) -> ForumPostDataPartial:
    post_id = regex_extract_int(source, post.attrs["id"], POST_ID)
    source = f"{source} post {post_id}"
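
The batching in this commit relies on the chunks() helper imported from ..utils, which is not shown in the diff. A minimal sketch of the grouping behaviour the loop depends on, assuming a simple slice-based generator (the real helper in yellowstone may be implemented differently):

from typing import Iterator, Sequence, TypeVar

T = TypeVar("T")

def chunks(items: Sequence[T], size: int) -> Iterator[Sequence[T]]:
    # Yield consecutive slices of at most `size` items each,
    # e.g. groups of 10 posts per proxy request.
    for start in range(0, len(items), size):
        yield items[start : start + size]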

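For reference, the per-post records consumed by the new loop are keyed by the post ID as a string, and only the fields id, reply_to, title, content, html, and created_at (an ISO-8601 string) are read. The values below are invented for illustration and are not documentation of the actual proxy response:

example_results = {
    "12345": {
        "id": 12345,
        "reply_to": None,  # parent post ID, if any
        "title": "Example post title",
        "content": "Example post wikitext",
        "html": "<p>Example post HTML</p>\n",
        "created_at": "2024-03-24T00:00:00+00:00",  # parsed via datetime.fromisoformat()
    },
}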

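Finally, a hypothetical sketch of the ForumPostData shape this commit now emits, inferred only from the constructor call in the diff; the real class in yellowstone's types module may be defined differently or carry extra fields:

from dataclasses import dataclass
from datetime import datetime
from typing import Optional

@dataclass
class ForumPostData:
    id: int
    parent: Optional[int]  # the "reply_to" value from the fetched data
    title: str
    created_by: object  # a ForumPostUser in the real code
    created_at: datetime
    wikitext: str
    html: str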