Skip to content

Commit

Permalink
Fix imports creating duplicate Editions
Browse files Browse the repository at this point in the history
This fixes bugs at both the sending and receiving instances:
As the primary fix is at the sending end, a full fix will need to be collaborative at the network level.

**Fixing at the sending end: don't send duplicate GeneratedNotes**

The primary fix is to `handle_reading_status`: it calls `create_generated_note` which saves and broadcasts the GeneratedNote.
Before this fix, `handle_reading_status` then saved and broadcast the note again, creating two incoming GeneratedNotes
for all federated BookWyrm servers, and thus triggering simultaneous or near-simultaneous imports of the associated book
within the `tag` reference. This is then dereferenced, creating two identical Works, the first of which ends up empty
and the second of which ends up with two identical Editions.

**Fixing at the sending end: delay BookStatus broadcasts**

My testing doesn't indicate one way or the other whether this happens, but there's a theoretical race condition for incoming
editions that are ListItem or ShelfBook items. The Add activity needs to create the Edition and Work if they are not already
on the receiving instance, and so does the Comment or GeneratedNote. The fix in `ActivitypubMixin::broadcast` sets a delay on
these statuses to ensure that the Add activity is processed first, and thus the book will already exist when the status is
processed.

**Fixing at the receiving end: prevent dereferencing the incoming Edition**

The book attached to a ListItem or ShelfBook is an Edition. This means that when a receiving instance dereferences it, we
end up with a recursion: The Edition dereferences the parent Work, which dereferences all the child editions in celery tasks.
This includes the Edition that triggered the whole process in the first place.

I couldn't create a reproducable test of this, but I've seen evidence that this can cause a race condition where the incoming
Edition ends up being duplicated - the task to create child Editions checks for existence before the original triggering
Edition has saved.

The changes to Note, Fields, and `ActivityObject::to_model` are aimed at preventing this by allowing us to compare related
field ids to the original triggering remote_id before running `set_related_field` and excluding them if there is a match.
  • Loading branch information
hughrun committed Nov 16, 2024
1 parent 13381b9 commit 32b041d
Show file tree
Hide file tree
Showing 6 changed files with 45 additions and 13 deletions.
6 changes: 6 additions & 0 deletions bookwyrm/activitypub/base_activity.py
Original file line number Diff line number Diff line change
Expand Up @@ -120,6 +120,7 @@ def to_model(
save: bool = True,
overwrite: bool = True,
allow_external_connections: bool = True,
trigger=None,
) -> Optional[TBookWyrmModel]:
"""convert from an activity to a model instance. Args:
model: the django model that this object is being converted to
Expand All @@ -133,6 +134,9 @@ def to_model(
only update blank fields if false
allow_external_connections: look up missing data if true,
throw an exception if false and an external connection is needed
trigger: the object that originally triggered this
self.to_model. e.g. if this is a Work being dereferenced from
an incoming Edition
"""
model = model or get_model_from_type(self.type)

Expand Down Expand Up @@ -223,6 +227,8 @@ def to_model(
related_field_name = model_field.field.name

for item in values:
if trigger and item == trigger.remote_id:
continue
set_related_field.delay(
related_model.__name__,
instance.__class__.__name__,
Expand Down
1 change: 1 addition & 0 deletions bookwyrm/activitypub/note.py
Original file line number Diff line number Diff line change
Expand Up @@ -50,6 +50,7 @@ def to_model(
save=True,
overwrite=True,
allow_external_connections=True,
trigger=None,
):
instance = super().to_model(
model, instance, allow_create, save, overwrite, allow_external_connections
Expand Down
14 changes: 14 additions & 0 deletions bookwyrm/models/activitypub_mixin.py
Original file line number Diff line number Diff line change
Expand Up @@ -129,7 +129,20 @@ def find_existing(cls, data):

def broadcast(self, activity, sender, software=None, queue=BROADCAST):
"""send out an activity"""

# if we're posting about ShelfBooks, set a delay to give the base activity
# time to add the book on remote servers first to avoid race conditions
countdown = (
10
if (
isinstance(activity, object)
and not isinstance(activity["object"], str)
and activity["object"].get("type", None) in ["GeneratedNote", "Comment"]
)
else 0
)
broadcast_task.apply_async(
countdown=countdown,
args=(
sender.id,
json.dumps(activity, cls=activitypub.ActivityEncoder),
Expand Down Expand Up @@ -227,6 +240,7 @@ def save(
return

try:
# TODO: here is where we might use an ActivityPub extension instead
# do we have a "pure" activitypub version of this for mastodon?
if software != "bookwyrm" and hasattr(self, "pure_content"):
pure_activity = self.to_create_activity(user, pure=True)
Expand Down
24 changes: 14 additions & 10 deletions bookwyrm/models/fields.py
Original file line number Diff line number Diff line change
Expand Up @@ -86,7 +86,9 @@ def set_field_from_activity(
raise
value = getattr(data, "actor")
formatted = self.field_from_activity(
value, allow_external_connections=allow_external_connections
value,
allow_external_connections=allow_external_connections,
trigger=instance,
)
if formatted is None or formatted is MISSING or formatted == {}:
return False
Expand Down Expand Up @@ -128,7 +130,7 @@ def field_to_activity(self, value):
return value

# pylint: disable=unused-argument
def field_from_activity(self, value, allow_external_connections=True):
def field_from_activity(self, value, allow_external_connections=True, trigger=None):
"""formatter to convert activitypub into a model value"""
if value and hasattr(self, "activitypub_wrapper"):
value = value.get(self.activitypub_wrapper)
Expand All @@ -150,7 +152,9 @@ def __init__(self, *args, load_remote=True, **kwargs):
self.load_remote = load_remote
super().__init__(*args, **kwargs)

def field_from_activity(self, value, allow_external_connections=True):
def field_from_activity(self, value, allow_external_connections=True, trigger=None):
"""trigger: the object that triggered this deserialization.
For example the Edition for which self is the parent Work"""
if not value:
return None

Expand All @@ -160,7 +164,7 @@ def field_from_activity(self, value, allow_external_connections=True):
# only look in the local database
return related_model.find_existing(value.serialize())
# this is an activitypub object, which we can deserialize
return value.to_model(model=related_model)
return value.to_model(model=related_model, trigger=trigger)
try:
# make sure the value looks like a remote id
validate_remote_id(value)
Expand Down Expand Up @@ -336,7 +340,7 @@ def field_to_activity(self, value):
return f"{value.instance.remote_id}/{self.name}"
return [i.remote_id for i in value.all()]

def field_from_activity(self, value, allow_external_connections=True):
def field_from_activity(self, value, allow_external_connections=True, trigger=None):
if value is None or value is MISSING:
return None
if not isinstance(value, list):
Expand Down Expand Up @@ -386,7 +390,7 @@ def field_to_activity(self, value):
)
return tags

def field_from_activity(self, value, allow_external_connections=True):
def field_from_activity(self, value, allow_external_connections=True, trigger=None):
if not isinstance(value, list):
# GoToSocial DMs and single-user mentions are
# sent as objects, not as an array of objects
Expand Down Expand Up @@ -481,7 +485,7 @@ def field_to_activity(self, value, alt=None):

return activitypub.Image(url=url, name=alt)

def field_from_activity(self, value, allow_external_connections=True):
def field_from_activity(self, value, allow_external_connections=True, trigger=None):
image_slug = value
# when it's an inline image (User avatar/icon, Book cover), it's a json
# blob, but when it's an attached image, it's just a url
Expand Down Expand Up @@ -538,7 +542,7 @@ def field_to_activity(self, value):
return None
return value.isoformat()

def field_from_activity(self, value, allow_external_connections=True):
def field_from_activity(self, value, allow_external_connections=True, trigger=None):
missing_fields = datetime(1970, 1, 1) # "2022-10" => "2022-10-01"
try:
date_value = dateutil.parser.parse(value, default=missing_fields)
Expand All @@ -556,7 +560,7 @@ class PartialDateField(ActivitypubFieldMixin, PartialDateModel):
def field_to_activity(self, value) -> str:
return value.partial_isoformat() if value else None

def field_from_activity(self, value, allow_external_connections=True):
def field_from_activity(self, value, allow_external_connections=True, trigger=None):
# pylint: disable=no-else-return
try:
return from_partial_isoformat(value)
Expand Down Expand Up @@ -584,7 +588,7 @@ def field_from_activity(self, value, allow_external_connections=True):
class HtmlField(ActivitypubFieldMixin, models.TextField):
"""a text field for storing html"""

def field_from_activity(self, value, allow_external_connections=True):
def field_from_activity(self, value, allow_external_connections=True, trigger=None):
if not value or value == MISSING:
return None
return clean(value)
Expand Down
10 changes: 9 additions & 1 deletion bookwyrm/tests/views/test_reading.py
Original file line number Diff line number Diff line change
Expand Up @@ -70,7 +70,9 @@ def test_start_reading(self, *_):
},
)
request.user = self.local_user
with patch("bookwyrm.models.activitypub_mixin.broadcast_task.apply_async"):
with patch(
"bookwyrm.models.activitypub_mixin.broadcast_task.apply_async"
) as mock:
views.ReadingStatus.as_view()(request, "start", self.book.id)

self.assertEqual(shelf.books.get(), self.book)
Expand All @@ -86,6 +88,12 @@ def test_start_reading(self, *_):
self.assertEqual(readthrough.user, self.local_user)
self.assertEqual(readthrough.book, self.book)

# Three broadcast tasks:
# 1. Create Readthrough
# 2. Create post as pure_content (for non-BookWyrm)
# 3. Create post with book attached - this should only happen once!
self.assertEqual(len(mock.mock_calls), 3)

def test_start_reading_with_comment(self, *_):
"""begin a book"""
shelf = self.local_user.shelf_set.get(identifier=models.Shelf.READING)
Expand Down
3 changes: 1 addition & 2 deletions bookwyrm/views/helpers.py
Original file line number Diff line number Diff line change
Expand Up @@ -156,8 +156,7 @@ def handle_reading_status(user, shelf, book, privacy):
# it's a non-standard shelf, don't worry about it
return

status = create_generated_note(user, message, mention_books=[book], privacy=privacy)
status.save()
create_generated_note(user, message, mention_books=[book], privacy=privacy)


def load_date_in_user_tz_as_utc(date_str: str, user: models.User) -> datetime:
Expand Down

0 comments on commit 32b041d

Please sign in to comment.