From 32b041d10c6caa982e99f7cd722ce8fadf960a60 Mon Sep 17 00:00:00 2001 From: Hugh Rundle Date: Sun, 17 Nov 2024 09:24:49 +1100 Subject: [PATCH] Fix imports creating duplicate Editions This fixes bugs at both the sending and receiving instances: As the primary fix is at the sending end, a full fix will need to be collaborative at the network level. **Fixing at the sending end: don't send duplicate GeneratedNotes** The primary fix is to `handle_reading_status`: it calls `create_generated_note` which saves and broadcasts the GeneratedNote. Before this fix, `handle_reading_status` then saved and broadcast the note again, creating two incoming GeneratedNotes for all federated BookWyrm servers, and thus triggering simultaneous or near-simultaneous imports of the associated book within the `tag` reference. This is then dereferenced, creating two identical Works, the first of which ends up empty and the second of which ends up with two identical Editions. **Fixing at the sending end: delay BookStatus broadcasts** My testing doesn't indicate one way or the other whether this happens, but there's a theoretical race condition for incoming editions that are ListItem or ShelfBook items. The Add activity needs to create the Edition and Work if they are not already on the receiving instance, and so does the Comment or GeneratedNote. The fix in `ActivitypubMixin::broadcast` sets a delay on these statuses to ensure that the Add activity is processed first, and thus the book will already exist when the status is processed. **Fixing at the receiving end: prevent dereferencing the incoming Edition** The book attached to a ListItem or ShelfBook is an Edition. This means that when a receiving instance dereferences it, we end up with a recursion: The Edition dereferences the parent Work, which dereferences all the child editions in celery tasks. This includes the Edition that triggered the whole process in the first place. I couldn't create a reproducable test of this, but I've seen evidence that this can cause a race condition where the incoming Edition ends up being duplicated - the task to create child Editions checks for existence before the original triggering Edition has saved. The changes to Note, Fields, and `ActivityObject::to_model` are aimed at preventing this by allowing us to compare related field ids to the original triggering remote_id before running `set_related_field` and excluding them if there is a match. --- bookwyrm/activitypub/base_activity.py | 6 ++++++ bookwyrm/activitypub/note.py | 1 + bookwyrm/models/activitypub_mixin.py | 14 ++++++++++++++ bookwyrm/models/fields.py | 24 ++++++++++++++---------- bookwyrm/tests/views/test_reading.py | 10 +++++++++- bookwyrm/views/helpers.py | 3 +-- 6 files changed, 45 insertions(+), 13 deletions(-) diff --git a/bookwyrm/activitypub/base_activity.py b/bookwyrm/activitypub/base_activity.py index dc4b8f6ae1..55d87eec3f 100644 --- a/bookwyrm/activitypub/base_activity.py +++ b/bookwyrm/activitypub/base_activity.py @@ -120,6 +120,7 @@ def to_model( save: bool = True, overwrite: bool = True, allow_external_connections: bool = True, + trigger=None, ) -> Optional[TBookWyrmModel]: """convert from an activity to a model instance. Args: model: the django model that this object is being converted to @@ -133,6 +134,9 @@ def to_model( only update blank fields if false allow_external_connections: look up missing data if true, throw an exception if false and an external connection is needed + trigger: the object that originally triggered this + self.to_model. e.g. if this is a Work being dereferenced from + an incoming Edition """ model = model or get_model_from_type(self.type) @@ -223,6 +227,8 @@ def to_model( related_field_name = model_field.field.name for item in values: + if trigger and item == trigger.remote_id: + continue set_related_field.delay( related_model.__name__, instance.__class__.__name__, diff --git a/bookwyrm/activitypub/note.py b/bookwyrm/activitypub/note.py index 6a081058cb..376560f6e8 100644 --- a/bookwyrm/activitypub/note.py +++ b/bookwyrm/activitypub/note.py @@ -50,6 +50,7 @@ def to_model( save=True, overwrite=True, allow_external_connections=True, + trigger=None, ): instance = super().to_model( model, instance, allow_create, save, overwrite, allow_external_connections diff --git a/bookwyrm/models/activitypub_mixin.py b/bookwyrm/models/activitypub_mixin.py index 54ad115119..71f7b0ed90 100644 --- a/bookwyrm/models/activitypub_mixin.py +++ b/bookwyrm/models/activitypub_mixin.py @@ -129,7 +129,20 @@ def find_existing(cls, data): def broadcast(self, activity, sender, software=None, queue=BROADCAST): """send out an activity""" + + # if we're posting about ShelfBooks, set a delay to give the base activity + # time to add the book on remote servers first to avoid race conditions + countdown = ( + 10 + if ( + isinstance(activity, object) + and not isinstance(activity["object"], str) + and activity["object"].get("type", None) in ["GeneratedNote", "Comment"] + ) + else 0 + ) broadcast_task.apply_async( + countdown=countdown, args=( sender.id, json.dumps(activity, cls=activitypub.ActivityEncoder), @@ -227,6 +240,7 @@ def save( return try: + # TODO: here is where we might use an ActivityPub extension instead # do we have a "pure" activitypub version of this for mastodon? if software != "bookwyrm" and hasattr(self, "pure_content"): pure_activity = self.to_create_activity(user, pure=True) diff --git a/bookwyrm/models/fields.py b/bookwyrm/models/fields.py index 75d3acf76d..5e493c51d0 100644 --- a/bookwyrm/models/fields.py +++ b/bookwyrm/models/fields.py @@ -86,7 +86,9 @@ def set_field_from_activity( raise value = getattr(data, "actor") formatted = self.field_from_activity( - value, allow_external_connections=allow_external_connections + value, + allow_external_connections=allow_external_connections, + trigger=instance, ) if formatted is None or formatted is MISSING or formatted == {}: return False @@ -128,7 +130,7 @@ def field_to_activity(self, value): return value # pylint: disable=unused-argument - def field_from_activity(self, value, allow_external_connections=True): + def field_from_activity(self, value, allow_external_connections=True, trigger=None): """formatter to convert activitypub into a model value""" if value and hasattr(self, "activitypub_wrapper"): value = value.get(self.activitypub_wrapper) @@ -150,7 +152,9 @@ def __init__(self, *args, load_remote=True, **kwargs): self.load_remote = load_remote super().__init__(*args, **kwargs) - def field_from_activity(self, value, allow_external_connections=True): + def field_from_activity(self, value, allow_external_connections=True, trigger=None): + """trigger: the object that triggered this deserialization. + For example the Edition for which self is the parent Work""" if not value: return None @@ -160,7 +164,7 @@ def field_from_activity(self, value, allow_external_connections=True): # only look in the local database return related_model.find_existing(value.serialize()) # this is an activitypub object, which we can deserialize - return value.to_model(model=related_model) + return value.to_model(model=related_model, trigger=trigger) try: # make sure the value looks like a remote id validate_remote_id(value) @@ -336,7 +340,7 @@ def field_to_activity(self, value): return f"{value.instance.remote_id}/{self.name}" return [i.remote_id for i in value.all()] - def field_from_activity(self, value, allow_external_connections=True): + def field_from_activity(self, value, allow_external_connections=True, trigger=None): if value is None or value is MISSING: return None if not isinstance(value, list): @@ -386,7 +390,7 @@ def field_to_activity(self, value): ) return tags - def field_from_activity(self, value, allow_external_connections=True): + def field_from_activity(self, value, allow_external_connections=True, trigger=None): if not isinstance(value, list): # GoToSocial DMs and single-user mentions are # sent as objects, not as an array of objects @@ -481,7 +485,7 @@ def field_to_activity(self, value, alt=None): return activitypub.Image(url=url, name=alt) - def field_from_activity(self, value, allow_external_connections=True): + def field_from_activity(self, value, allow_external_connections=True, trigger=None): image_slug = value # when it's an inline image (User avatar/icon, Book cover), it's a json # blob, but when it's an attached image, it's just a url @@ -538,7 +542,7 @@ def field_to_activity(self, value): return None return value.isoformat() - def field_from_activity(self, value, allow_external_connections=True): + def field_from_activity(self, value, allow_external_connections=True, trigger=None): missing_fields = datetime(1970, 1, 1) # "2022-10" => "2022-10-01" try: date_value = dateutil.parser.parse(value, default=missing_fields) @@ -556,7 +560,7 @@ class PartialDateField(ActivitypubFieldMixin, PartialDateModel): def field_to_activity(self, value) -> str: return value.partial_isoformat() if value else None - def field_from_activity(self, value, allow_external_connections=True): + def field_from_activity(self, value, allow_external_connections=True, trigger=None): # pylint: disable=no-else-return try: return from_partial_isoformat(value) @@ -584,7 +588,7 @@ def field_from_activity(self, value, allow_external_connections=True): class HtmlField(ActivitypubFieldMixin, models.TextField): """a text field for storing html""" - def field_from_activity(self, value, allow_external_connections=True): + def field_from_activity(self, value, allow_external_connections=True, trigger=None): if not value or value == MISSING: return None return clean(value) diff --git a/bookwyrm/tests/views/test_reading.py b/bookwyrm/tests/views/test_reading.py index 139d91820f..d8172b7fb0 100644 --- a/bookwyrm/tests/views/test_reading.py +++ b/bookwyrm/tests/views/test_reading.py @@ -70,7 +70,9 @@ def test_start_reading(self, *_): }, ) request.user = self.local_user - with patch("bookwyrm.models.activitypub_mixin.broadcast_task.apply_async"): + with patch( + "bookwyrm.models.activitypub_mixin.broadcast_task.apply_async" + ) as mock: views.ReadingStatus.as_view()(request, "start", self.book.id) self.assertEqual(shelf.books.get(), self.book) @@ -86,6 +88,12 @@ def test_start_reading(self, *_): self.assertEqual(readthrough.user, self.local_user) self.assertEqual(readthrough.book, self.book) + # Three broadcast tasks: + # 1. Create Readthrough + # 2. Create post as pure_content (for non-BookWyrm) + # 3. Create post with book attached - this should only happen once! + self.assertEqual(len(mock.mock_calls), 3) + def test_start_reading_with_comment(self, *_): """begin a book""" shelf = self.local_user.shelf_set.get(identifier=models.Shelf.READING) diff --git a/bookwyrm/views/helpers.py b/bookwyrm/views/helpers.py index d89e195ca7..b3307ab424 100644 --- a/bookwyrm/views/helpers.py +++ b/bookwyrm/views/helpers.py @@ -156,8 +156,7 @@ def handle_reading_status(user, shelf, book, privacy): # it's a non-standard shelf, don't worry about it return - status = create_generated_note(user, message, mention_books=[book], privacy=privacy) - status.save() + create_generated_note(user, message, mention_books=[book], privacy=privacy) def load_date_in_user_tz_as_utc(date_str: str, user: models.User) -> datetime: