diff --git a/bookwyrm/activitypub/book.py b/bookwyrm/activitypub/book.py index a53222053c..861eae9650 100644 --- a/bookwyrm/activitypub/book.py +++ b/bookwyrm/activitypub/book.py @@ -22,6 +22,7 @@ class BookData(ActivityObject): aasin: Optional[str] = None isfdb: Optional[str] = None lastEditedBy: Optional[str] = None + mergedInto: Optional[str] = None # pylint: disable=invalid-name diff --git a/bookwyrm/management/commands/deduplicate_book_data.py b/bookwyrm/management/commands/deduplicate_book_data.py index dde7d133c5..c4752bcf0b 100644 --- a/bookwyrm/management/commands/deduplicate_book_data.py +++ b/bookwyrm/management/commands/deduplicate_book_data.py @@ -1,9 +1,9 @@ """ PROCEED WITH CAUTION: uses deduplication fields to permanently merge book data objects """ + from django.core.management.base import BaseCommand from django.db.models import Count from bookwyrm import models -from bookwyrm.management.merge import merge_objects def dedupe_model(model): @@ -16,7 +16,8 @@ def dedupe_model(model): dupes = ( model.objects.values(field.name) .annotate(Count(field.name)) - .filter(**{"%s__count__gt" % field.name: 1}) + .filter(merged_into__isnull=True, **{"%s__count__gt" % field.name: 1}) + .exclude(**{field.name: "", "%s__isnull" % field.name: True}) ) for dupe in dupes: @@ -30,13 +31,14 @@ def dedupe_model(model): print("keeping", canonical.remote_id) for obj in objs[1:]: print(obj.remote_id) - merge_objects(canonical, obj) + obj.merge_into(canonical) class Command(BaseCommand): """deduplicate allllll the book data models""" help = "merges duplicate book data" + # pylint: disable=no-self-use,unused-argument def handle(self, *args, **options): """run deduplications""" diff --git a/bookwyrm/management/merge.py b/bookwyrm/management/merge.py deleted file mode 100644 index f55229f18d..0000000000 --- a/bookwyrm/management/merge.py +++ /dev/null @@ -1,50 +0,0 @@ -from django.db.models import ManyToManyField - - -def update_related(canonical, obj): - """update all the models with fk to the object being removed""" - # move related models to canonical - related_models = [ - (r.remote_field.name, r.related_model) for r in canonical._meta.related_objects - ] - for (related_field, related_model) in related_models: - # Skip the ManyToMany fields that aren’t auto-created. These - # should have a corresponding OneToMany field in the model for - # the linking table anyway. If we update it through that model - # instead then we won’t lose the extra fields in the linking - # table. - related_field_obj = related_model._meta.get_field(related_field) - if isinstance(related_field_obj, ManyToManyField): - through = related_field_obj.remote_field.through - if not through._meta.auto_created: - continue - related_objs = related_model.objects.filter(**{related_field: obj}) - for related_obj in related_objs: - print("replacing in", related_model.__name__, related_field, related_obj.id) - try: - setattr(related_obj, related_field, canonical) - related_obj.save() - except TypeError: - getattr(related_obj, related_field).add(canonical) - getattr(related_obj, related_field).remove(obj) - - -def copy_data(canonical, obj): - """try to get the most data possible""" - for data_field in obj._meta.get_fields(): - if not hasattr(data_field, "activitypub_field"): - continue - data_value = getattr(obj, data_field.name) - if not data_value: - continue - if not getattr(canonical, data_field.name): - print("setting data field", data_field.name, data_value) - setattr(canonical, data_field.name, data_value) - canonical.save() - - -def merge_objects(canonical, obj): - copy_data(canonical, obj) - update_related(canonical, obj) - # remove the outdated entry - obj.delete() diff --git a/bookwyrm/management/merge_command.py b/bookwyrm/management/merge_command.py index 805dc73fa4..2f3f90c863 100644 --- a/bookwyrm/management/merge_command.py +++ b/bookwyrm/management/merge_command.py @@ -1,4 +1,3 @@ -from bookwyrm.management.merge import merge_objects from django.core.management.base import BaseCommand @@ -26,4 +25,4 @@ def handle(self, *args, **options): print("other book doesn’t exist!") return - merge_objects(canonical, other) + other.merge_into(canonical) diff --git a/bookwyrm/migrations/0196_merged_into.py b/bookwyrm/migrations/0196_merged_into.py new file mode 100644 index 0000000000..ddf59a77dc --- /dev/null +++ b/bookwyrm/migrations/0196_merged_into.py @@ -0,0 +1,33 @@ +# Generated by Django 3.2.24 on 2024-02-22 09:05 + +import bookwyrm.models.fields +from django.db import migrations +import django.db.models.deletion + + +class Migration(migrations.Migration): + + dependencies = [ + ("bookwyrm", "0195_alter_user_preferred_language"), + ] + + operations = [ + migrations.AddField( + model_name="author", + name="merged_into", + field=bookwyrm.models.fields.ForeignKey( + null=True, + on_delete=django.db.models.deletion.PROTECT, + to="bookwyrm.author", + ), + ), + migrations.AddField( + model_name="book", + name="merged_into", + field=bookwyrm.models.fields.ForeignKey( + null=True, + on_delete=django.db.models.deletion.PROTECT, + to="bookwyrm.book", + ), + ), + ] diff --git a/bookwyrm/models/book.py b/bookwyrm/models/book.py index 6893b9da15..4d207f5aa0 100644 --- a/bookwyrm/models/book.py +++ b/bookwyrm/models/book.py @@ -1,13 +1,15 @@ """ database schema for books and shelves """ + from itertools import chain import re from typing import Any +from typing_extensions import Self from django.contrib.postgres.search import SearchVectorField from django.contrib.postgres.indexes import GinIndex from django.core.cache import cache from django.db import models, transaction -from django.db.models import Prefetch +from django.db.models import Prefetch, ManyToManyField from django.dispatch import receiver from django.utils.translation import gettext_lazy as _ from model_utils import FieldTracker @@ -71,6 +73,11 @@ class BookDataModel(ObjectMixin, BookWyrmModel): on_delete=models.PROTECT, null=True, ) + merged_into = fields.ForeignKey( + "self", + on_delete=models.PROTECT, + null=True, + ) @property def openlibrary_link(self): @@ -106,6 +113,58 @@ def broadcast(self, activity, sender, software="bookwyrm", **kwargs): """only send book data updates to other bookwyrm instances""" super().broadcast(activity, sender, software=software, **kwargs) + def merge_into(self, canonical: Self) -> None: + """merge this entity into another entity""" + if canonical.id == self.id: + raise ValueError(f"Cannot merge {self} into itself") + if canonical.merged_into: + raise ValueError( + f"Cannot merge {self} into {canonical} because " + f"{canonical} has itself been merged into {canonical.merged_into}" + ) + canonical.adopt_data_from(self) + canonical.save() + + # move related models to canonical + related_models = [ + (r.remote_field.name, r.related_model) for r in self._meta.related_objects + ] + # pylint: disable=protected-access + for related_field, related_model in related_models: + # Skip the ManyToMany fields that aren’t auto-created. These + # should have a corresponding OneToMany field in the model for + # the linking table anyway. If we update it through that model + # instead then we won’t lose the extra fields in the linking + # table. + # pylint: disable=protected-access + related_field_obj = related_model._meta.get_field(related_field) + if isinstance(related_field_obj, ManyToManyField): + through = related_field_obj.remote_field.through + if not through._meta.auto_created: + continue + related_objs = related_model.objects.filter(**{related_field: self}) + for related_obj in related_objs: + try: + setattr(related_obj, related_field, canonical) + related_obj.save() + except TypeError: + getattr(related_obj, related_field).add(canonical) + getattr(related_obj, related_field).remove(self) + + self.merged_into = canonical + self.save() + + def adopt_data_from(self, other: Self) -> None: + """fill empty fields with values from another entity""" + for data_field in self._meta.get_fields(): + if not hasattr(data_field, "activitypub_field"): + continue + data_value = getattr(other, data_field.name) + if not data_value: + continue + if not getattr(self, data_field.name): + setattr(self, data_field.name, data_value) + class Book(BookDataModel): """a generic book, which can mean either an edition or a work""" @@ -190,9 +249,13 @@ def edition_info(self): """properties of this edition, as a string""" items = [ self.physical_format if hasattr(self, "physical_format") else None, - f"{self.languages[0]} language" - if self.languages and self.languages[0] and self.languages[0] != "English" - else None, + ( + f"{self.languages[0]} language" + if self.languages + and self.languages[0] + and self.languages[0] != "English" + else None + ), str(self.published_date.year) if self.published_date else None, ", ".join(self.publishers) if hasattr(self, "publishers") else None, ] @@ -252,6 +315,11 @@ def save(self, *args, **kwargs): edition.save() return super().save(*args, **kwargs) + @property + def editions(self): + """exclude editions that have been merged""" + return self.editions.filter(merged_into__isnull=True) + @property def default_edition(self): """in case the default edition is not set"""