Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Track which Author/Work/Edition a duplicate has been merged into (mark) #3293

Closed
wants to merge 2 commits into from
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions bookwyrm/activitypub/book.py
Original file line number Diff line number Diff line change
Expand Up @@ -22,6 +22,7 @@ class BookData(ActivityObject):
aasin: Optional[str] = None
isfdb: Optional[str] = None
lastEditedBy: Optional[str] = None
mergedInto: Optional[str] = None


# pylint: disable=invalid-name
Expand Down
8 changes: 5 additions & 3 deletions bookwyrm/management/commands/deduplicate_book_data.py
Original file line number Diff line number Diff line change
@@ -1,9 +1,9 @@
""" PROCEED WITH CAUTION: uses deduplication fields to permanently
merge book data objects """

from django.core.management.base import BaseCommand
from django.db.models import Count
from bookwyrm import models
from bookwyrm.management.merge import merge_objects


def dedupe_model(model):
Expand All @@ -16,7 +16,8 @@ def dedupe_model(model):
dupes = (
model.objects.values(field.name)
.annotate(Count(field.name))
.filter(**{"%s__count__gt" % field.name: 1})
.filter(merged_into__isnull=True, **{"%s__count__gt" % field.name: 1})
.exclude(**{field.name: "", "%s__isnull" % field.name: True})
)

for dupe in dupes:
Expand All @@ -30,13 +31,14 @@ def dedupe_model(model):
print("keeping", canonical.remote_id)
for obj in objs[1:]:
print(obj.remote_id)
merge_objects(canonical, obj)
obj.merge_into(canonical)


class Command(BaseCommand):
"""deduplicate allllll the book data models"""

help = "merges duplicate book data"

# pylint: disable=no-self-use,unused-argument
def handle(self, *args, **options):
"""run deduplications"""
Expand Down
50 changes: 0 additions & 50 deletions bookwyrm/management/merge.py

This file was deleted.

3 changes: 1 addition & 2 deletions bookwyrm/management/merge_command.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,3 @@
from bookwyrm.management.merge import merge_objects
from django.core.management.base import BaseCommand


Expand Down Expand Up @@ -26,4 +25,4 @@ def handle(self, *args, **options):
print("other book doesn’t exist!")
return

merge_objects(canonical, other)
other.merge_into(canonical)
33 changes: 33 additions & 0 deletions bookwyrm/migrations/0196_merged_into.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,33 @@
# Generated by Django 3.2.24 on 2024-02-22 09:05

import bookwyrm.models.fields
from django.db import migrations
import django.db.models.deletion


class Migration(migrations.Migration):

dependencies = [
("bookwyrm", "0195_alter_user_preferred_language"),
]

operations = [
migrations.AddField(
model_name="author",
name="merged_into",
field=bookwyrm.models.fields.ForeignKey(
null=True,
on_delete=django.db.models.deletion.PROTECT,
to="bookwyrm.author",
),
),
migrations.AddField(
model_name="book",
name="merged_into",
field=bookwyrm.models.fields.ForeignKey(
null=True,
on_delete=django.db.models.deletion.PROTECT,
to="bookwyrm.book",
),
),
]
76 changes: 72 additions & 4 deletions bookwyrm/models/book.py
Original file line number Diff line number Diff line change
@@ -1,13 +1,15 @@
""" database schema for books and shelves """

from itertools import chain
import re
from typing import Any
from typing_extensions import Self

from django.contrib.postgres.search import SearchVectorField
from django.contrib.postgres.indexes import GinIndex
from django.core.cache import cache
from django.db import models, transaction
from django.db.models import Prefetch
from django.db.models import Prefetch, ManyToManyField
from django.dispatch import receiver
from django.utils.translation import gettext_lazy as _
from model_utils import FieldTracker
Expand Down Expand Up @@ -71,6 +73,11 @@ class BookDataModel(ObjectMixin, BookWyrmModel):
on_delete=models.PROTECT,
null=True,
)
merged_into = fields.ForeignKey(
"self",
on_delete=models.PROTECT,
null=True,
)

@property
def openlibrary_link(self):
Expand Down Expand Up @@ -106,6 +113,58 @@ def broadcast(self, activity, sender, software="bookwyrm", **kwargs):
"""only send book data updates to other bookwyrm instances"""
super().broadcast(activity, sender, software=software, **kwargs)

def merge_into(self, canonical: Self) -> None:
"""merge this entity into another entity"""
if canonical.id == self.id:
raise ValueError(f"Cannot merge {self} into itself")
if canonical.merged_into:
raise ValueError(
f"Cannot merge {self} into {canonical} because "
f"{canonical} has itself been merged into {canonical.merged_into}"
)
canonical.adopt_data_from(self)
canonical.save()

# move related models to canonical
related_models = [
(r.remote_field.name, r.related_model) for r in self._meta.related_objects
]
# pylint: disable=protected-access
for related_field, related_model in related_models:
# Skip the ManyToMany fields that aren’t auto-created. These
# should have a corresponding OneToMany field in the model for
# the linking table anyway. If we update it through that model
# instead then we won’t lose the extra fields in the linking
# table.
# pylint: disable=protected-access
related_field_obj = related_model._meta.get_field(related_field)
if isinstance(related_field_obj, ManyToManyField):
through = related_field_obj.remote_field.through
if not through._meta.auto_created:
continue
related_objs = related_model.objects.filter(**{related_field: self})
for related_obj in related_objs:
try:
setattr(related_obj, related_field, canonical)
related_obj.save()
except TypeError:
getattr(related_obj, related_field).add(canonical)
getattr(related_obj, related_field).remove(self)

self.merged_into = canonical
self.save()

def adopt_data_from(self, other: Self) -> None:
"""fill empty fields with values from another entity"""
for data_field in self._meta.get_fields():
if not hasattr(data_field, "activitypub_field"):
continue
data_value = getattr(other, data_field.name)
if not data_value:
continue
if not getattr(self, data_field.name):
setattr(self, data_field.name, data_value)


class Book(BookDataModel):
"""a generic book, which can mean either an edition or a work"""
Expand Down Expand Up @@ -190,9 +249,13 @@ def edition_info(self):
"""properties of this edition, as a string"""
items = [
self.physical_format if hasattr(self, "physical_format") else None,
f"{self.languages[0]} language"
if self.languages and self.languages[0] and self.languages[0] != "English"
else None,
(
f"{self.languages[0]} language"
if self.languages
and self.languages[0]
and self.languages[0] != "English"
else None
),
str(self.published_date.year) if self.published_date else None,
", ".join(self.publishers) if hasattr(self, "publishers") else None,
]
Expand Down Expand Up @@ -252,6 +315,11 @@ def save(self, *args, **kwargs):
edition.save()
return super().save(*args, **kwargs)

@property
def editions(self):
"""exclude editions that have been merged"""
return self.editions.filter(merged_into__isnull=True)

@property
def default_edition(self):
"""in case the default edition is not set"""
Expand Down
Loading