Skip to content

Commit

Permalink
Efficiency improvements for people-people list (#1595)
Browse files Browse the repository at this point in the history
  • Loading branch information
blms committed Sep 10, 2024
1 parent fdbb8ef commit 4c41bd0
Show file tree
Hide file tree
Showing 4 changed files with 148 additions and 47 deletions.
170 changes: 135 additions & 35 deletions geniza/entities/models.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,7 @@
import re
from datetime import datetime
from math import modf
from operator import itemgetter

from django.conf import settings
from django.contrib.admin.models import CHANGE, LogEntry
Expand All @@ -10,7 +11,7 @@
from django.contrib.contenttypes.models import ContentType
from django.core.validators import RegexValidator
from django.db import models
from django.db.models import Count, F, Q
from django.db.models import F, Q, Value
from django.db.models.query import Prefetch
from django.forms import ValidationError
from django.urls import reverse
Expand Down Expand Up @@ -479,47 +480,146 @@ def related_people(self):
any notes on the relationship, taking into account converse relations"""

# gather all relationships with people, both entered from this person and
# entered from the person on the other side of the relationship; annotate
# with count of documents shared between the two people in each relationship
to_people = self.to_person.all().annotate(
shared_documents=Count(
"from_person__documents__pk",
filter=Q(to_person__documents__pk=F("from_person__documents__pk")),
)
)
from_people = self.from_person.all().annotate(
shared_documents=Count(
"to_person__documents__pk",
filter=Q(from_person__documents__pk=F("to_person__documents__pk")),
# entered from the person on the other side of the relationship
people_relations = (
self.from_person.annotate(
# boolean to indicate if we should use converse or regular relation type name
use_converse_typename=Value(True),
has_page=F("from_person__has_page"),
related_slug=F("from_person__slug"),
related_id=F("from_person"),
).union( # union instead of joins for efficiency
self.to_person.annotate(
use_converse_typename=Value(False),
has_page=F("to_person__has_page"),
related_slug=F("to_person__slug"),
related_id=F("to_person"),
)
)
# have to use values_list (NOT values) here with one argument, otherwise strange things
# happen, possibly due to union(). TODO: see if this is fixed in later django versions.
.values_list("related_id")
)

# standardize into objects with the same attribute names for sorting and display
related = [
relation_list = [
{
"person": rel.to_person,
"type": rel.type.name,
"notes": rel.notes,
"shared_documents": rel.shared_documents,
# this is the order of fields returned by SQL after the annotated union
"id": r[-1],
"slug": r[-2],
"has_page": r[-3],
"use_converse_typename": r[-4],
"notes": r[-5],
"type_id": r[-6],
}
for rel in to_people
for r in people_relations
]
# keep track of people already entered to prevent duplicates
used_pks = [p["person"].pk for p in related]
related += [
{
"person": rel.from_person,
# since these are entered from other people, use converse relationship type name
# if one exists
"type": rel.type.converse_name or rel.type.name,
"notes": rel.notes,
"shared_documents": rel.shared_documents,

# folow GenericForeignKey to find primary name for each related person
person_contenttype = ContentType.objects.get_for_model(Person).pk
names = Name.objects.filter(
object_id__in=[r["id"] for r in relation_list],
primary=True,
content_type_id=person_contenttype,
).values("name", "object_id")
# dict keyed on related person id
names_dict = {n["object_id"]: n["name"] for n in names}

# grab name and converse_name for each relation type since we may need either
# (name if the relation was entered from self, converse if entered from related person)
types = PersonPersonRelationType.objects.filter(
pk__in=[r["type_id"] for r in relation_list],
).values("pk", "name", "converse_name")
# dict keyed on related person id
types_dict = {t["pk"]: t for t in types}

# store each related person's documents to see if we can display their url
related_person_docs = PersonDocumentRelation.objects.filter(
person__id__in=[r["id"] for r in relation_list]
).values("document__id", "person__id")

# efficiently get shared document counts between people by filtering doc relations
self_docs = PersonDocumentRelation.objects.filter(
person__id=self.pk
).values_list("document__id", flat=True)
shared_docs = list(
related_person_docs.filter(document__id__in=list(self_docs)).values(
"document__id", "person__id"
)
)
# dict keyed on related person id
docs_dict = {
r["person__id"]: {
# number of shared person-doc relations matching this person's id
"shared": len(
list(
filter(
lambda shared: shared["person__id"] == r["person__id"],
shared_docs,
)
)
),
# number of total person-doc relations matching this person's id
"total": len(
list(
filter(
lambda total: total["person__id"] == r["person__id"],
related_person_docs,
)
)
),
}
for rel in from_people
if rel.from_person.pk not in used_pks
]
# only need to calculate these for people who have related documents
for r in related_person_docs
}

# update with new data & dedupe
prev_relation = None
# sort by id (dedupe by matching against previous id), then type id for type dedupe
for relation in sorted(relation_list, key=itemgetter("id", "type_id")):
relation.update(
{
# get name from cached queryset dict
"name": names_dict[relation["id"]],
# use type.converse_name if this relation is reverse (and if the type has one)
"type": types_dict[relation["type_id"]][
"converse_name" if relation["use_converse_typename"] else "name"
]
# fallback to type.name if converse_name doesn't exist
or types_dict[relation["type_id"]]["name"],
# get count of shared documents from cached queryset dict
"shared_documents": (
docs_dict[relation["id"]]["shared"]
if relation["id"] in docs_dict
else 0
),
# determine if this person can be linked (can if has_page is true or total docs
# >= Person.MIN_DOCUMENTS constant)
"can_link": (
True
if relation["has_page"]
or (
docs_dict[relation["id"]]["total"]
if relation["id"] in docs_dict
else 0
)
>= Person.MIN_DOCUMENTS
else False
),
}
)
# dedupe and combine type and notes
if prev_relation and prev_relation["id"] == relation["id"]:
# dedupe type by string matching since we can't match reverse relations by id
if relation["type"].lower() not in prev_relation["type"].lower():
prev_relation["type"] += f", {relation['type']}".lower()

Check warning on line 613 in geniza/entities/models.py

View check run for this annotation

Codecov / codecov/patch

geniza/entities/models.py#L612-L613

Added lines #L612 - L613 were not covered by tests
# simply combine notes with html line break
prev_relation["notes"] += (

Check warning on line 615 in geniza/entities/models.py

View check run for this annotation

Codecov / codecov/patch

geniza/entities/models.py#L615

Added line #L615 was not covered by tests
f"<br />{relation['notes']}" if relation["notes"] else ""
)
relation_list.remove(relation)

Check warning on line 618 in geniza/entities/models.py

View check run for this annotation

Codecov / codecov/patch

geniza/entities/models.py#L618

Added line #L618 was not covered by tests
else:
prev_relation = relation

return related
return relation_list

def merge_with(self, merge_people, user=None):
"""Merge the specified people into this one. Combines all metadata
Expand Down
11 changes: 6 additions & 5 deletions geniza/entities/templates/entities/person_related_people.html
Original file line number Diff line number Diff line change
Expand Up @@ -57,20 +57,21 @@
{% for rel in related_people %}
<tr>
<td>
{% if rel.person.get_absolute_url %}
<a href="{{ rel.person.permalink }}" title="{{ rel.person }}">
{{ rel.person }}
{% if rel.can_link %}
{% url "entities:person" rel.slug as person_url %}
<a href="{{ person_url }}" title="{{ rel.name }}">
{{ rel.name }}
</a>
{% else %}
{{ rel.person }}
{{ rel.name }}
{% endif %}
</td>
<td>{{ rel.type }}</td>
<td>
{{ rel.shared_documents }}
</td>
<td class="person-notes">
{{ rel.notes }}
{{ rel.notes|safe }}
</td>
</tr>
{% endfor %}
Expand Down
12 changes: 6 additions & 6 deletions geniza/entities/tests/test_entities_views.py
Original file line number Diff line number Diff line change
Expand Up @@ -686,31 +686,31 @@ def test_get_related(
response = client.get(reverse("entities:person-people", args=(person.slug,)))
assert any(
[
r["person"] == person_diacritic and r["type"] == "parent"
r["id"] == person_diacritic.pk and r["type"].lower() == "parent"
for r in response.context["related_people"]
]
)
assert any(
[
r["person"] == person_multiname and r["type"] == "grandchild"
r["id"] == person_multiname.pk and r["type"].lower() == "grandchild"
for r in response.context["related_people"]
]
)
# should sort by name, asc by default
assert response.context["related_people"][0]["person"] == person_diacritic
assert response.context["related_people"][0]["id"] == person_diacritic.pk
# can also sort by name, desc
response = client.get(
reverse("entities:person-people", args=(person.slug,)),
{"sort": "name_desc"},
)
assert response.context["related_people"][0]["person"] == person_multiname
assert response.context["related_people"][0]["id"] == person_multiname.pk

# sort by relation type
response = client.get(
reverse("entities:person-people", args=(person.slug,)),
{"sort": "relation_asc"},
)
assert response.context["related_people"][0]["type"] == "grandchild"
assert response.context["related_people"][0]["type"].lower() == "grandchild"

# add shared documents
PersonDocumentRelation.objects.create(person=person, document=document)
Expand All @@ -721,7 +721,7 @@ def test_get_related(
reverse("entities:person-people", args=(person.slug,)),
{"sort": "documents_desc"},
)
assert response.context["related_people"][0]["person"] == person_multiname
assert response.context["related_people"][0]["id"] == person_multiname.pk
assert response.context["related_people"][0]["shared_documents"] == 1
assert response.context["related_people"][1]["shared_documents"] == 0

Expand Down
2 changes: 1 addition & 1 deletion geniza/entities/views.py
Original file line number Diff line number Diff line change
Expand Up @@ -384,7 +384,7 @@ def get_related(self):
if "name" in sort:
# sort by slug (stand-in for name, but diacritic insensitive)
related_people = sorted(
related_people, key=lambda p: p["person"].slug, reverse=reverse
related_people, key=lambda p: p["slug"], reverse=reverse
)

if "relation" in sort:
Expand Down

0 comments on commit 4c41bd0

Please sign in to comment.