Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Add related objects CSV export for individual person (#1641) #1657

Open
wants to merge 2 commits into
base: develop
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
17 changes: 16 additions & 1 deletion geniza/entities/admin.py
Original file line number Diff line number Diff line change
Expand Up @@ -24,7 +24,11 @@
PlacePersonForm,
PlacePlaceForm,
)
from geniza.entities.metadata_export import AdminPersonExporter, AdminPlaceExporter
from geniza.entities.metadata_export import (
AdminPersonExporter,
AdminPlaceExporter,
PersonRelationsExporter,
)
from geniza.entities.models import (
DocumentPlaceRelation,
DocumentPlaceRelationType,
Expand Down Expand Up @@ -354,6 +358,12 @@ def export_to_csv(self, request, queryset=None):
exporter = AdminPersonExporter(queryset=queryset, progress=False)
return exporter.http_export_data_csv()

def export_relations_to_csv(self, request, pk):
    """Stream related objects data for a single person as a CSV file.

    :param request: the current admin request (not read by this method)
    :param pk: primary key of the person whose relations are exported
    :raises Http404: if no person with the given primary key exists
    :return: whatever the exporter's ``http_export_data_csv`` returns
    """
    # function-scope import so this method does not depend on the
    # module's import block
    from django.http import Http404

    queryset = Person.objects.filter(pk=pk)
    # The relations exporter works from the first person in the queryset;
    # with no match there is nothing meaningful to export, so answer with
    # a 404 instead of letting the exporter fail on a missing person.
    if not queryset.exists():
        raise Http404(f"No person found with id {pk}")
    exporter = PersonRelationsExporter(queryset=queryset, progress=False)
    return exporter.http_export_data_csv()

def get_urls(self):
"""Return admin urls; adds custom URLs for exporting as CSV, merging people"""
urls = [
Expand All @@ -362,6 +372,11 @@ def get_urls(self):
self.admin_site.admin_view(self.export_to_csv),
name="person-csv",
),
path(
"<int:pk>/relations-csv/",
self.admin_site.admin_view(self.export_relations_to_csv),
name="person-relations-csv",
),
path(
"merge/",
PersonMerge.as_view(),
Expand Down
322 changes: 321 additions & 1 deletion geniza/entities/metadata_export.py
Original file line number Diff line number Diff line change
@@ -1,15 +1,25 @@
from itertools import groupby
from operator import itemgetter
from time import sleep

from django.contrib.contenttypes.models import ContentType
from django.db.models import F, Value
from django.db.models.query import Prefetch
from django.utils import timezone
from django.utils.text import slugify

from geniza.common.metadata_export import Exporter
from geniza.corpus.dates import standard_date_display
from geniza.corpus.models import Document
from geniza.corpus.models import Document, DocumentEventRelation, TextBlock
from geniza.entities.models import (
DocumentPlaceRelation,
DocumentPlaceRelationType,
Event,
Name,
Person,
PersonDocumentRelation,
PersonDocumentRelationType,
PersonPersonRelationType,
PersonPlaceRelation,
PersonPlaceRelationType,
Place,
Expand Down Expand Up @@ -160,6 +170,316 @@ def get_export_data_dict(self, person):
return outd


class PersonRelationsExporter(Exporter):
    """
    A subclass of :class:`geniza.common.metadata_export.Exporter` that
    exports information relating to :class:`~geniza.entities.models.Person`,
    in particular, the related objects for a single model instance.
    Extends :meth:`get_queryset` and :meth:`get_export_data_dict`.

    Each exported row describes one related object (person, place, document
    or event) of the first person in the exporter's queryset.
    """

    model = Person
    csv_fields = [
        "related_object_type",
        "related_object_id",
        "related_object_name",
        "relationship_type",
        "shared_documents",
    ]

    def csv_filename(self):
        """Generate the appropriate CSV filename for model and time

        The filename includes the slugified string form of the first person
        in the base queryset and the current timestamp.

        :return: Filename string
        :rtype: str
        """
        str_time = timezone.now().strftime("%Y%m%dT%H%M%S")
        person = super().get_queryset().first()
        return f"geniza-{slugify(str(person))}-person-relations-{str_time}.csv"

    def get_queryset(self):
        """Override get_queryset to get related items for the single item

        :return: list of dicts, one per related object, populated by
            :meth:`populate_relation_fields`; empty if the base queryset
            contains no person
        :rtype: list
        """
        person = super().get_queryset().first()
        if person is None:
            # nothing to export; avoid dereferencing a missing person
            return []
        # union querysets to coalesce and normalize heterogenous data types
        relations = (
            person.from_person.values(
                related_object_id=F("from_person"),
                related_object_type=Value("Person"),
                relationship_type_id=F("type"),
                use_converse_typename=Value(True),
            )
            .union(
                person.to_person.values(
                    related_object_id=F("to_person"),
                    related_object_type=Value("Person"),
                    relationship_type_id=F("type"),
                    use_converse_typename=Value(False),
                )
            )
            .union(
                person.personplacerelation_set.values(
                    related_object_id=F("place"),
                    related_object_type=Value("Place"),
                    relationship_type_id=F("type"),
                    use_converse_typename=Value(False),
                )
            )
            .union(
                person.persondocumentrelation_set.values(
                    related_object_id=F("document"),
                    related_object_type=Value("Document"),
                    relationship_type_id=F("type"),
                    use_converse_typename=Value(False),
                )
            )
            .union(
                person.personeventrelation_set.values(
                    related_object_id=F("event"),
                    related_object_type=Value("Event"),
                    # use -1 as this must be int, but there is no relationship
                    # type for event relations
                    relationship_type_id=Value(-1),
                    use_converse_typename=Value(False),
                )
            )
        )
        # populate additional data
        return self.populate_relation_fields(list(relations))

    @staticmethod
    def _values_for_type(relations, object_type, field="related_object_id"):
        """Return ``field`` from every relation dict of the given object type"""
        return [r[field] for r in relations if r["related_object_type"] == object_type]

    @staticmethod
    def _document_ids_by(rows, key):
        """Group ``document__id`` values from values() rows by the ``key`` column"""
        # groupby only merges adjacent items, so sort on the same key first
        ordered = sorted(rows, key=itemgetter(key))
        return {
            k: [row["document__id"] for row in group]
            for k, group in groupby(ordered, key=itemgetter(key))
        }

    @staticmethod
    def _join_document_names(doc_ids, docs_dict):
        """Comma-join the display names of the given documents, skipping any
        id that has no entry in ``docs_dict``"""
        return ", ".join(docs_dict[d] for d in doc_ids if d in docs_dict)

    def populate_relation_fields(self, relations):
        """Helper method called by :meth:`get_queryset` that prefetches
        various fields on related objects, efficiently retrieving their
        data for export in bulk

        :param relations: list of dicts with the keys ``related_object_id``,
            ``related_object_type``, ``relationship_type_id`` and
            ``use_converse_typename``; the dicts are updated in place
        :return: deduplicated relation dicts (one per related object), sorted
            by object type and then slugified name
        :rtype: list
        """

        # the general rule here is: fetch all the related data with a values()
        # queryset, cast as dict (keyed on id) to compute each queryset ahead
        # of time, and access each field value from the dict by id in a loop.

        related_people = self._values_for_type(relations, "Person")
        related_places = self._values_for_type(relations, "Place")
        related_docs = self._values_for_type(relations, "Document")
        related_events = self._values_for_type(relations, "Event")

        # use single query to get names for people and places
        names = Name.objects.filter(
            object_id__in=[*related_people, *related_places],
            primary=True,
        ).values("object_id", "name", "content_type")
        # key on (content type, object id) since person and place ids can
        # collide; setdefault keeps the first name returned for each object
        names_dict = {}
        for n in names:
            names_dict.setdefault((n["content_type"], n["object_id"]), n["name"])
        pers_contenttype_id = ContentType.objects.get_for_model(Person).pk
        place_contenttype_id = ContentType.objects.get_for_model(Place).pk

        # for people, places, documents: use single query each to get relation type names
        person_relation_types = PersonPersonRelationType.objects.filter(
            id__in=self._values_for_type(relations, "Person", "relationship_type_id")
        ).values("id", "name", "converse_name")
        person_relation_typedict = {t["id"]: t for t in person_relation_types}
        place_relation_types = PersonPlaceRelationType.objects.filter(
            id__in=self._values_for_type(relations, "Place", "relationship_type_id")
        ).values("id", "name")
        place_relation_typedict = {t["id"]: t["name"] for t in place_relation_types}
        doc_relation_types = PersonDocumentRelationType.objects.filter(
            id__in=self._values_for_type(
                relations, "Document", "relationship_type_id"
            )
        ).values("id", "name")
        doc_relation_typedict = {t["id"]: t["name"] for t in doc_relation_types}

        # get shared documents with People, Places, and Events
        persondocs_dict = self._document_ids_by(
            PersonDocumentRelation.objects.filter(
                document__id__in=related_docs, person__id__in=related_people
            ).values("document__id", "person__id"),
            "person__id",
        )
        placedocs_dict = self._document_ids_by(
            DocumentPlaceRelation.objects.filter(
                document__id__in=related_docs, place__id__in=related_places
            ).values("document__id", "place__id"),
            "place__id",
        )
        eventdocs_dict = self._document_ids_by(
            DocumentEventRelation.objects.filter(
                document__id__in=related_docs, event__id__in=related_events
            ).values("document__id", "event__id"),
            "event__id",
        )

        # to get Document names, need TextBlocks and Fragments
        docs = Document.objects.prefetch_related(
            Prefetch(
                "textblock_set",
                queryset=TextBlock.objects.select_related(
                    "fragment",
                ).prefetch_related(
                    "fragment__textblock_set",
                    "fragment__textblock_set__document",
                ),
            )
        ).filter(id__in=related_docs)
        docs_dict = {d.id: str(d) for d in docs}

        # get Event names
        events = Event.objects.filter(id__in=related_events).values("id", "name")
        events_dict = {e["id"]: e["name"] for e in events}

        # loop through all relations, update with additional data, and dedupe
        deduped = []
        prev_relation = None
        # lowercased relationship type names already recorded on prev_relation
        seen_types = set()
        # use all precomputed query results to populate additional data per obj
        for rel in sorted(
            relations,
            key=itemgetter(
                # sort for deduplication
                "related_object_type",
                "related_object_id",
                "relationship_type_id",
            ),
        ):
            obj_type = rel["related_object_type"]
            obj_id = rel["related_object_id"]
            if obj_type == "Person":
                # get person name, relationship type from precomputed lookups;
                # fall back to an empty dict if the type was not found
                rel_type = (
                    person_relation_typedict.get(rel["relationship_type_id"]) or {}
                )
                # handle converse type names for self-referential relationships;
                # use name if should use name, or converse does not exist
                type_name = rel_type.get("name")
                if rel["use_converse_typename"] and rel_type.get("converse_name"):
                    type_name = rel_type.get("converse_name")
                rel.update(
                    {
                        "related_object_name": names_dict.get(
                            (pers_contenttype_id, obj_id)
                        ),
                        "relationship_type": type_name,
                        "shared_documents": self._join_document_names(
                            persondocs_dict.get(obj_id, []), docs_dict
                        ),
                    }
                )
            elif obj_type == "Place":
                # get place name, relationship type from precomputed lookups
                rel.update(
                    {
                        "related_object_name": names_dict.get(
                            (place_contenttype_id, obj_id)
                        ),
                        "relationship_type": place_relation_typedict.get(
                            rel["relationship_type_id"]
                        ),
                        "shared_documents": self._join_document_names(
                            placedocs_dict.get(obj_id, []), docs_dict
                        ),
                    }
                )
            elif obj_type == "Document":
                # get doc name, doc relation type name from precomputed lookups
                rel.update(
                    {
                        "related_object_name": docs_dict.get(obj_id),
                        "relationship_type": doc_relation_typedict.get(
                            rel["relationship_type_id"]
                        ),
                    }
                )
            elif obj_type == "Event":
                # get event name from precomputed lookup;
                # relationship type is not used for events
                rel.update(
                    {
                        "related_object_name": events_dict.get(obj_id),
                        "shared_documents": self._join_document_names(
                            eventdocs_dict.get(obj_id, []), docs_dict
                        ),
                    }
                )

            type_name = rel.get("relationship_type") or ""
            type_key = type_name.lower()

            # dedupe items and combine relationship types
            if (
                prev_relation is not None
                and prev_relation["related_object_type"] == obj_type
                and prev_relation["related_object_id"] == obj_id
            ):
                # dedupe type by (case-insensitive) name since we can't match
                # reverse relations by id; compare whole names so that e.g.
                # "father" is not swallowed by an existing "grandfather"
                if type_key and type_key not in seen_types:
                    existing = prev_relation.get("relationship_type") or ""
                    prev_relation["relationship_type"] = (
                        f"{existing}, {type_key}" if existing else type_name
                    )
                    seen_types.add(type_key)
                # duplicate of prev_relation: intentionally not kept
            else:
                deduped.append(rel)
                prev_relation = rel
                seen_types = {type_key} if type_key else set()

        return sorted(
            deduped,
            # sort by object type, then name (a missing name sorts first)
            key=lambda r: (
                r["related_object_type"],
                slugify(r.get("related_object_name") or ""),
            ),
        )

    def get_export_data_dict(self, obj):
        """
        For efficiency, the dict is populated in :meth:`get_queryset`,
        via :meth:`populate_relation_fields`, as that method allows us to
        retrieve values for multiple related objects of the same type in bulk.

        :param obj: one relation mapping as returned by :meth:`get_queryset`
        :return: a shallow copy of ``obj`` as a plain dict
        :rtype: dict
        """
        return dict(obj)


class PublicPlaceExporter(Exporter):
"""
A subclass of :class:`geniza.common.metadata_export.Exporter` that
Expand Down
Original file line number Diff line number Diff line change
@@ -1,5 +1,10 @@
{% extends "admin/change_form.html" %}

{% block object-tools-items %}
{# Add a link to the relations CSV export for the object being edited, placed before the inherited object tools #}
<li><a href="{% url 'admin:person-relations-csv' pk=original.id %}">Download relations as CSV</a></li>
{{ block.super }}
{% endblock %}

{# Render mixed normal and inline fieldsets #}
{% block field_sets %}
{% include "admin/snippets/mixed_inlines_fieldsets.html" %}
Expand Down
Loading
Loading