Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Improve performance of the latest_of_each method of the manager #1360

Merged
merged 5 commits into from
Jul 22, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions AUTHORS.rst
Original file line number Diff line number Diff line change
Expand Up @@ -19,6 +19,7 @@ Authors
- Anton Kulikov (`bigtimecriminal <https://github.com/bigtimecriminal>`_)
- Ben Lawson (`blawson <https://github.com/blawson>`_)
- Benjamin Mampaey (`bmampaey <https://github.com/bmampaey>`_)
- Berke Agababaoglu (`bagababaoglu <https://github.com/bagababaoglu>`_)
- Bheesham Persaud (`bheesham <https://github.com/bheesham>`_)
- `bradford281 <https://github.com/bradford281>`_
- Brian Armstrong (`barm <https://github.com/barm>`_)
Expand Down
1 change: 1 addition & 0 deletions CHANGES.rst
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,7 @@ Unreleased

- Made ``skip_history_when_saving`` work when creating an object - not just when
updating an object (gh-1262)
- Improved performance of the ``latest_of_each()`` history manager method (gh-1360)

3.7.0 (2024-05-29)
------------------
Expand Down
65 changes: 23 additions & 42 deletions simple_history/manager.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
from django.conf import settings
from django.db import connection, models
from django.db.models import OuterRef, QuerySet, Subquery
from django.db import models
from django.db.models import Exists, OuterRef, Q, QuerySet
from django.utils import timezone

from simple_history.utils import (
Expand Down Expand Up @@ -29,13 +29,11 @@ def __init__(self, *args, **kwargs):
self._as_of = None
self._pk_attr = self.model.instance_type._meta.pk.attname

def as_instances(self):
def as_instances(self) -> "HistoricalQuerySet":
"""
Return a queryset that generates instances instead of historical records.
Queries against the resulting queryset will translate `pk` into the
primary key field of the original type.

Returns a queryset.
"""
if not self._as_instances:
result = self.exclude(history_type="-")
Expand All @@ -44,7 +42,7 @@ def as_instances(self):
result = self._clone()
return result

def filter(self, *args, **kwargs):
def filter(self, *args, **kwargs) -> "HistoricalQuerySet":
"""
If a `pk` filter arrives and the queryset is returning instances
then the caller actually wants to filter based on the original
Expand All @@ -55,43 +53,26 @@ def filter(self, *args, **kwargs):
kwargs[self._pk_attr] = kwargs.pop("pk")
return super().filter(*args, **kwargs)

def latest_of_each(self):
def latest_of_each(self) -> "HistoricalQuerySet":
"""
Ensures results in the queryset are the latest historical record for each
primary key. Deletions are not removed.

Returns a queryset.
primary key. This includes deletion records.
"""
# If using MySQL, need to get a list of IDs in memory and then use them for the
# second query.
# Does mean two loops through the DB to get the full set, but still a speed
# improvement.
backend = connection.vendor
if backend == "mysql":
history_ids = {}
for item in self.order_by("-history_date", "-pk"):
if getattr(item, self._pk_attr) not in history_ids:
history_ids[getattr(item, self._pk_attr)] = item.pk
latest_historics = self.filter(history_id__in=history_ids.values())
elif backend == "postgresql":
latest_pk_attr_historic_ids = (
self.order_by(self._pk_attr, "-history_date", "-pk")
.distinct(self._pk_attr)
.values_list("pk", flat=True)
)
latest_historics = self.filter(history_id__in=latest_pk_attr_historic_ids)
else:
latest_pk_attr_historic_ids = (
self.filter(**{self._pk_attr: OuterRef(self._pk_attr)})
.order_by("-history_date", "-pk")
.values("pk")[:1]
)
latest_historics = self.filter(
history_id__in=Subquery(latest_pk_attr_historic_ids)
)
return latest_historics
# Subquery for finding the records that belong to the same history-tracked
# object as the record from the outer query (identified by `_pk_attr`),
# and that have a later `history_date` than the outer record.
# The very latest record of a history-tracked object should be excluded from
# this query - which will make it included in the `~Exists` query below.
later_records = self.filter(
Q(**{self._pk_attr: OuterRef(self._pk_attr)}),
Q(history_date__gt=OuterRef("history_date")),
)

# Filter the records to only include those for which the `later_records`
# subquery does not return any results.
return self.filter(~Exists(later_records))

def _select_related_history_tracked_objs(self):
def _select_related_history_tracked_objs(self) -> "HistoricalQuerySet":
"""
A convenience method that calls ``select_related()`` with all the names of
the model's history-tracked ``ForeignKey`` fields.
Expand All @@ -103,18 +84,18 @@ def _select_related_history_tracked_objs(self):
]
return self.select_related(*field_names)

def _clone(self):
def _clone(self) -> "HistoricalQuerySet":
c = super()._clone()
c._as_instances = self._as_instances
c._as_of = self._as_of
c._pk_attr = self._pk_attr
return c

def _fetch_all(self):
def _fetch_all(self) -> None:
super()._fetch_all()
self._instanceize()

def _instanceize(self):
def _instanceize(self) -> None:
"""
Convert the result cache to instances if possible and it has not already been
done. If a query extracts `.values(...)` then the result cache will not contain
Expand Down
122 changes: 96 additions & 26 deletions simple_history/tests/tests/test_manager.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,11 +8,66 @@
from simple_history.manager import SIMPLE_HISTORY_REVERSE_ATTR_NAME

from ..models import Choice, Document, Poll, RankedDocument
from .utils import HistoricalTestCase

User = get_user_model()


class AsOfTest(TestCase):
class LatestOfEachTestCase(HistoricalTestCase):
def test_filtered_instances_are_as_expected(self):
document1 = RankedDocument.objects.create(rank=10)
document2 = RankedDocument.objects.create(rank=20)
document2.rank = 21
document2.save()
document3 = RankedDocument.objects.create(rank=30)
document3.rank = 31
document3.save()
document3.delete()
document4 = RankedDocument.objects.create(rank=40)
document4_pk = document4.pk
document4.delete()
reincarnated_document4 = RankedDocument.objects.create(pk=document4_pk, rank=42)

record4, record3, record2, record1 = RankedDocument.history.latest_of_each()
self.assertRecordValues(
record1,
RankedDocument,
{
"rank": 10,
"id": document1.id,
"history_type": "+",
},
)
self.assertRecordValues(
record2,
RankedDocument,
{
"rank": 21,
"id": document2.id,
"history_type": "~",
},
)
self.assertRecordValues(
record3,
RankedDocument,
{
"rank": 31,
"id": document3.id,
"history_type": "-",
},
)
self.assertRecordValues(
record4,
RankedDocument,
{
"rank": 42,
"id": reincarnated_document4.id,
"history_type": "+",
},
)


class AsOfTestCase(TestCase):
model = Document

def setUp(self):
Expand Down Expand Up @@ -69,7 +124,7 @@ def test_modified(self):
self.assertEqual(as_of_list[0].changed_by, self.obj.changed_by)


class AsOfAdditionalTestCase(TestCase):
class AsOfTestCaseWithoutSetUp(TestCase):
def test_create_and_delete(self):
document = Document.objects.create()
now = datetime.now()
Expand Down Expand Up @@ -150,42 +205,57 @@ def test_historical_query_set(self):
"""
Demonstrates how the HistoricalQuerySet works to provide as_of functionality.
"""
document1 = RankedDocument.objects.create(rank=42)
document2 = RankedDocument.objects.create(rank=84)
document2.rank = 51
document1 = RankedDocument.objects.create(rank=10)
document2 = RankedDocument.objects.create(rank=20)
document2.rank = 21
document2.save()
document1.delete()
t1 = datetime.now()
document3 = RankedDocument.objects.create(rank=30) # noqa: F841
document2.rank = 22
document2.save()
t2 = datetime.now()

# look for historical records, get back a queryset
with self.assertNumQueries(1):
queryset = RankedDocument.history.filter(history_date__lte=t2)
self.assertEqual(queryset.count(), 4)
# 4 records before `t1` (for document 1 and 2), 2 after (for document 2 and 3)
queryset = RankedDocument.history.filter(history_date__lte=t1)
self.assertEqual(queryset.count(), 4)
self.assertEqual(RankedDocument.history.filter(history_date__gt=t1).count(), 2)

# only want the most recend records (provided by HistoricalQuerySet)
self.assertEqual(queryset.latest_of_each().count(), 2)
# `latest_of_each()` returns the most recent record of each document
with self.assertNumQueries(1):
self.assertEqual(queryset.latest_of_each().count(), 2)

# want to see the instances as of that time?
self.assertEqual(queryset.latest_of_each().as_instances().count(), 1)
# `as_instances()` returns the historical instances as of each record's time,
# but excludes deletion records (i.e. document 1's most recent record)
with self.assertNumQueries(1):
self.assertEqual(queryset.latest_of_each().as_instances().count(), 1)

# these new methods are idempotent
self.assertEqual(
queryset.latest_of_each()
.latest_of_each()
.as_instances()
.as_instances()
.count(),
1,
)
# (Duplicate calls to these methods should not change the number of queries,
# since they're idempotent)
with self.assertNumQueries(1):
self.assertEqual(
queryset.latest_of_each()
.latest_of_each()
.as_instances()
.as_instances()
.count(),
1,
)

# that was all the same as calling as_of!
self.assertEqual(
self.assertSetEqual(
# In conclusion, all of these methods combined...
set(
RankedDocument.history.filter(history_date__lte=t2)
RankedDocument.history.filter(history_date__lte=t1)
.latest_of_each()
.as_instances()
),
set(RankedDocument.history.as_of(t2)),
# ...are equivalent to calling `as_of()`!
set(RankedDocument.history.as_of(t1)),
)

self.assertEqual(RankedDocument.history.as_of(t1).get().rank, 21)
self.assertListEqual(
[d.rank for d in RankedDocument.history.as_of(t2)], [22, 30]
)


Expand Down
21 changes: 7 additions & 14 deletions simple_history/tests/tests/test_models.py
Original file line number Diff line number Diff line change
Expand Up @@ -30,11 +30,6 @@
pre_create_historical_m2m_records,
pre_create_historical_record,
)
from simple_history.tests.tests.utils import (
database_router_override_settings,
database_router_override_settings_history_in_diff_db,
middleware_override_settings,
)
from simple_history.utils import get_history_model_for_model, update_change_reason

from ..external.models import (
Expand Down Expand Up @@ -126,6 +121,12 @@
UUIDModel,
WaterLevel,
)
from .utils import (
HistoricalTestCase,
database_router_override_settings,
database_router_override_settings_history_in_diff_db,
middleware_override_settings,
)

get_model = apps.get_model
User = get_user_model()
Expand All @@ -140,18 +141,10 @@ def get_fake_file(filename):
return fake_file


class HistoricalRecordsTest(TestCase):
class HistoricalRecordsTest(HistoricalTestCase):
def assertDatetimesEqual(self, time1, time2):
self.assertAlmostEqual(time1, time2, delta=timedelta(seconds=2))

def assertRecordValues(self, record, klass, values_dict):
for key, value in values_dict.items():
self.assertEqual(getattr(record, key), value)
self.assertEqual(record.history_object.__class__, klass)
for key, value in values_dict.items():
if key not in ["history_type", "history_change_reason"]:
self.assertEqual(getattr(record.history_object, key), value)

def test_create(self):
p = Poll(question="what's up?", pub_date=today)
p.save()
Expand Down
Loading
Loading