From d8cf0b8af4726550a6060d3186a6ed895db613b9 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Berke=20A=C4=9Fababao=C4=9Flu?= <35297035+bagababaoglu@users.noreply.github.com> Date: Mon, 22 Jul 2024 09:43:41 +0200 Subject: [PATCH] Improve performance of the latest_of_each method of the manager (#1360) * Improve performance of the latest_of_each method of the manager * Polished docs and code * Improved as_of() test case * Moved assertRecordValues() to new HistoricalTestCase This makes the method available for use in other test cases. * Added LatestOfEachTestCase --------- Co-authored-by: berke.agababaoglu Co-authored-by: Anders <6058745+ddabble@users.noreply.github.com> --- AUTHORS.rst | 1 + CHANGES.rst | 1 + simple_history/manager.py | 65 ++++------- simple_history/tests/tests/test_manager.py | 122 ++++++++++++++++----- simple_history/tests/tests/test_models.py | 21 ++-- simple_history/tests/tests/utils.py | 39 ++++++- 6 files changed, 164 insertions(+), 85 deletions(-) diff --git a/AUTHORS.rst b/AUTHORS.rst index 3db6b7425..4c38bbbf9 100644 --- a/AUTHORS.rst +++ b/AUTHORS.rst @@ -19,6 +19,7 @@ Authors - Anton Kulikov (`bigtimecriminal `_) - Ben Lawson (`blawson `_) - Benjamin Mampaey (`bmampaey `_) +- Berke Agababaoglu (`bagababaoglu `_) - Bheesham Persaud (`bheesham `_) - `bradford281 `_ - Brian Armstrong (`barm `_) diff --git a/CHANGES.rst b/CHANGES.rst index f834d83c1..9c44563c3 100644 --- a/CHANGES.rst +++ b/CHANGES.rst @@ -6,6 +6,7 @@ Unreleased - Made ``skip_history_when_saving`` work when creating an object - not just when updating an object (gh-1262) +- Improved performance of the ``latest_of_each()`` history manager method (gh-1360) 3.7.0 (2024-05-29) ------------------ diff --git a/simple_history/manager.py b/simple_history/manager.py index d8f0b53e8..2d9bda656 100644 --- a/simple_history/manager.py +++ b/simple_history/manager.py @@ -1,6 +1,6 @@ from django.conf import settings -from django.db import connection, models -from django.db.models import OuterRef, QuerySet, Subquery +from django.db import models +from django.db.models import Exists, OuterRef, Q, QuerySet from django.utils import timezone from simple_history.utils import ( @@ -29,13 +29,11 @@ def __init__(self, *args, **kwargs): self._as_of = None self._pk_attr = self.model.instance_type._meta.pk.attname - def as_instances(self): + def as_instances(self) -> "HistoricalQuerySet": """ Return a queryset that generates instances instead of historical records. Queries against the resulting queryset will translate `pk` into the primary key field of the original type. - - Returns a queryset. """ if not self._as_instances: result = self.exclude(history_type="-") @@ -44,7 +42,7 @@ def as_instances(self): result = self._clone() return result - def filter(self, *args, **kwargs): + def filter(self, *args, **kwargs) -> "HistoricalQuerySet": """ If a `pk` filter arrives and the queryset is returning instances then the caller actually wants to filter based on the original @@ -55,43 +53,26 @@ def filter(self, *args, **kwargs): kwargs[self._pk_attr] = kwargs.pop("pk") return super().filter(*args, **kwargs) - def latest_of_each(self): + def latest_of_each(self) -> "HistoricalQuerySet": """ Ensures results in the queryset are the latest historical record for each - primary key. Deletions are not removed. - - Returns a queryset. + primary key. This includes deletion records. """ - # If using MySQL, need to get a list of IDs in memory and then use them for the - # second query. - # Does mean two loops through the DB to get the full set, but still a speed - # improvement. - backend = connection.vendor - if backend == "mysql": - history_ids = {} - for item in self.order_by("-history_date", "-pk"): - if getattr(item, self._pk_attr) not in history_ids: - history_ids[getattr(item, self._pk_attr)] = item.pk - latest_historics = self.filter(history_id__in=history_ids.values()) - elif backend == "postgresql": - latest_pk_attr_historic_ids = ( - self.order_by(self._pk_attr, "-history_date", "-pk") - .distinct(self._pk_attr) - .values_list("pk", flat=True) - ) - latest_historics = self.filter(history_id__in=latest_pk_attr_historic_ids) - else: - latest_pk_attr_historic_ids = ( - self.filter(**{self._pk_attr: OuterRef(self._pk_attr)}) - .order_by("-history_date", "-pk") - .values("pk")[:1] - ) - latest_historics = self.filter( - history_id__in=Subquery(latest_pk_attr_historic_ids) - ) - return latest_historics + # Subquery for finding the records that belong to the same history-tracked + # object as the record from the outer query (identified by `_pk_attr`), + # and that have a later `history_date` than the outer record. + # The very latest record of a history-tracked object should be excluded from + # this query - which will make it included in the `~Exists` query below. + later_records = self.filter( + Q(**{self._pk_attr: OuterRef(self._pk_attr)}), + Q(history_date__gt=OuterRef("history_date")), + ) + + # Filter the records to only include those for which the `later_records` + # subquery does not return any results. + return self.filter(~Exists(later_records)) - def _select_related_history_tracked_objs(self): + def _select_related_history_tracked_objs(self) -> "HistoricalQuerySet": """ A convenience method that calls ``select_related()`` with all the names of the model's history-tracked ``ForeignKey`` fields. @@ -103,18 +84,18 @@ def _select_related_history_tracked_objs(self): ] return self.select_related(*field_names) - def _clone(self): + def _clone(self) -> "HistoricalQuerySet": c = super()._clone() c._as_instances = self._as_instances c._as_of = self._as_of c._pk_attr = self._pk_attr return c - def _fetch_all(self): + def _fetch_all(self) -> None: super()._fetch_all() self._instanceize() - def _instanceize(self): + def _instanceize(self) -> None: """ Convert the result cache to instances if possible and it has not already been done. If a query extracts `.values(...)` then the result cache will not contain diff --git a/simple_history/tests/tests/test_manager.py b/simple_history/tests/tests/test_manager.py index 75bf94a78..acb9e0256 100644 --- a/simple_history/tests/tests/test_manager.py +++ b/simple_history/tests/tests/test_manager.py @@ -8,11 +8,66 @@ from simple_history.manager import SIMPLE_HISTORY_REVERSE_ATTR_NAME from ..models import Choice, Document, Poll, RankedDocument +from .utils import HistoricalTestCase User = get_user_model() -class AsOfTest(TestCase): +class LatestOfEachTestCase(HistoricalTestCase): + def test_filtered_instances_are_as_expected(self): + document1 = RankedDocument.objects.create(rank=10) + document2 = RankedDocument.objects.create(rank=20) + document2.rank = 21 + document2.save() + document3 = RankedDocument.objects.create(rank=30) + document3.rank = 31 + document3.save() + document3.delete() + document4 = RankedDocument.objects.create(rank=40) + document4_pk = document4.pk + document4.delete() + reincarnated_document4 = RankedDocument.objects.create(pk=document4_pk, rank=42) + + record4, record3, record2, record1 = RankedDocument.history.latest_of_each() + self.assertRecordValues( + record1, + RankedDocument, + { + "rank": 10, + "id": document1.id, + "history_type": "+", + }, + ) + self.assertRecordValues( + record2, + RankedDocument, + { + "rank": 21, + "id": document2.id, + "history_type": "~", + }, + ) + self.assertRecordValues( + record3, + RankedDocument, + { + "rank": 31, + "id": document3.id, + "history_type": "-", + }, + ) + self.assertRecordValues( + record4, + RankedDocument, + { + "rank": 42, + "id": reincarnated_document4.id, + "history_type": "+", + }, + ) + + +class AsOfTestCase(TestCase): model = Document def setUp(self): @@ -69,7 +124,7 @@ def test_modified(self): self.assertEqual(as_of_list[0].changed_by, self.obj.changed_by) -class AsOfAdditionalTestCase(TestCase): +class AsOfTestCaseWithoutSetUp(TestCase): def test_create_and_delete(self): document = Document.objects.create() now = datetime.now() @@ -150,42 +205,57 @@ def test_historical_query_set(self): """ Demonstrates how the HistoricalQuerySet works to provide as_of functionality. """ - document1 = RankedDocument.objects.create(rank=42) - document2 = RankedDocument.objects.create(rank=84) - document2.rank = 51 + document1 = RankedDocument.objects.create(rank=10) + document2 = RankedDocument.objects.create(rank=20) + document2.rank = 21 document2.save() document1.delete() + t1 = datetime.now() + document3 = RankedDocument.objects.create(rank=30) # noqa: F841 + document2.rank = 22 + document2.save() t2 = datetime.now() - # look for historical records, get back a queryset - with self.assertNumQueries(1): - queryset = RankedDocument.history.filter(history_date__lte=t2) - self.assertEqual(queryset.count(), 4) + # 4 records before `t1` (for document 1 and 2), 2 after (for document 2 and 3) + queryset = RankedDocument.history.filter(history_date__lte=t1) + self.assertEqual(queryset.count(), 4) + self.assertEqual(RankedDocument.history.filter(history_date__gt=t1).count(), 2) - # only want the most recend records (provided by HistoricalQuerySet) - self.assertEqual(queryset.latest_of_each().count(), 2) + # `latest_of_each()` returns the most recent record of each document + with self.assertNumQueries(1): + self.assertEqual(queryset.latest_of_each().count(), 2) - # want to see the instances as of that time? - self.assertEqual(queryset.latest_of_each().as_instances().count(), 1) + # `as_instances()` returns the historical instances as of each record's time, + # but excludes deletion records (i.e. document 1's most recent record) + with self.assertNumQueries(1): + self.assertEqual(queryset.latest_of_each().as_instances().count(), 1) - # these new methods are idempotent - self.assertEqual( - queryset.latest_of_each() - .latest_of_each() - .as_instances() - .as_instances() - .count(), - 1, - ) + # (Duplicate calls to these methods should not change the number of queries, + # since they're idempotent) + with self.assertNumQueries(1): + self.assertEqual( + queryset.latest_of_each() + .latest_of_each() + .as_instances() + .as_instances() + .count(), + 1, + ) - # that was all the same as calling as_of! - self.assertEqual( + self.assertSetEqual( + # In conclusion, all of these methods combined... set( - RankedDocument.history.filter(history_date__lte=t2) + RankedDocument.history.filter(history_date__lte=t1) .latest_of_each() .as_instances() ), - set(RankedDocument.history.as_of(t2)), + # ...are equivalent to calling `as_of()`! + set(RankedDocument.history.as_of(t1)), + ) + + self.assertEqual(RankedDocument.history.as_of(t1).get().rank, 21) + self.assertListEqual( + [d.rank for d in RankedDocument.history.as_of(t2)], [22, 30] ) diff --git a/simple_history/tests/tests/test_models.py b/simple_history/tests/tests/test_models.py index 44384ca91..d70ca3ce9 100644 --- a/simple_history/tests/tests/test_models.py +++ b/simple_history/tests/tests/test_models.py @@ -30,11 +30,6 @@ pre_create_historical_m2m_records, pre_create_historical_record, ) -from simple_history.tests.tests.utils import ( - database_router_override_settings, - database_router_override_settings_history_in_diff_db, - middleware_override_settings, -) from simple_history.utils import get_history_model_for_model, update_change_reason from ..external.models import ( @@ -126,6 +121,12 @@ UUIDModel, WaterLevel, ) +from .utils import ( + HistoricalTestCase, + database_router_override_settings, + database_router_override_settings_history_in_diff_db, + middleware_override_settings, +) get_model = apps.get_model User = get_user_model() @@ -140,18 +141,10 @@ def get_fake_file(filename): return fake_file -class HistoricalRecordsTest(TestCase): +class HistoricalRecordsTest(HistoricalTestCase): def assertDatetimesEqual(self, time1, time2): self.assertAlmostEqual(time1, time2, delta=timedelta(seconds=2)) - def assertRecordValues(self, record, klass, values_dict): - for key, value in values_dict.items(): - self.assertEqual(getattr(record, key), value) - self.assertEqual(record.history_object.__class__, klass) - for key, value in values_dict.items(): - if key not in ["history_type", "history_change_reason"]: - self.assertEqual(getattr(record.history_object, key), value) - def test_create(self): p = Poll(question="what's up?", pub_date=today) p.save() diff --git a/simple_history/tests/tests/utils.py b/simple_history/tests/tests/utils.py index 88d6d92f2..ae6fe949c 100644 --- a/simple_history/tests/tests/utils.py +++ b/simple_history/tests/tests/utils.py @@ -1,9 +1,9 @@ from enum import Enum +from typing import Type -import django from django.conf import settings - -from simple_history.tests.models import HistoricalModelWithHistoryInDifferentDb +from django.db.models import Model +from django.test import TestCase request_middleware = "simple_history.middleware.HistoryRequestMiddleware" @@ -14,6 +14,27 @@ } +class HistoricalTestCase(TestCase): + def assertRecordValues(self, record, klass: Type[Model], values_dict: dict): + """ + Fail if ``record`` doesn't contain the field values in ``values_dict``. + ``record.history_object`` is also checked. + History-tracked fields in ``record`` that are not in ``values_dict``, are not + checked. + + :param record: A historical record. + :param klass: The type of the history-tracked class of ``record``. + :param values_dict: Field names of ``record`` mapped to their expected values. + """ + for key, value in values_dict.items(): + self.assertEqual(getattr(record, key), value) + + self.assertEqual(record.history_object.__class__, klass) + for key, value in values_dict.items(): + if key not in ("history_type", "history_change_reason"): + self.assertEqual(getattr(record.history_object, key), value) + + class TestDbRouter: def db_for_read(self, model, **hints): if model._meta.app_label == "external": @@ -46,16 +67,25 @@ def allow_migrate(self, db, app_label, model_name=None, **hints): class TestModelWithHistoryInDifferentDbRouter: def db_for_read(self, model, **hints): + # Avoids circular importing + from ..models import HistoricalModelWithHistoryInDifferentDb + if model == HistoricalModelWithHistoryInDifferentDb: return OTHER_DB_NAME return None def db_for_write(self, model, **hints): + # Avoids circular importing + from ..models import HistoricalModelWithHistoryInDifferentDb + if model == HistoricalModelWithHistoryInDifferentDb: return OTHER_DB_NAME return None def allow_relation(self, obj1, obj2, **hints): + # Avoids circular importing + from ..models import HistoricalModelWithHistoryInDifferentDb + if isinstance(obj1, HistoricalModelWithHistoryInDifferentDb) or isinstance( obj2, HistoricalModelWithHistoryInDifferentDb ): @@ -63,6 +93,9 @@ def allow_relation(self, obj1, obj2, **hints): return None def allow_migrate(self, db, app_label, model_name=None, **hints): + # Avoids circular importing + from ..models import HistoricalModelWithHistoryInDifferentDb + if model_name == HistoricalModelWithHistoryInDifferentDb._meta.model_name: return db == OTHER_DB_NAME return None