Add option to expedite page indexing based on page count mismatches (#569)

* Add option to expedite page indexing based on page count mismatches

ref #565

* Adjust verbosity for work/page mismatch output; add minimal test
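
Usage note: assuming the project's standard Django manage.py entry point, the expedited reindex can be run as `python manage.py index_pages --expedite`, or programmatically via Django's call_command, as the new test below does:

    from django.core.management import call_command

    # reindex only works whose Solr page count disagrees with the database
    call_command("index_pages", expedite=True)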
rlskoeser authored Feb 1, 2024
1 parent 2419d97 commit de3a7ef
Showing 2 changed files with 98 additions and 15 deletions.
71 changes: 65 additions & 6 deletions ppa/archive/management/commands/index_pages.py
@@ -11,6 +11,7 @@
 from parasolr.django import SolrClient, SolrQuerySet
 
 from ppa.archive.models import DigitizedWork, Page
+from ppa.archive.solr import PageSearchQuerySet
 
 
 def page_index_data(work_q, page_data_q):
@@ -37,9 +38,7 @@ def process_index_queue(index_data_q, total_to_index, work_q):
 
     solr = SolrClient()
     progbar = progressbar.ProgressBar(
-        redirect_stdout=True,
-        max_value=total_to_index,
-        max_error=False
+        redirect_stdout=True, max_value=total_to_index, max_error=False
     )
     count = 0
     while True:
@@ -80,6 +79,12 @@ def add_arguments(self, parser):
         parser.add_argument(
             "source_ids", nargs="*", help="List of specific items to index (optional)"
         )
+        parser.add_argument(
+            "--expedite",
+            help="Only index works with page count mismatch between Solr and database",
+            action="store_true",
+            default=False,
+        )
 
     def handle(self, *args, **kwargs):
         self.verbosity = kwargs.get("verbosity", self.v_normal)
@@ -91,7 +96,7 @@ def handle(self, *args, **kwargs):
         page_data_q = Queue()
         # populate the work queue with digitized works that have
         # page content to be indexed
-        source_ids = kwargs.get('source_ids', [])
+        source_ids = kwargs.get("source_ids", [])
 
         if not source_ids:
             digiworks = DigitizedWork.items_to_index()
@@ -100,6 +105,56 @@
             digiworks = DigitizedWork.objects.filter(source_id__in=source_ids)
         num_pages = sum([dw.page_count for dw in digiworks])
 
+        # if reindexing everything, check db totals against solr
+        if not source_ids and self.verbosity >= self.v_normal:
+            # check totals
+            solr_count = self.get_solr_totals()
+
+            work_diff = digiworks.count() - solr_count.get("work", 0)
+            page_diff = num_pages - solr_count.get("page", 0)
+
+            if self.verbosity >= self.v_normal:
+                if work_diff:
+                    self.stdout.write(f"{work_diff:,} works not indexed in Solr")
+                if page_diff:
+                    self.stdout.write(f"{page_diff:,} pages not indexed in Solr")
+
+        if kwargs.get("expedite"):
+            # find works with missing pages
+            facets = (
+                PageSearchQuerySet()
+                .filter(item_type="page")
+                .facet("group_id", limit=-1)
+                .get_facets()
+            )
+            mismatches = []
+            pages_per_work = facets.facet_fields["group_id"]
+            for digwork in DigitizedWork.items_to_index():
+                solr_page_count = pages_per_work.get(digwork.index_id(), 0)
+                if digwork.page_count != solr_page_count:
+                    # add to list of works to index
+                    mismatches.append(digwork)
+
+                    # in verbose mode, report details
+                    if self.verbosity > self.v_normal:
+                        diff_msg = ""
+                        if digwork.page_count > solr_page_count:
+                            diff_msg = f"missing {digwork.page_count - solr_page_count}"
+                        else:
+                            diff_msg = f"extra {solr_page_count - digwork.page_count}"
+
+                        self.stdout.write(
+                            f"{digwork} : {diff_msg} "
+                            + f"(db: {digwork.page_count}, solr: {solr_page_count})"
+                        )
+
+            if self.verbosity >= self.v_normal:
+                self.stdout.write(
+                    f"Indexing pages for {len(mismatches)} works with page count mismatches"
+                )
+            # only index works with page count mismatches
+            digiworks = mismatches
+
         for digwork in digiworks:
             work_q.put(digwork)
 
@@ -121,12 +176,16 @@
 
         # print a summary of solr totals by item type
         if self.verbosity >= self.v_normal:
-            facets = SolrQuerySet().all().facet("item_type").get_facets()
             item_totals = []
-            for item_type, total in facets.facet_fields.item_type.items():
+            for item_type, total in self.get_solr_totals().items():
                 item_totals.append(
                     "%d %s%s" % (total, item_type, "" if total == 1 else "s")
                 )
             self.stdout.write(
                 "\nItems in Solr by item type: %s" % (", ".join(item_totals))
             )
+
+    def get_solr_totals(self):
+        facets = SolrQuerySet().all().facet("item_type").get_facets()
+        # facet returns an ordered dict
+        return facets.facet_fields.item_type
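
A minimal sketch of the expedite check introduced above, with made-up work IDs and page counts standing in for the Solr group_id facet and the database values (all identifiers and numbers here are hypothetical):

    # pages indexed in Solr per work, as returned by the group_id facet
    pages_per_work = {"njp.32101013082597": 350, "mdp.39015010540500": 0}
    # page counts recorded in the database for the same works
    db_page_counts = {"njp.32101013082597": 362, "mdp.39015010540500": 414}

    # keep only works whose Solr page count disagrees with the database;
    # a work absent from the facet counts as zero indexed pages
    mismatches = [
        work_id
        for work_id, db_count in db_page_counts.items()
        if pages_per_work.get(work_id, 0) != db_count
    ]
    # here both works mismatch: one is missing 12 pages, the other all 414

The real command makes the same comparison between DigitizedWork.page_count and the facet count keyed by each work's index_id(), then queues only the mismatched works for reindexing.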
42 changes: 33 additions & 9 deletions ppa/archive/tests/test_commands.py
@@ -170,7 +170,6 @@ def test_import_digitizedwork(self):
     @patch("ppa.archive.management.commands.hathi_import.HathiBibliographicAPI")
     @patch("ppa.archive.management.commands.hathi_import.progressbar")
     def test_call_command(self, mockprogbar, mockhathi_bibapi):
-
         digwork = DigitizedWork(source_id="test.123")
 
         # patch methods with actual logic to check handle method behavior
@@ -183,7 +182,6 @@ def test_call_command(self, mockprogbar, mockhathi_bibapi):
         ) as mock_import_digwork, patch.object(
             digwork, "count_pages"
         ) as mock_count_pages:
-
             mock_htids = ["ab.1234", "cd.5678"]
             mock_get_htids.return_value = mock_htids
             mock_import_digwork.return_value = digwork
@@ -353,7 +351,9 @@ def test_process_index_queue(mock_solrclient, mock_progbar):
     mock_solrclient.return_value.update.index.assert_any_call(mockdata1)
     mock_solrclient.return_value.update.index.assert_any_call(mockdata2)
 
-    mock_progbar.ProgressBar.assert_called_with(redirect_stdout=True, max_value=total, max_error=False)
+    mock_progbar.ProgressBar.assert_called_with(
+        redirect_stdout=True, max_value=total, max_error=False
+    )
     progbar = mock_progbar.ProgressBar.return_value
     progbar.update.assert_any_call(4)
     progbar.update.assert_any_call(7)
@@ -400,12 +400,36 @@ def test_index_pages_quiet(self, mock_process, mock_progbar, mock_sleep):
 
     def test_index_pages_specific_ids(self, mock_process, mock_progbar, mock_sleep):
         stdout = StringIO()
-        source_ids=['chi.78013704', 'chi.13880510']
+        source_ids = ["chi.78013704", "chi.13880510"]
         call_command("index_pages", *source_ids, stdout=stdout, verbosity=0)
-        t1=DigitizedWork.objects.get(source_id=source_ids[0])
-        t2=DigitizedWork.objects.get(source_id=source_ids[1])
+        t1 = DigitizedWork.objects.get(source_id=source_ids[0])
+        t2 = DigitizedWork.objects.get(source_id=source_ids[1])
         page_count = t1.page_count + t2.page_count
         big_page_count = Page.total_to_index()
-        assert mock_process.call_args.kwargs.get('args')[1] == page_count # behavior specifying source ids
-        assert mock_process.call_args.kwargs.get('args')[1] != big_page_count # normal behavior without specifying source ids
+
+        assert (
+            mock_process.call_args.kwargs.get("args")[1] == page_count
+        )  # behavior specifying source ids
+        assert (
+            mock_process.call_args.kwargs.get("args")[1] != big_page_count
+        )  # normal behavior without specifying source ids
+
+    def test_index_pages_expedite(self, mock_process, mock_progbar, mock_sleep):
+        # test calling from command line
+        stdout = StringIO()
+        call_command("index_pages", stdout=stdout, expedite=True)
+        output = stdout.getvalue()
+        # should report on work/page indexing & mismatch
+        expected_strings = [
+            "works not indexed in Solr",
+            "pages not indexed in Solr",
+            "page count mismatches",
+        ]
+        for msg in expected_strings:
+            assert msg in output
+
+        # should suppress reporting if verbosity is 0
+        stdout = StringIO()
+        call_command("index_pages", stdout=stdout, expedite=True, verbosity=0)
+        output = stdout.getvalue()
+        for msg in expected_strings:
+            assert msg not in output
