From de3a7ef71b8d3053d1a310f753a6e8b774f8f7f2 Mon Sep 17 00:00:00 2001 From: Rebecca Sutton Koeser Date: Thu, 1 Feb 2024 16:25:57 -0500 Subject: [PATCH] Add option to expedite page indexing based on page count mismatches (#569) * Add option to expedite page indexing based on page count mismatches ref #565 * Adjust verbosity for work/page mismatch output; add minimal test --- .../management/commands/index_pages.py | 71 +++++++++++++++++-- ppa/archive/tests/test_commands.py | 42 ++++++++--- 2 files changed, 98 insertions(+), 15 deletions(-) diff --git a/ppa/archive/management/commands/index_pages.py b/ppa/archive/management/commands/index_pages.py index 366bc0f4e..3f22c9c04 100644 --- a/ppa/archive/management/commands/index_pages.py +++ b/ppa/archive/management/commands/index_pages.py @@ -11,6 +11,7 @@ from parasolr.django import SolrClient, SolrQuerySet from ppa.archive.models import DigitizedWork, Page +from ppa.archive.solr import PageSearchQuerySet def page_index_data(work_q, page_data_q): @@ -37,9 +38,7 @@ def process_index_queue(index_data_q, total_to_index, work_q): solr = SolrClient() progbar = progressbar.ProgressBar( - redirect_stdout=True, - max_value=total_to_index, - max_error=False + redirect_stdout=True, max_value=total_to_index, max_error=False ) count = 0 while True: @@ -80,6 +79,12 @@ def add_arguments(self, parser): parser.add_argument( "source_ids", nargs="*", help="List of specific items to index (optional)" ) + parser.add_argument( + "--expedite", + help="Only index works with page count mismatch between Solr and database", + action="store_true", + default=False, + ) def handle(self, *args, **kwargs): self.verbosity = kwargs.get("verbosity", self.v_normal) @@ -91,7 +96,7 @@ def handle(self, *args, **kwargs): page_data_q = Queue() # populate the work queue with digitized works that have # page content to be indexed - source_ids = kwargs.get('source_ids', []) + source_ids = kwargs.get("source_ids", []) if not source_ids: digiworks = DigitizedWork.items_to_index() @@ -100,6 +105,56 @@ def handle(self, *args, **kwargs): digiworks = DigitizedWork.objects.filter(source_id__in=source_ids) num_pages = sum([dw.page_count for dw in digiworks]) + # if reindexing everything, check db totals against solr + if not source_ids and self.verbosity >= self.v_normal: + # check totals + solr_count = self.get_solr_totals() + + work_diff = digiworks.count() - solr_count.get("work", 0) + page_diff = num_pages - solr_count.get("page", 0) + + if self.verbosity >= self.v_normal: + if work_diff: + self.stdout.write(f"{work_diff:,} works not indexed in Solr") + if page_diff: + self.stdout.write(f"{page_diff:,} pages not indexed in Solr") + + if kwargs.get("expedite"): + # find works with missing pages + facets = ( + PageSearchQuerySet() + .filter(item_type="page") + .facet("group_id", limit=-1) + .get_facets() + ) + mismatches = [] + pages_per_work = facets.facet_fields["group_id"] + for digwork in DigitizedWork.items_to_index(): + solr_page_count = pages_per_work.get(digwork.index_id(), 0) + if digwork.page_count != solr_page_count: + # add to list of works to index + mismatches.append(digwork) + + # in verbose mode, report details + if self.verbosity > self.v_normal: + diff_msg = "" + if digwork.page_count > solr_page_count: + diff_msg = f"missing {digwork.page_count - solr_page_count}" + else: + diff_msg = f"extra {solr_page_count - digwork.page_count}" + + self.stdout.write( + f"{digwork} : {diff_msg} " + + f"(db: {digwork.page_count}, solr: {solr_page_count})" + ) + + if self.verbosity >= self.v_normal: + self.stdout.write( + f"Indexing pages for {len(mismatches)} works with page count mismatches" + ) + # only index works with page count mismatches + digiworks = mismatches + for digwork in digiworks: work_q.put(digwork) @@ -121,12 +176,16 @@ def handle(self, *args, **kwargs): # print a summary of solr totals by item type if self.verbosity >= self.v_normal: - facets = SolrQuerySet().all().facet("item_type").get_facets() item_totals = [] - for item_type, total in facets.facet_fields.item_type.items(): + for item_type, total in self.get_solr_totals().items(): item_totals.append( "%d %s%s" % (total, item_type, "" if total == 1 else "s") ) self.stdout.write( "\nItems in Solr by item type: %s" % (", ".join(item_totals)) ) + + def get_solr_totals(self): + facets = SolrQuerySet().all().facet("item_type").get_facets() + # facet returns an ordered dict + return facets.facet_fields.item_type diff --git a/ppa/archive/tests/test_commands.py b/ppa/archive/tests/test_commands.py index ce4848221..ccfb886ad 100644 --- a/ppa/archive/tests/test_commands.py +++ b/ppa/archive/tests/test_commands.py @@ -170,7 +170,6 @@ def test_import_digitizedwork(self): @patch("ppa.archive.management.commands.hathi_import.HathiBibliographicAPI") @patch("ppa.archive.management.commands.hathi_import.progressbar") def test_call_command(self, mockprogbar, mockhathi_bibapi): - digwork = DigitizedWork(source_id="test.123") # patch methods with actual logic to check handle method behavior @@ -183,7 +182,6 @@ def test_call_command(self, mockprogbar, mockhathi_bibapi): ) as mock_import_digwork, patch.object( digwork, "count_pages" ) as mock_count_pages: - mock_htids = ["ab.1234", "cd.5678"] mock_get_htids.return_value = mock_htids mock_import_digwork.return_value = digwork @@ -353,7 +351,9 @@ def test_process_index_queue(mock_solrclient, mock_progbar): mock_solrclient.return_value.update.index.assert_any_call(mockdata1) mock_solrclient.return_value.update.index.assert_any_call(mockdata2) - mock_progbar.ProgressBar.assert_called_with(redirect_stdout=True, max_value=total, max_error=False) + mock_progbar.ProgressBar.assert_called_with( + redirect_stdout=True, max_value=total, max_error=False + ) progbar = mock_progbar.ProgressBar.return_value progbar.update.assert_any_call(4) progbar.update.assert_any_call(7) @@ -400,12 +400,36 @@ def test_index_pages_quiet(self, mock_process, mock_progbar, mock_sleep): def test_index_pages_specific_ids(self, mock_process, mock_progbar, mock_sleep): stdout = StringIO() - source_ids=['chi.78013704', 'chi.13880510'] + source_ids = ["chi.78013704", "chi.13880510"] call_command("index_pages", *source_ids, stdout=stdout, verbosity=0) - t1=DigitizedWork.objects.get(source_id=source_ids[0]) - t2=DigitizedWork.objects.get(source_id=source_ids[1]) + t1 = DigitizedWork.objects.get(source_id=source_ids[0]) + t2 = DigitizedWork.objects.get(source_id=source_ids[1]) page_count = t1.page_count + t2.page_count big_page_count = Page.total_to_index() - assert mock_process.call_args.kwargs.get('args')[1] == page_count # behavior specifying source ids - assert mock_process.call_args.kwargs.get('args')[1] != big_page_count # normal behavior without specifying source ids - \ No newline at end of file + assert ( + mock_process.call_args.kwargs.get("args")[1] == page_count + ) # behavior specifying source ids + assert ( + mock_process.call_args.kwargs.get("args")[1] != big_page_count + ) # normal behavior without specifying source ids + + def test_index_pages_expedite(self, mock_process, mock_progbar, mock_sleep): + # test calling from command line + stdout = StringIO() + call_command("index_pages", stdout=stdout, expedite=True) + output = stdout.getvalue() + # should report on work/page indexing & mismatch + expected_strings = [ + "works not indexed in Solr", + "pages not indexed in Solr", + "page count mismatches", + ] + for msg in expected_strings: + assert msg in output + + # should suppress reporting if verbosity is 0 + stdout = StringIO() + call_command("index_pages", stdout=stdout, expedite=True, verbosity=0) + output = stdout.getvalue() + for msg in expected_strings: + assert msg not in output