Add option to expedite page indexing based on page count mismatches (#569)

* Add option to expedite page indexing based on page count mismatches

ref #565

* Adjust verbosity for work/page mismatch output; add minimal test
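
Usage note: assuming the project's standard Django manage.py entry point, the expedited reindex can be run as `python manage.py index_pages --expedite`, or programmatically via Django's call_command, as the new test below does:

    from django.core.management import call_command

    # reindex only works whose Solr page count disagrees with the database
    call_command("index_pages", expedite=True)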
rlskoeser authored Feb 1, 2024
1 parent 2419d97 commit de3a7ef
Showing 2 changed files with 98 additions and 15 deletions.
71 changes: 65 additions & 6 deletions ppa/archive/management/commands/index_pages.py
@@ -11,6 +11,7 @@
 from parasolr.django import SolrClient, SolrQuerySet
 
 from ppa.archive.models import DigitizedWork, Page
+from ppa.archive.solr import PageSearchQuerySet
 
 
 def page_index_data(work_q, page_data_q):
@@ -37,9 +38,7 @@ def process_index_queue(index_data_q, total_to_index, work_q):
 
     solr = SolrClient()
     progbar = progressbar.ProgressBar(
-        redirect_stdout=True,
-        max_value=total_to_index,
-        max_error=False
+        redirect_stdout=True, max_value=total_to_index, max_error=False
     )
     count = 0
     while True:
@@ -80,6 +79,12 @@ def add_arguments(self, parser):
         parser.add_argument(
             "source_ids", nargs="*", help="List of specific items to index (optional)"
         )
+        parser.add_argument(
+            "--expedite",
+            help="Only index works with page count mismatch between Solr and database",
+            action="store_true",
+            default=False,
+        )
 
     def handle(self, *args, **kwargs):
         self.verbosity = kwargs.get("verbosity", self.v_normal)
@@ -91,7 +96,7 @@ def handle(self, *args, **kwargs):
         page_data_q = Queue()
         # populate the work queue with digitized works that have
         # page content to be indexed
-        source_ids = kwargs.get('source_ids', [])
+        source_ids = kwargs.get("source_ids", [])
 
         if not source_ids:
             digiworks = DigitizedWork.items_to_index()
@@ -100,6 +105,56 @@
             digiworks = DigitizedWork.objects.filter(source_id__in=source_ids)
         num_pages = sum([dw.page_count for dw in digiworks])
 
+        # if reindexing everything, check db totals against solr
+        if not source_ids and self.verbosity >= self.v_normal:
+            # check totals
+            solr_count = self.get_solr_totals()
+
+            work_diff = digiworks.count() - solr_count.get("work", 0)
+            page_diff = num_pages - solr_count.get("page", 0)
+
+            if self.verbosity >= self.v_normal:
+                if work_diff:
+                    self.stdout.write(f"{work_diff:,} works not indexed in Solr")
+                if page_diff:
+                    self.stdout.write(f"{page_diff:,} pages not indexed in Solr")
+
+        if kwargs.get("expedite"):
+            # find works with missing pages
+            facets = (
+                PageSearchQuerySet()
+                .filter(item_type="page")
+                .facet("group_id", limit=-1)
+                .get_facets()
+            )
+            mismatches = []
+            pages_per_work = facets.facet_fields["group_id"]
+            for digwork in DigitizedWork.items_to_index():
+                solr_page_count = pages_per_work.get(digwork.index_id(), 0)
+                if digwork.page_count != solr_page_count:
+                    # add to list of works to index
+                    mismatches.append(digwork)
+
+                    # in verbose mode, report details
+                    if self.verbosity > self.v_normal:
+                        diff_msg = ""
+                        if digwork.page_count > solr_page_count:
+                            diff_msg = f"missing {digwork.page_count - solr_page_count}"
+                        else:
+                            diff_msg = f"extra {solr_page_count - digwork.page_count}"
+
+                        self.stdout.write(
+                            f"{digwork} : {diff_msg} "
+                            + f"(db: {digwork.page_count}, solr: {solr_page_count})"
+                        )
+
+            if self.verbosity >= self.v_normal:
+                self.stdout.write(
+                    f"Indexing pages for {len(mismatches)} works with page count mismatches"
+                )
+            # only index works with page count mismatches
+            digiworks = mismatches
+
         for digwork in digiworks:
             work_q.put(digwork)
 
@@ -121,12 +176,16 @@
 
         # print a summary of solr totals by item type
         if self.verbosity >= self.v_normal:
-            facets = SolrQuerySet().all().facet("item_type").get_facets()
             item_totals = []
-            for item_type, total in facets.facet_fields.item_type.items():
+            for item_type, total in self.get_solr_totals().items():
                 item_totals.append(
                     "%d %s%s" % (total, item_type, "" if total == 1 else "s")
                 )
             self.stdout.write(
                 "\nItems in Solr by item type: %s" % (", ".join(item_totals))
             )
+
+    def get_solr_totals(self):
+        facets = SolrQuerySet().all().facet("item_type").get_facets()
+        # facet returns an ordered dict
+        return facets.facet_fields.item_type
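
A minimal sketch of the expedite check introduced above, with made-up work IDs and page counts standing in for the Solr group_id facet and the database values (all identifiers and numbers here are hypothetical):

    # pages indexed in Solr per work, as returned by the group_id facet
    pages_per_work = {"njp.32101013082597": 350, "mdp.39015010540500": 0}
    # page counts recorded in the database for the same works
    db_page_counts = {"njp.32101013082597": 362, "mdp.39015010540500": 414}

    # keep only works whose Solr page count disagrees with the database;
    # a work absent from the facet counts as zero indexed pages
    mismatches = [
        work_id
        for work_id, db_count in db_page_counts.items()
        if pages_per_work.get(work_id, 0) != db_count
    ]
    # here both works mismatch: one is missing 12 pages, the other all 414

The real command makes the same comparison between DigitizedWork.page_count and the facet count keyed by each work's index_id(), then queues only the mismatched works for reindexing.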
42 changes: 33 additions & 9 deletions ppa/archive/tests/test_commands.py
@@ -170,7 +170,6 @@ def test_import_digitizedwork(self):
     @patch("ppa.archive.management.commands.hathi_import.HathiBibliographicAPI")
     @patch("ppa.archive.management.commands.hathi_import.progressbar")
     def test_call_command(self, mockprogbar, mockhathi_bibapi):
-
         digwork = DigitizedWork(source_id="test.123")
 
         # patch methods with actual logic to check handle method behavior
@@ -183,7 +182,6 @@ def test_call_command(self, mockprogbar, mockhathi_bibapi):
         ) as mock_import_digwork, patch.object(
             digwork, "count_pages"
         ) as mock_count_pages:
-
             mock_htids = ["ab.1234", "cd.5678"]
             mock_get_htids.return_value = mock_htids
             mock_import_digwork.return_value = digwork
@@ -353,7 +351,9 @@ def test_process_index_queue(mock_solrclient, mock_progbar):
     mock_solrclient.return_value.update.index.assert_any_call(mockdata1)
     mock_solrclient.return_value.update.index.assert_any_call(mockdata2)
 
-    mock_progbar.ProgressBar.assert_called_with(redirect_stdout=True, max_value=total, max_error=False)
+    mock_progbar.ProgressBar.assert_called_with(
+        redirect_stdout=True, max_value=total, max_error=False
+    )
     progbar = mock_progbar.ProgressBar.return_value
     progbar.update.assert_any_call(4)
     progbar.update.assert_any_call(7)
@@ -400,12 +400,36 @@ def test_index_pages_quiet(self, mock_process, mock_progbar, mock_sleep):
 
     def test_index_pages_specific_ids(self, mock_process, mock_progbar, mock_sleep):
         stdout = StringIO()
-        source_ids=['chi.78013704', 'chi.13880510']
+        source_ids = ["chi.78013704", "chi.13880510"]
         call_command("index_pages", *source_ids, stdout=stdout, verbosity=0)
-        t1=DigitizedWork.objects.get(source_id=source_ids[0])
-        t2=DigitizedWork.objects.get(source_id=source_ids[1])
+        t1 = DigitizedWork.objects.get(source_id=source_ids[0])
+        t2 = DigitizedWork.objects.get(source_id=source_ids[1])
         page_count = t1.page_count + t2.page_count
         big_page_count = Page.total_to_index()
-        assert mock_process.call_args.kwargs.get('args')[1] == page_count # behavior specifying source ids
-        assert mock_process.call_args.kwargs.get('args')[1] != big_page_count # normal behavior without specifying source ids
+
+        assert (
+            mock_process.call_args.kwargs.get("args")[1] == page_count
+        )  # behavior specifying source ids
+        assert (
+            mock_process.call_args.kwargs.get("args")[1] != big_page_count
+        )  # normal behavior without specifying source ids
+
+    def test_index_pages_expedite(self, mock_process, mock_progbar, mock_sleep):
+        # test calling from command line
+        stdout = StringIO()
+        call_command("index_pages", stdout=stdout, expedite=True)
+        output = stdout.getvalue()
+        # should report on work/page indexing & mismatch
+        expected_strings = [
+            "works not indexed in Solr",
+            "pages not indexed in Solr",
+            "page count mismatches",
+        ]
+        for msg in expected_strings:
+            assert msg in output
+
+        # should suppress reporting if verbosity is 0
+        stdout = StringIO()
+        call_command("index_pages", stdout=stdout, expedite=True, verbosity=0)
+        output = stdout.getvalue()
+        for msg in expected_strings:
+            assert msg not in output
