From 6169c4554680f7c9c6dab85dc83ec09980b4c05e Mon Sep 17 00:00:00 2001
From: Rebecca Sutton Koeser
Date: Thu, 29 Feb 2024 14:30:57 -0500
Subject: [PATCH 01/71] search within cluster revisions (#593)

* Don't include keyword search term when linking to cluster search fixes #545
* Adjust result language so it is accurate when searching within a cluster fixes #545
* Update unit tests for change to language and cluster search link
---
 ppa/archive/templates/archive/snippets/cluster_work.html | 4 ++--
 ppa/archive/templates/archive/snippets/search_form.html  | 2 +-
 ppa/archive/tests/test_views.py                          | 8 ++++++--
 3 files changed, 9 insertions(+), 5 deletions(-)

diff --git a/ppa/archive/templates/archive/snippets/cluster_work.html b/ppa/archive/templates/archive/snippets/cluster_work.html
index 20881b55..8a0fdad2 100644
--- a/ppa/archive/templates/archive/snippets/cluster_work.html
+++ b/ppa/archive/templates/archive/snippets/cluster_work.html
@@ -1,6 +1,6 @@
 {# NOTE: expects cluster_id to be passed in #}
 {# TODO: include count if we can easily get it ; search and browse x digitized works within cluster #}
- {# ONLY include keyword search parameter, no other filters or sort options #}
- search and browse within cluster
+ {# do NOT include any search parameters, to avoid unintentionally hiding cluster results #}
+ search and browse within cluster
diff --git a/ppa/archive/templates/archive/snippets/search_form.html b/ppa/archive/templates/archive/snippets/search_form.html
index 706db13c..85ac6d61 100644
--- a/ppa/archive/templates/archive/snippets/search_form.html
+++ b/ppa/archive/templates/archive/snippets/search_form.html
@@ -82,7 +82,7 @@
- Displaying {{ paginator.count|intcomma }} digitized work{{ paginator.count|pluralize }} or clusters of works
+ Displaying {{ paginator.count|intcomma }} digitized work{{ paginator.count|pluralize }}{% if not search_form.cluster.value %} or clusters of works{% endif %}
  Work citations can be exported to Zotero
diff --git a/ppa/archive/tests/test_views.py b/ppa/archive/tests/test_views.py index 3091f207..2d5ec96a 100644 --- a/ppa/archive/tests/test_views.py +++ b/ppa/archive/tests/test_views.py @@ -631,10 +631,10 @@ def test_keyword_search(self): self.assertContains(response, "search and browse within cluster", count=1) - # link preserves keyword arg only but not any other parameters + # cluster link should not preserve ANY search parameters self.assertContains( response, - "search and browse within cluster", # noqa: E501 + "search and browse within cluster", # noqa: E501 html=True, ) self.assertNotContains( @@ -763,6 +763,10 @@ def test_search_within_cluster(self): self.assertContains( response, "You are searching and browsing within a cluster." ) + # this cluster only has one record + self.assertContains(response, "Displaying 1 digitized work") + # search within cluster should not report containing clusters of works + self.assertNotContains(response, "or clusters of works") # should link back to main archive search self.assertContains(response, reverse("archive:list")) From 9a25d258909e868fb6aa57fd1c4448688480ff3f Mon Sep 17 00:00:00 2001 From: rlskoeser Date: Thu, 29 Feb 2024 14:53:16 -0500 Subject: [PATCH 02/71] Don't display uncategorized collection in search if facet count is zero fixes #542 --- ppa/archive/forms.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/ppa/archive/forms.py b/ppa/archive/forms.py index 608233fa..37ba3e99 100644 --- a/ppa/archive/forms.py +++ b/ppa/archive/forms.py @@ -400,8 +400,8 @@ def set_choices_from_facets(self, facets): choices.append((itervalue, label)) # if there are any items not in a collection, add an option - # so they will be findable - if NO_COLLECTION_LABEL in facet_dict: + # so they will be findable; only include if facet count is non-zero + if facet_dict.get(NO_COLLECTION_LABEL, 0): choices.append( ( ModelMultipleChoiceFieldWithEmpty.EMPTY_ID, From ff397b5fc6ff374b251acd358c3dc4cf2c9cd430 Mon Sep 17 00:00:00 2001 From: Rebecca Sutton Koeser Date: Tue, 5 Mar 2024 15:12:40 -0500 Subject: [PATCH 03/71] Manage command to update hathitrust page counts (#594); do not save object in count_pages method * Manage command to update hathitrust page counts * Update ppa/archive/management/commands/update_hathi_pagecounts.py Co-authored-by: Laure Thompson <602628+laurejt@users.noreply.github.com> * Revise count_pages method so it does not automatically save the object * Clean up formatting and remove unused import per @laurejt feedback --------- Co-authored-by: Laure Thompson <602628+laurejt@users.noreply.github.com> --- ppa/archive/import_util.py | 7 +- .../management/commands/hathi_import.py | 25 ++++-- .../commands/update_hathi_pagecounts.py | 87 +++++++++++++++++++ ppa/archive/models.py | 13 ++- ppa/archive/tests/test_models.py | 5 +- 5 files changed, 118 insertions(+), 19 deletions(-) create mode 100644 ppa/archive/management/commands/update_hathi_pagecounts.py diff --git a/ppa/archive/import_util.py b/ppa/archive/import_util.py index 07180bdf..84f4e176 100644 --- a/ppa/archive/import_util.py +++ b/ppa/archive/import_util.py @@ -173,7 +173,8 @@ class HathiImporter(DigitizedWorkImporter): hathi.HathiItemForbidden: "Permission denied to download data.", RSYNC_ERROR: "Failed to sync data", # only saw this one on day, but this was what it was - JSONDecodeError: "HathiTrust catalog temporarily unavailable (malformed response).", + JSONDecodeError: "HathiTrust catalog temporarily unavailable " + + "(malformed response).", } ) @@ -256,7 
+257,6 @@ def rsync_data(self): # temporary preserve file for dev delete=False, ) as fp: - file_paths = list(self.pairtree_paths.values()) # sorting makes rsync more efficient file_paths.sort() @@ -337,6 +337,9 @@ def import_digitizedwork(self, htid, log_msg_src, user): if digwork: # populate page count digwork.count_pages() + # save the page count to the database + if digwork.has_changed("page_count"): + digwork.save() self.imported_works.append(digwork) self.results[htid] = self.SUCCESS diff --git a/ppa/archive/management/commands/hathi_import.py b/ppa/archive/management/commands/hathi_import.py index ce17fa7d..817fd6d0 100644 --- a/ppa/archive/management/commands/hathi_import.py +++ b/ppa/archive/management/commands/hathi_import.py @@ -122,9 +122,7 @@ def handle(self, *args, **kwargs): if self.options["progress"]: progbar = progressbar.ProgressBar( - redirect_stdout=True, - max_value=self.stats["total"], - max_error=False + redirect_stdout=True, max_value=self.stats["total"], max_error=False ) else: progbar = None @@ -148,7 +146,13 @@ def handle(self, *args, **kwargs): # count pages in the pairtree zip file and update digwork page count try: self.stats["pages"] += digwork.count_pages() - except (storage_exceptions.ObjectNotFoundException, IndexError): # IndexError on filepath + # update page count in the database + if digwork.has_changed("page_count"): + digwork.save() + except ( + storage_exceptions.ObjectNotFoundException, + IndexError, + ): # IndexError on filepath self.stderr.write("%s not found in datastore" % digwork.source_id) if progbar: @@ -156,7 +160,8 @@ def handle(self, *args, **kwargs): summary = ( "\nProcessed {:,d} item{} for import." - + "\nAdded {:,d}; updated {:,d}; skipped {:,d}; {:,d} error{}; imported {:,d} page{}." + + "\nAdded {:,d}; updated {:,d}; skipped {:,d}; " + + "{:,d} error{}; imported {:,d} page{}." ) summary = summary.format( self.stats["total"], @@ -172,7 +177,7 @@ def handle(self, *args, **kwargs): self.stdout.write(summary) def initialize_pairtrees(self): - """Initiaulize pairtree storage clients for each + """Initialize pairtree storage clients for each subdirectory in the configured **HATHI_DATA** path.""" # if the configured directory does not exist or is not @@ -192,8 +197,12 @@ def initialize_pairtrees(self): # may be in there, and so forth. 
if os.path.isdir(ht_data_dir): prefix = os.path.basename(ht_data_dir) - logger.debug(f'Initializing pair tree in ({ht_data_dir}) [prefix={prefix}]') - hathi_ptree = pairtree_client.PairtreeStorageClient(prefix, ht_data_dir) + logger.debug( + f"Initializing pair tree in ({ht_data_dir}) [prefix={prefix}]" + ) + hathi_ptree = pairtree_client.PairtreeStorageClient( + prefix, ht_data_dir + ) # store initialized pairtree client by prefix for later use self.hathi_pairtree[prefix] = hathi_ptree diff --git a/ppa/archive/management/commands/update_hathi_pagecounts.py b/ppa/archive/management/commands/update_hathi_pagecounts.py new file mode 100644 index 00000000..c1b172a0 --- /dev/null +++ b/ppa/archive/management/commands/update_hathi_pagecounts.py @@ -0,0 +1,87 @@ +from django.conf import settings +from django.core.management.base import BaseCommand +from django.contrib.admin.models import CHANGE, LogEntry +from django.contrib.auth.models import User +from django.contrib.contenttypes.models import ContentType +from pairtree import storage_exceptions +from parasolr.django.signals import IndexableSignalHandler + +from ppa.archive.models import DigitizedWork + + +class Command(BaseCommand): + """Update database page counts for non-excerpted HathiTrust digitized items. + By default, runs on all non-excerpted, public HathiTrust items. + """ + + help = __doc__ + + #: normal verbosity level + v_normal = 1 + #: verbosity level for the current run; defaults to 1 / normal + verbosity = v_normal + + def add_arguments(self, parser): + parser.add_argument( + "source_ids", nargs="*", help="List of specific items to update (optional)" + ) + + def handle(self, *args, **kwargs): + self.verbosity = kwargs.get("verbosity", self.verbosity) + source_ids = kwargs.get("source_ids", []) + # page count does not affect solr indexing, so disconnect signal handler + IndexableSignalHandler.disconnect() + + script_user = User.objects.get(username=settings.SCRIPT_USERNAME) + digwork_contentype = ContentType.objects.get_for_model(DigitizedWork) + + # find all non-excerpted, non-suppressed hathi volumes + hathi_vols = DigitizedWork.objects.filter( + source=DigitizedWork.HATHI, + item_type=DigitizedWork.FULL, + status=DigitizedWork.PUBLIC, + ) + # if source ids are specified, limit to those records only + if source_ids: + hathi_vols = hathi_vols.filter(source_id__in=source_ids) + + stats = {"updated": 0, "unchanged": 0, "missing_data": 0} + + for digwork in hathi_vols: + try: + # store the current page count + old_page_count = digwork.page_count + # recalculate page count from pairtree data + # NOTE: this method automatically saves if page count changes + digwork.page_count = digwork.count_pages() + if digwork.has_changed("page_count"): + digwork.save() + stats["updated"] += 1 + # create a log entry documenting page count change + LogEntry.objects.log_action( + user_id=script_user.pk, + content_type_id=digwork_contentype.pk, + object_id=digwork.pk, + object_repr=str(digwork), + change_message=f"Recalculated page count (was {old_page_count}, " + + f"now {digwork.page_count})", + action_flag=CHANGE, + ) + + else: + stats["unchanged"] += 1 + + except storage_exceptions.ObjectNotFoundException: + if self.verbosity >= self.v_normal: + self.stderr.write( + self.style.WARNING(f"Pairtree data for {digwork} not found") + ) + stats["missing_data"] += 1 + + # report a summary of what was done + if self.verbosity >= self.v_normal: + self.stdout.write( + f"Volumes with updated page count: {stats['updated']:,}" + + f"\n\tPage count unchanged: 
{stats['unchanged']:,}" + + f"\n\tMissing pairtree data: {stats['missing_data']:,}" + ) diff --git a/ppa/archive/models.py b/ppa/archive/models.py index 1e63123d..f7959722 100644 --- a/ppa/archive/models.py +++ b/ppa/archive/models.py @@ -252,7 +252,7 @@ def cluster_save(sender, instance, **kwargs): logger.debug( "cluster id has changed, reindexing %d works and %d pages", works.count(), - page_count["page_count"], + page_count.get("page_count", 0), ) DigitizedWork.index_items(works) # reindex pages (this may be slow...) @@ -907,7 +907,8 @@ def count_pages(self, ptree_client=None): number of files in the zipfile within the pairtree content (Hathi-specific). Raises :class:`pairtree.storage_exceptions.ObjectNotFoundException` if the data is not found in the pairtree storage. Returns page count - found; saves the object if the count changes.""" + found; updates the `page_count` attribute on the current instance, + but does NOT save the object.""" # if this item has a page span defined, calculate number of pages # based on the number of pages across all spans @@ -941,11 +942,9 @@ def count_pages(self, ptree_client=None): # NOTE: could also count pages via mets file, but that's slower # than counting via zipfile name list - # store page count in the database if changed - if self.page_count != page_count: - self.page_count = page_count - self.save() - + # update page count on the instance, but don't save changes + self.page_count = page_count + # return the total return page_count @property diff --git a/ppa/archive/tests/test_models.py b/ppa/archive/tests/test_models.py index 906ebfe2..2aaec020 100644 --- a/ppa/archive/tests/test_models.py +++ b/ppa/archive/tests/test_models.py @@ -633,8 +633,9 @@ def test_count_pages(self, mockzipfile): # return total and digitized work page counts updated assert page_count == 2 - digwork = DigitizedWork.objects.get(source_id=digwork.source_id) - assert digwork.page_count == 2 + # does NOT save automatically + db_digwork = DigitizedWork.objects.get(source_id=digwork.source_id) + assert db_digwork.page_count is None # should ignore non-text files page_files = ["0001.txt", "00002.txt", "00001.jp2", "00002.jp2"] From 9f514dd73fd75acef6ebf9692ced270db1caff89 Mon Sep 17 00:00:00 2001 From: Laure Thompson <602628+laurejt@users.noreply.github.com> Date: Tue, 5 Mar 2024 15:20:48 -0500 Subject: [PATCH 04/71] Fix for pairtree prefix and version file issue (#611) * Added code to create pairtree prefix and version files when the pairtree directories exist, but the files needed by the pairtree package do not. NOTE: This update only corrects the issue for HathiTrust imports via the admin interface but not bulk imports. * Refactored HathiObject for better handling of volume and pairtree identifiers. --- ppa/archive/hathi.py | 60 +++++++++++++++++++-------------- ppa/archive/tests/test_hathi.py | 59 +++++++++++++++++++++++++------- 2 files changed, 81 insertions(+), 38 deletions(-) diff --git a/ppa/archive/hathi.py b/ppa/archive/hathi.py index 4ff11beb..0ee723e2 100644 --- a/ppa/archive/hathi.py +++ b/ppa/archive/hathi.py @@ -242,36 +242,47 @@ class HathiObject: """An object for working with a HathiTrust item with data in a locally configured pairtree datastore.""" - hathi_id = None + # Pairtree version statement usd by pairtree package + pairtree_version_stmt = ( + "This directory conforms to Pairtree Version 0.1. 
Updated spec: " + + "http://www.cdlib.org/inside/diglib/pairtree/pairtreespec.html" + ) def __init__(self, hathi_id): + # HathiTrust record id self.hathi_id = hathi_id - - @cached_property - def pairtree_prefix(self): - """pairtree prefix (first portion of the hathi id, short-form - identifier for owning institution)""" - return self.hathi_id.split(".", 1)[0] - - @cached_property - def pairtree_id(self): - """pairtree identifier (second portion of source id)""" - return self.hathi_id.split(".", 1)[1] - - @cached_property - def content_dir(self): - """content directory for this work within the appropriate - pairtree""" - # contents are stored in a directory named based on a - # pairtree encoded version of the id - return pairtree_path.id_encode(self.pairtree_id) + # Identifiers for owning institution and volume which form the overall + # HathiTrust record id: [lib_id].[vol_id] + self.lib_id, self.vol_id = hathi_id.split(".", 1) + # Pairtree prefix + self.pairtree_prefix = f"{self.lib_id}." + # Content directory for this work within the appropriate pairtree + # which is based on a pairtree encoded version of the volume id + self.content_dir = pairtree_path.id_encode(self.vol_id) def pairtree_client(self): """Initialize a pairtree client for the pairtree datastore this - object belongs to, based on its Hathi prefix id.""" + object belongs to, based on its HathiTrust record id.""" + store_dir = os.path.join(settings.HATHI_DATA, self.lib_id) + + # Check if store_dir exists, check if pairtree files exist + if os.path.isdir(store_dir): + # Check if "pairtree_prefix" file exists. If not, create it. + pairtree_prefix_fn = os.path.join(store_dir, "pairtree_prefix") + if not os.path.isfile(pairtree_prefix_fn): + with open(pairtree_prefix_fn, mode='w') as writer: + writer.write(self.pairtree_prefix) + # Check if "pairtree_version0_1" file exists. If not, create it. + # Note: Mimicking paitree packages behavior. 
File contents are not + # actually verified + pairtree_vn_fn = os.path.join(store_dir, "pairtree_version0_1") + if not os.path.isfile(pairtree_vn_fn): + with open(pairtree_vn_fn, mode="w") as writer: + writer.write(self.pairtree_version_stmt) + return pairtree_client.PairtreeStorageClient( self.pairtree_prefix, - os.path.join(settings.HATHI_DATA, self.pairtree_prefix), + store_dir, ) def pairtree_object(self, ptree_client=None, create=False): @@ -287,13 +298,13 @@ def pairtree_object(self, ptree_client=None, create=False): ptree_client = self.pairtree_client() # return the pairtree object for current work - return ptree_client.get_object(self.pairtree_id, create_if_doesnt_exist=create) + return ptree_client.get_object(self.vol_id, create_if_doesnt_exist=create) def delete_pairtree_data(self): """Delete pairtree object from the pairtree datastore.""" logger.info("Deleting pairtree data for %s", self.hathi_id) try: - self.pairtree_client().delete_object(self.pairtree_id) + self.pairtree_client().delete_object(self.vol_id) except storage_exceptions.ObjectNotFoundException: # data is already gone; warn, but not an error logger.warning( @@ -314,7 +325,6 @@ def _content_path(self, ext, ptree_client=None): raise storage_exceptions.PartNotFoundException return os.path.join(pairtree_obj.id_to_dirpath(), self.content_dir, filepaths[0]) - def zipfile_path(self, ptree_client=None): """path to zipfile within the hathi contents for this work""" return self._content_path("zip", ptree_client=ptree_client) diff --git a/ppa/archive/tests/test_hathi.py b/ppa/archive/tests/test_hathi.py index 5b269143..3ef2aca2 100644 --- a/ppa/archive/tests/test_hathi.py +++ b/ppa/archive/tests/test_hathi.py @@ -1,5 +1,5 @@ import json -import os.path +import os import tempfile from datetime import date from unittest.mock import Mock, patch @@ -222,17 +222,50 @@ class TestHathiObject: ht_tempdir = tempfile.TemporaryDirectory(prefix="ht_text_pd") - def test_pairtree_prefix(self): + def test_init(self): hobj = hathi.HathiObject(hathi_id="uva.1234") - assert hobj.pairtree_prefix == "uva" + assert hobj.lib_id == "uva" + assert hobj.vol_id == "1234" + assert hobj.pairtree_prefix == "uva." 
+ assert hobj.content_dir == pairtree_path.id_encode(hobj.vol_id) - def test_pairtree_id(self): - hobj = hathi.HathiObject(hathi_id="uva.1234") - assert hobj.pairtree_id == "1234" - - def test_content_dir(self): + @override_settings(HATHI_DATA=ht_tempdir.name) + def test_pairtree_client(self): hobj = hathi.HathiObject(hathi_id="uva.1234") - assert hobj.content_dir == pairtree_path.id_encode(hobj.pairtree_id) + store_dir = os.path.join(settings.HATHI_DATA, hobj.lib_id) + + # Case 1: Initialize client without directory + ptree_client = hobj.pairtree_client() + + # assert file "pairtree_prefix" exists with correct contents + ptree_pfx_fn = os.path.join(store_dir, "pairtree_prefix") + with open(ptree_pfx_fn) as reader: + ptree_pfx_contents = reader.read() + assert ptree_pfx_contents == hobj.pairtree_prefix + + # assert file "pairtree_version0_1" exists with correct contents + ptree_vn_fn = os.path.join(store_dir, "pairtree_version0_1") + with open(ptree_vn_fn) as reader: + ptree_vn_contents = reader.read() + assert ptree_vn_contents == hobj.pairtree_version_stmt + + # Case 2: initialize client with directory but without files + os.remove(ptree_pfx_fn) + os.remove(ptree_vn_fn) + + ptree_client = hobj.pairtree_client() + + # assert file "pairtree_prefix" exists with correct contents + ptree_pfx_fn = os.path.join(store_dir, "pairtree_prefix") + with open(ptree_pfx_fn) as reader: + ptree_pfx_contents = reader.read() + assert ptree_pfx_contents == hobj.pairtree_prefix + + # assert file "pairtree_version0_1" exists with correct contents + ptree_vn_fn = os.path.join(store_dir, "pairtree_version0_1") + with open(ptree_vn_fn) as reader: + ptree_vn_contents = reader.read() + assert ptree_vn_contents == hobj.pairtree_version_stmt @patch("ppa.archive.hathi.pairtree_client") @override_settings(HATHI_DATA=ht_tempdir.name) @@ -243,11 +276,11 @@ def test_pairtree_object(self, mock_pairtree_client): # client initialized mock_pairtree_client.PairtreeStorageClient.assert_called_with( hobj.pairtree_prefix, - os.path.join(settings.HATHI_DATA, hobj.pairtree_prefix), + os.path.join(settings.HATHI_DATA, hobj.lib_id), ) # object retrieved mock_pairtree_client.PairtreeStorageClient.return_value.get_object.assert_called_with( - hobj.pairtree_id, create_if_doesnt_exist=False + hobj.vol_id, create_if_doesnt_exist=False ) # object returned assert ( @@ -263,7 +296,7 @@ def test_pairtree_object(self, mock_pairtree_client): mock_pairtree_client.PairtreeStorageClient.assert_not_called() # should get object from my client my_ptree_client.get_object.assert_called_with( - hobj.pairtree_id, create_if_doesnt_exist=False + hobj.vol_id, create_if_doesnt_exist=False ) @override_settings(HATHI_DATA=ht_tempdir.name) @@ -330,7 +363,7 @@ def test_delete_pairtree_data(self): mock_pairtree_client.assert_called() # should call delete boject mock_pairtree_client.return_value.delete_object.assert_called_with( - hobj.pairtree_id + hobj.vol_id ) # should not raise an exception if deletion fails From 85bbf8f336fd5e6d0edcaa0dd0e1a7447c22207c Mon Sep 17 00:00:00 2001 From: rlskoeser Date: Thu, 7 Mar 2024 09:34:03 -0500 Subject: [PATCH 05/71] Update codecov action for javascript unit tests to v4 --- .github/workflows/unit-tests.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/unit-tests.yml b/.github/workflows/unit-tests.yml index 1e348837..40922993 100644 --- a/.github/workflows/unit-tests.yml +++ b/.github/workflows/unit-tests.yml @@ -28,7 +28,7 @@ jobs: ${{ runner.os }}-node- - run: npm ci - run: npm run 
test:unit - - uses: codecov/codecov-action@v3 + - uses: codecov/codecov-action@v4 with: flags: javascript From 6fc542041568f1b36fda1f79f87a78ff5ffbdbe6 Mon Sep 17 00:00:00 2001 From: rlskoeser Date: Thu, 7 Mar 2024 10:53:42 -0500 Subject: [PATCH 06/71] Add method to get first page in original page range for excerpts - make first page method explicit that it is digital - test both digital and original first page methods --- ppa/archive/hathi.py | 30 ++++++++++++++++++++++-------- ppa/archive/models.py | 29 ++++++++++++++++++++++++++--- ppa/archive/tests/test_models.py | 13 +++++++++++++ 3 files changed, 61 insertions(+), 11 deletions(-) diff --git a/ppa/archive/hathi.py b/ppa/archive/hathi.py index 0ee723e2..eeb7bc18 100644 --- a/ppa/archive/hathi.py +++ b/ppa/archive/hathi.py @@ -189,7 +189,7 @@ class StructMapPage(_METS): - """ + """ # noqa: E501 @cached_property def display_label(self): @@ -244,9 +244,9 @@ class HathiObject: # Pairtree version statement usd by pairtree package pairtree_version_stmt = ( - "This directory conforms to Pairtree Version 0.1. Updated spec: " + - "http://www.cdlib.org/inside/diglib/pairtree/pairtreespec.html" - ) + "This directory conforms to Pairtree Version 0.1. Updated spec: " + + "http://www.cdlib.org/inside/diglib/pairtree/pairtreespec.html" + ) def __init__(self, hathi_id): # HathiTrust record id @@ -264,13 +264,13 @@ def pairtree_client(self): """Initialize a pairtree client for the pairtree datastore this object belongs to, based on its HathiTrust record id.""" store_dir = os.path.join(settings.HATHI_DATA, self.lib_id) - + # Check if store_dir exists, check if pairtree files exist if os.path.isdir(store_dir): # Check if "pairtree_prefix" file exists. If not, create it. pairtree_prefix_fn = os.path.join(store_dir, "pairtree_prefix") if not os.path.isfile(pairtree_prefix_fn): - with open(pairtree_prefix_fn, mode='w') as writer: + with open(pairtree_prefix_fn, mode="w") as writer: writer.write(self.pairtree_prefix) # Check if "pairtree_version0_1" file exists. If not, create it. # Note: Mimicking paitree packages behavior. 
File contents are not @@ -320,10 +320,12 @@ def _content_path(self, ext, ptree_client=None): parts = pairtree_obj.list_parts(self.content_dir) # find the first zipfile in the list (should only be one) filepaths = [part for part in parts if part.endswith(ext)] - if not filepaths: + if not filepaths: # An error has occurred -- there is no zip file here in parts raise storage_exceptions.PartNotFoundException - return os.path.join(pairtree_obj.id_to_dirpath(), self.content_dir, filepaths[0]) + return os.path.join( + pairtree_obj.id_to_dirpath(), self.content_dir, filepaths[0] + ) def zipfile_path(self, ptree_client=None): """path to zipfile within the hathi contents for this work""" @@ -332,3 +334,15 @@ def zipfile_path(self, ptree_client=None): def metsfile_path(self, ptree_client=None): """path to mets xml file within the hathi contents for this work""" return self._content_path(".mets.xml", ptree_client=ptree_client) + + def mets_xml(self) -> MinimalMETS: + """load METS xml file from pairtree and initialize as an instance + of :class:`MinimalMETS` + + :rtype: :class:`MinimalMETS` + :raises: :class:`storage_exceptions.ObjectNotFoundException` if the + object is not found in pairtree storage + :raises: :class:`storage_exceptions.PartNotFoundException` if the + mets.xml flie is not found in pairtree storage for this object + """ + return xmlmap.load_xmlobject_from_file(self.metsfile_path(), MinimalMETS) diff --git a/ppa/archive/models.py b/ppa/archive/models.py index f7959722..569f7175 100644 --- a/ppa/archive/models.py +++ b/ppa/archive/models.py @@ -11,7 +11,6 @@ from django.core.exceptions import ValidationError from django.db import models from django.urls import reverse -from eulxml.xmlmap import load_xmlobject_from_file from flags import Flags from intspan import ParseError as IntSpanParseError from intspan import intspan @@ -24,7 +23,7 @@ from wagtail.snippets.models import register_snippet from ppa.archive.gale import GaleAPI, MARCRecordNotFound, get_marc_record -from ppa.archive.hathi import HathiBibliographicAPI, HathiObject, MinimalMETS +from ppa.archive.hathi import HathiBibliographicAPI, HathiObject logger = logging.getLogger(__name__) @@ -811,9 +810,32 @@ def populate_from_bibdata(self, bibdata): def first_page(self): """Number of the first page in range, if this is an excerpt""" + # return digital page for now; may be switching to original + # or this method may be going away + return self.first_page_digital() + + def first_page_digital(self): + """Number of the first page in range (digital pages / page index), + if this is an excerpt. + + :return: first page number for digital page range; None if no page range + :rtype: int, None + """ if self.pages_digital: return list(self.page_span)[0] + def first_page_original(self): + """Number of the first page in range (original page numbering) + if this is an excerpt + + :return: first page number for original page range; None if no page range + :rtype: str, None + """ + # use regex since it handles all cases (intspan only works for a subset) + match = re.match(r"([\da-z]+)([,-]|\b)", self.pages_orig) + if match: + return match.group(1) + def index_id(self): """use source id + first page in range (if any) as solr identifier""" first_page = self.first_page() @@ -949,6 +971,7 @@ def count_pages(self, ptree_client=None): @property def page_span(self): + # TODO: relabel to make it explicit that this is digital pages? 
# convert the specified page numbers into an intspan # if empty, returns an empty set return intspan(self.pages_digital) @@ -1140,7 +1163,7 @@ def hathi_page_index_data(cls, digwork): # load mets record to pull metadata about the images try: - mmets = load_xmlobject_from_file(digwork.hathi.metsfile_path(), MinimalMETS) + mmets = digwork.hathi.mets_xml() except storage_exceptions.ObjectNotFoundException: logger.error( "Pairtree data for %s not found but status is %s", diff --git a/ppa/archive/tests/test_models.py b/ppa/archive/tests/test_models.py index 2aaec020..b97a9ef5 100644 --- a/ppa/archive/tests/test_models.py +++ b/ppa/archive/tests/test_models.py @@ -778,6 +778,19 @@ def test_page_range_validation(self): work.clean_fields() assert "start value should exceed stop (355-35)" in str(err) + def test_first_page_digital(self): + assert DigitizedWork(pages_digital="133-135").first_page_digital() == 133 + + def test_first_page_original(self): + # citation-style page range (second number is incomplete) + assert DigitizedWork(pages_orig="133-5").first_page_original() == "133" + # single page number + assert DigitizedWork(pages_orig="133").first_page_original() == "133" + # discontinuous page range + assert DigitizedWork(pages_orig="133, 134").first_page_original() == "133" + # roman numreals + assert DigitizedWork(pages_orig="iii-xiv").first_page_original() == "iii" + def test_is_suppressed(self): work = DigitizedWork(source_id="chi.79279237") assert not work.is_suppressed From 63d9079e9d6bb4cea93ae4dcf3e3617e635778c9 Mon Sep 17 00:00:00 2001 From: rlskoeser Date: Thu, 7 Mar 2024 11:17:06 -0500 Subject: [PATCH 07/71] Preliminary manage command to check on excerpt page range mismatches ref #560 --- .../commands/check_hathi_excerpts.py | 128 ++++++++++++++++++ 1 file changed, 128 insertions(+) create mode 100644 ppa/archive/management/commands/check_hathi_excerpts.py diff --git a/ppa/archive/management/commands/check_hathi_excerpts.py b/ppa/archive/management/commands/check_hathi_excerpts.py new file mode 100644 index 00000000..ede456d4 --- /dev/null +++ b/ppa/archive/management/commands/check_hathi_excerpts.py @@ -0,0 +1,128 @@ +import csv + +from django.core.management.base import BaseCommand +from pairtree import storage_exceptions + +from intspan import intspan + +from ppa.archive.models import DigitizedWork +from ppa.archive.templatetags.ppa_tags import hathi_page_url + + +class Command(BaseCommand): + """Check page alignment for excerpted HathiTrust digitized items.""" + + help = __doc__ + + #: normal verbosity level + v_normal = 1 + #: verbosity level for the current run; defaults to 1 / normal + verbosity = v_normal + + def handle(self, *args, **kwargs): + self.verbosity = kwargs.get("verbosity", self.verbosity) + + # find all excerpted, non-suppressed hathi volumes + hathi_vols = DigitizedWork.objects.filter( + source=DigitizedWork.HATHI, + status=DigitizedWork.PUBLIC, + ).exclude(item_type=DigitizedWork.FULL) + + output_fields = [ + "source_id", + "unique_id", + "pages_orig", + "pages_digital", + "orig_label_match", + "pages_digital_corrected", + "old_hathi_start", + "new_hathi_start", + "notes", + ] + + with open("ppa-excerpt-pagecheck.csv", "w") as csvfile: + csvwriter = csv.DictWriter(csvfile, fieldnames=output_fields) + csvwriter.writeheader() + + for digwork in hathi_vols: + info = { + "source_id": digwork.source_id, + # source id + first page (currently digital, will be switching to original) + "unique_id": digwork.index_id(), + "pages_orig": digwork.pages_orig, + 
"pages_digital": digwork.pages_digital, + "old_hathi_start": hathi_page_url( + digwork.source_id, digwork.first_page_digital() + ), + } + # NOTE: mets loading copied from hathi_page_index_data method + # worth movint to a method on the hathi object? + try: + mmets = digwork.hathi.mets_xml() + except storage_exceptions.ObjectNotFoundException: + # document the error in the output csv, stop processing + info["notes"] = "pairtree data not found" + csvwriter.writerow(info) + continue + except storage_exceptions.PartNotFoundException: + info["notes"] = "error loading mets file (part not found)" + csvwriter.writerow(info) + continue + + # make a list of page labels and order from mets structmap + page_info = [ + {"order": page.order, "label": page.orderlabel} + # also have access to label (@LABEL vs @ORDERLABEL) + for page in mmets.structmap_pages + ] + + # use digital page range to get the first page in the mets + # that would be included with current digital range (1-based index) + try: + excerpt_first_page = page_info[digwork.first_page_digital() + 1] + except IndexError: + if digwork.first_page_digital() >= len(page_info): + excerpt_first_page[-1] + info["notes"] = "digital page out of range; trying last page" + + # some mets records don't have labels + # or, label attribute may be present but empty + # do we need to check if all pages are missing labels? + if ( + excerpt_first_page["label"] is None + or excerpt_first_page["label"].strip() == "" + ): + # add a note that mets doesn't have labels, stop processing + info["notes"] = "no page label in METS structmap" + csvwriter.writerow(info) + continue + + # check if METS page label for the first page in range + # matches the desired first original page + if excerpt_first_page["label"] != str(digwork.first_page_original()): + info["orig_label_match"] = "N" + # if they don't match, can we calculate the offset? + # (only works for numeric page labels) + try: + diff = int(digwork.first_page_original()) - int( + excerpt_first_page["label"] + ) + # calculate the expected new digital page range + # - apply the difference to each number in range, + # since we do have some discontinuous ranges + # - convert back to intspan so we can output in + # page range format (1-3 or 1-3,5) + new_range = [n + diff for n in digwork.page_span] + info["pages_digital_corrected"] = intspan(new_range) + info["new_hathi_start"] = hathi_page_url( + digwork.source_id, new_range[0] + ) + except ValueError as err: + info["notes"] = "could not calculate page offset (%s)" % err + + else: + info["orig_label_match"] = "Y" + info["notes"] = "page labels match" + + # either way, write out the info + csvwriter.writerow(info) From 0c89dd24043b625fff4b882416508f536354a00e Mon Sep 17 00:00:00 2001 From: rlskoeser Date: Thu, 7 Mar 2024 12:49:14 -0500 Subject: [PATCH 08/71] Handle rsync for more records at once, add optional output dir param --- ppa/archive/import_util.py | 29 +++++++++++++++++++++++------ 1 file changed, 23 insertions(+), 6 deletions(-) diff --git a/ppa/archive/import_util.py b/ppa/archive/import_util.py index 84f4e176..734f8add 100644 --- a/ppa/archive/import_util.py +++ b/ppa/archive/import_util.py @@ -159,7 +159,14 @@ def import_digitizedwork(self, log_msg_src=None, user=None): class HathiImporter(DigitizedWorkImporter): """Logic for creating new :class:`~ppa.archive.models.DigitizedWork` - records from HathiTrust. For use in views and manage commands.""" + records from HathiTrust. For use in views and manage commands. 
+ + :param list source_ids: list of HathiTrust source ids (htid) to + synchronize (optional) + :param bool rsync_output: determines whether rsync itemized report + is enabled (default: False) + :param str output_dir: base directory for rsync output file (optional) + """ #: rsync error RSYNC_ERROR = 4 @@ -178,10 +185,11 @@ class HathiImporter(DigitizedWorkImporter): } ) - def __init__(self, source_ids=None, rsync_output=False): + def __init__(self, source_ids=None, rsync_output=False, output_dir=""): super().__init__(source_ids) # track whether (and how much) rsync output is desired self.rsync_output = rsync_output + self.output_dir = output_dir def filter_invalid_ids(self): """Remove any ids that don't look valid. At minimum, must @@ -247,7 +255,15 @@ def pairtree_paths(self): def rsync_data(self): """Use rsync to retrieve data for the volumes to be imported.""" - logger.info("rsyncing pairtree data for %s", ", ".join(self.source_ids)) + # limit the number of ids included in the log message + log_detail = "" + rsync_count = len(self.source_ids) + if rsync_count <= 12: + log_detail = ", ".join(self.source_ids) + else: + log_detail = "%d volumes" % rsync_count + + logger.info("rsyncing pairtree data for %s", log_detail) # create temp file with list of paths to synchronize with tempfile.NamedTemporaryFile( @@ -271,8 +287,9 @@ def rsync_data(self): # if rsync output requested, include itemize and log fileargs output_opts = "" if self.rsync_output: - outputfilename = "ppa_hathi_rsync_%s.log" % datetime.now().strftime( - "%Y%m%d-%H%M%S" + outputfilename = os.path.join( + self.output_dir, + "ppa_hathi_rsync_%s.log" % datetime.now().strftime("%Y%m%d-%H%M%S"), ) # output requested: always log content to a file output_opts = "--log-file=%s" % outputfilename @@ -292,7 +309,7 @@ def rsync_data(self): subprocess.run(args=rsync_cmd.split(), check=True) except subprocess.CalledProcessError as err: logger.error( - "HathiTrust rsync failed — %s / command: %s" + "HathiTrust rsync failed — %s / command: %s" % (self.RSYNC_RETURN_CODES[err.returncode], rsync_cmd) ) From 55559278f218651040ae7aa825316ad4fc70852b Mon Sep 17 00:00:00 2001 From: rlskoeser Date: Thu, 7 Mar 2024 12:51:35 -0500 Subject: [PATCH 09/71] Update hathi_rsync command for bulk rsync, report on updated htids --- .../management/commands/hathi_rsync.py | 76 +++++++++++++++---- 1 file changed, 61 insertions(+), 15 deletions(-) diff --git a/ppa/archive/management/commands/hathi_rsync.py b/ppa/archive/management/commands/hathi_rsync.py index cc160564..94244405 100644 --- a/ppa/archive/management/commands/hathi_rsync.py +++ b/ppa/archive/management/commands/hathi_rsync.py @@ -1,4 +1,8 @@ -from django.core.management.base import BaseCommand, CommandError +import os.path +from datetime import datetime + +from django.core.management.base import BaseCommand +from pairtree import path2id from ppa.archive.import_util import HathiImporter from ppa.archive.models import DigitizedWork @@ -26,22 +30,64 @@ def handle(self, *args, **kwargs): # use ids specified via command line when present htids = kwargs.get("htids", []) - # if hathi ids not specified via command line, - # get all non-suppressed hathi records - if not htids: - htids = DigitizedWork.objects.filter( - status=DigitizedWork.PUBLIC, source=DigitizedWork.HATHI - ).values_list("source_id", flat=True) + # by default, sync data for all non-suppressed hathi source ids + digworks = DigitizedWork.objects.filter( + status=DigitizedWork.PUBLIC, source=DigitizedWork.HATHI + ) - # NOTE: if htid is specified, 
should we verify that it's - # in the db and not suppressed? (should import first if not) + # if htids are specified via parameter, use them to filter + # the queryset, to ensure we only sync records that are + # in the database and not suppressed + if htids: + digworks = digworks.filter(source_id__in=htids) + # NOTE: report here on any skipped ids? - self.stdout.write( - self.style.SUCCESS("Synchronizing data for %d records" % len(htids)) - ) - # even if verbosity is zero we want an output file + # generate a list of unique source ids from the queryset + hathi_ids = digworks.values_list("source_id", flat=True).distinct() + self.stdout.write("Synchronizing data for %d records" % len(hathi_ids)) + # we always want itemized rsync output, so we can report + # on which volumes were updated htimporter = HathiImporter( - source_ids=htids, rsync_output=self.verbosity or True + source_ids=hathi_ids, rsync_output=True, output_dir="/tmp" ) logfile = htimporter.rsync_data() - self.stdout.write(self.style.SUCCESS("rsync output is in %s" % logfile)) + + # read the rsync itemized output to identify records where file + # sizes changed + updated_ids = set() + with open(logfile) as rsync_output: + for line in rsync_output: + # if a line indicates that a file was updated due + # to a change in size, use the path to determine the hathi id + if " >f.s" in line: + # rsync itemized output is white-space delimited; + # last element is the filename that was updated + filename = line.rsplit()[-1].strip() + # we only care about zip files and mets.xml files + if not filename.endswith(".zip") and not filename.endswith(".xml"): + continue + # reconstruct the hathi id from the filepath + ht_prefix, pairtree_dir = filename.split("/pairtree_root/") + # get the directory one level up from the updated file + pairtree_id = os.path.dirname(os.path.dirname(pairtree_dir)) + # use pairtree to determine the id based on the path + # (handles special characters like those used in ARKs) + htid = f"{ht_prefix}.{path2id(pairtree_id)}" + updated_ids.add(htid) + + # should this behavior only be when updating all? + # if specific htids are specified on the command line, maybe report on them only? 
+ if updated_ids: + outfilename = "ppa_rsync_updated_htids_%s.txt" % datetime.now().strftime( + "%Y%m%d-%H%M%S" + ) + with open(outfilename, "w") as outfile: + outfile.write("\n".join(sorted(updated_ids))) + success_msg = ( + f"File sizes changed for {len(updated_ids)} hathi ids; " + + f"full list in {outfilename}" + ) + else: + success_msg = "rsync completed; no changes to report" + + self.stdout.write(self.style.SUCCESS(success_msg)) From 77f35d073a18c995c2e987d6c650b6464ff55312 Mon Sep 17 00:00:00 2001 From: rlskoeser Date: Thu, 7 Mar 2024 13:21:13 -0500 Subject: [PATCH 10/71] Update tests for change to where mets xml is loaded --- ppa/archive/tests/test_models.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/ppa/archive/tests/test_models.py b/ppa/archive/tests/test_models.py index b97a9ef5..75668791 100644 --- a/ppa/archive/tests/test_models.py +++ b/ppa/archive/tests/test_models.py @@ -1011,7 +1011,7 @@ def test_hathi_page_index_data(self, mockzipfile): mets = load_xmlobject_from_file(TestDigitizedWork.metsfile, hathi.MinimalMETS) with patch.object(DigitizedWork, "hathi") as mock_hathiobj: mock_hathiobj.zipfile_path.return_value = "/path/to/79279237.zip" - mock_hathiobj.metsfile_path.return_value = TestDigitizedWork.metsfile + mock_hathiobj.mets_xml.return_value = mets mock_hathiobj.content_dir = "data" page_data = Page.page_index_data(work) From 7be1c08af48acf9959e4dddf966512d53e928f7b Mon Sep 17 00:00:00 2001 From: Rebecca Sutton Koeser Date: Thu, 7 Mar 2024 14:50:29 -0500 Subject: [PATCH 11/71] Update ppa/archive/import_util.py Co-authored-by: Laure Thompson <602628+laurejt@users.noreply.github.com> --- ppa/archive/import_util.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/ppa/archive/import_util.py b/ppa/archive/import_util.py index 734f8add..4bcd5ad0 100644 --- a/ppa/archive/import_util.py +++ b/ppa/archive/import_util.py @@ -258,7 +258,7 @@ def rsync_data(self): # limit the number of ids included in the log message log_detail = "" rsync_count = len(self.source_ids) - if rsync_count <= 12: + if rsync_count <= 10: log_detail = ", ".join(self.source_ids) else: log_detail = "%d volumes" % rsync_count From 4108474ab08af15199628e6a40eac444215a7433 Mon Sep 17 00:00:00 2001 From: Rebecca Sutton Koeser Date: Thu, 7 Mar 2024 14:54:05 -0500 Subject: [PATCH 12/71] Update ppa/archive/management/commands/hathi_rsync.py Co-authored-by: Laure Thompson <602628+laurejt@users.noreply.github.com> --- ppa/archive/management/commands/hathi_rsync.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/ppa/archive/management/commands/hathi_rsync.py b/ppa/archive/management/commands/hathi_rsync.py index 94244405..f337c6a0 100644 --- a/ppa/archive/management/commands/hathi_rsync.py +++ b/ppa/archive/management/commands/hathi_rsync.py @@ -67,7 +67,7 @@ def handle(self, *args, **kwargs): if not filename.endswith(".zip") and not filename.endswith(".xml"): continue # reconstruct the hathi id from the filepath - ht_prefix, pairtree_dir = filename.split("/pairtree_root/") + ht_prefix, pairtree_dir = filename.split("/pairtree_root/", 1) # get the directory one level up from the updated file pairtree_id = os.path.dirname(os.path.dirname(pairtree_dir)) # use pairtree to determine the id based on the path From 4d716788d5637c5246ff5ac3ea0c267fb68d5c39 Mon Sep 17 00:00:00 2001 From: Rebecca Sutton Koeser Date: Thu, 7 Mar 2024 15:45:48 -0500 Subject: [PATCH 13/71] Update ppa/archive/management/commands/hathi_rsync.py Co-authored-by: Laure Thompson 
<602628+laurejt@users.noreply.github.com> --- ppa/archive/management/commands/hathi_rsync.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/ppa/archive/management/commands/hathi_rsync.py b/ppa/archive/management/commands/hathi_rsync.py index f337c6a0..127c42ee 100644 --- a/ppa/archive/management/commands/hathi_rsync.py +++ b/ppa/archive/management/commands/hathi_rsync.py @@ -43,7 +43,7 @@ def handle(self, *args, **kwargs): # NOTE: report here on any skipped ids? # generate a list of unique source ids from the queryset - hathi_ids = digworks.values_list("source_id", flat=True).distinct() + working_htids = digworks.values_list("source_id", flat=True).distinct() self.stdout.write("Synchronizing data for %d records" % len(hathi_ids)) # we always want itemized rsync output, so we can report # on which volumes were updated From c2333785430ad3296648f5359060b815d8974849 Mon Sep 17 00:00:00 2001 From: rlskoeser Date: Thu, 7 Mar 2024 16:21:34 -0500 Subject: [PATCH 14/71] Report skipped htid; use proper temp directory; report any changed files per feedback from @laurejt --- .../management/commands/hathi_rsync.py | 74 ++++++++++++++----- 1 file changed, 56 insertions(+), 18 deletions(-) diff --git a/ppa/archive/management/commands/hathi_rsync.py b/ppa/archive/management/commands/hathi_rsync.py index 127c42ee..4b9532ee 100644 --- a/ppa/archive/management/commands/hathi_rsync.py +++ b/ppa/archive/management/commands/hathi_rsync.py @@ -1,4 +1,6 @@ +import csv import os.path +import tempfile from datetime import datetime from django.core.management.base import BaseCommand @@ -40,29 +42,50 @@ def handle(self, *args, **kwargs): # in the database and not suppressed if htids: digworks = digworks.filter(source_id__in=htids) - # NOTE: report here on any skipped ids? 
# generate a list of unique source ids from the queryset working_htids = digworks.values_list("source_id", flat=True).distinct() - self.stdout.write("Synchronizing data for %d records" % len(hathi_ids)) + + # if htids were explicitly specified, report if any are skipped + if htids: + skipped_htids = set(htids) - set(working_htids) + if skipped_htids: + self.stdout.write( + self.style.NOTICE( + "Some ids not found in public HathiTrust volumes; skipping %s" + % " ".join(skipped_htids) + ) + ) + + # bail out if there's nothing to do + # (e.g., explicit htids only and none valid) + if not working_htids: + return + + self.stdout.write("Synchronizing data for %d records" % len(working_htids)) + + # create a tempdir for rsync logfile; will automatically be cleaned up + output_dir = tempfile.TemporaryDirectory(prefix="ppa-rsync_") # we always want itemized rsync output, so we can report - # on which volumes were updated + # on which htids have updated content htimporter = HathiImporter( - source_ids=hathi_ids, rsync_output=True, output_dir="/tmp" + source_ids=working_htids, rsync_output=True, output_dir=output_dir.name ) logfile = htimporter.rsync_data() - # read the rsync itemized output to identify records where file - # sizes changed - updated_ids = set() + # read the rsync itemized output to identify and report on changes + updated_files = [] with open(logfile) as rsync_output: for line in rsync_output: - # if a line indicates that a file was updated due - # to a change in size, use the path to determine the hathi id - if " >f.s" in line: - # rsync itemized output is white-space delimited; + # check for a line indicating that a file was updated + if " >f" in line: + # rsync itemized output is white-space delimited + parts = line.split() # last element is the filename that was updated - filename = line.rsplit()[-1].strip() + filename = parts[-1] + # itemized info flags preced the filename + flags = parts[-2] + # we only care about zip files and mets.xml files if not filename.endswith(".zip") and not filename.endswith(".xml"): continue @@ -73,19 +96,34 @@ def handle(self, *args, **kwargs): # use pairtree to determine the id based on the path # (handles special characters like those used in ARKs) htid = f"{ht_prefix}.{path2id(pairtree_id)}" - updated_ids.add(htid) + updated_files.append( + { + "htid": htid, + "filename": os.path.basename(filename), + # rsync itemized flags look like >f.st.... + # or >f+++++++ for new files + "size_changed": flags[3] == "s", + "modification_time": flags[4] == "t", + "rsync_flags": flags, + } + ) # should this behavior only be when updating all? # if specific htids are specified on the command line, maybe report on them only? 
- if updated_ids: - outfilename = "ppa_rsync_updated_htids_%s.txt" % datetime.now().strftime( + if updated_files: + outfilename = "ppa_rsync_changes_%s.csv" % datetime.now().strftime( "%Y%m%d-%H%M%S" ) + fields = updated_files[0].keys() + print(fields) with open(outfilename, "w") as outfile: - outfile.write("\n".join(sorted(updated_ids))) + csvwriter = csv.DictWriter(outfile, fieldnames=fields) + csvwriter.writeheader() + csvwriter.writerows(updated_files) + updated_htids = set([i["htid"] for i in updated_files]) success_msg = ( - f"File sizes changed for {len(updated_ids)} hathi ids; " - + f"full list in {outfilename}" + f"Updated {len(updated_files)} files for {len(updated_htids)} volumes; " + + f"full details in {outfilename}" ) else: success_msg = "rsync completed; no changes to report" From 1cab68d9852fe7318394ac6acfef43a8c398fffd Mon Sep 17 00:00:00 2001 From: rlskoeser Date: Thu, 7 Mar 2024 16:50:15 -0500 Subject: [PATCH 15/71] Implement & test validation for rsync output directory --- ppa/archive/import_util.py | 16 ++++++++++++++-- ppa/archive/tests/test_import_util.py | 21 ++++++++++++++++++++- 2 files changed, 34 insertions(+), 3 deletions(-) diff --git a/ppa/archive/import_util.py b/ppa/archive/import_util.py index 4bcd5ad0..f22376aa 100644 --- a/ppa/archive/import_util.py +++ b/ppa/archive/import_util.py @@ -165,7 +165,10 @@ class HathiImporter(DigitizedWorkImporter): synchronize (optional) :param bool rsync_output: determines whether rsync itemized report is enabled (default: False) - :param str output_dir: base directory for rsync output file (optional) + :param str output_dir: base directory for rsync output file + (required if `rsync_output` is True) + :raises ValueError: if output_dir is unset when rsync_output is True or + if output_dir is not an existing directory """ #: rsync error @@ -185,10 +188,19 @@ class HathiImporter(DigitizedWorkImporter): } ) - def __init__(self, source_ids=None, rsync_output=False, output_dir=""): + def __init__(self, source_ids=None, rsync_output=False, output_dir=None): super().__init__(source_ids) # track whether (and how much) rsync output is desired self.rsync_output = rsync_output + # if rsync output is enabled, output directory is required + if self.rsync_output: + if output_dir is None: + raise ValueError("output_dir is required when rsync_output is enabled") + elif not os.path.isdir(output_dir): + raise ValueError( + "rsync output dir %s is not an existing directory", output_dir + ) + self.output_dir = output_dir def filter_invalid_ids(self): diff --git a/ppa/archive/tests/test_import_util.py b/ppa/archive/tests/test_import_util.py index 9b012328..86d37fcc 100644 --- a/ppa/archive/tests/test_import_util.py +++ b/ppa/archive/tests/test_import_util.py @@ -25,7 +25,6 @@ class TestHathiImporter(TestCase): fixtures = ["sample_digitized_works"] def test_filter_existing_ids(self): - digwork_ids = DigitizedWork.objects.values_list("source_id", flat=True) # all existing - all should be flagged as existing @@ -269,6 +268,26 @@ def test_rsync_data(self, mocksubprocess): assert "ppa_hathi_pathlist" in cmd_args[-3] +def test_hathiimporter_init(tmp_path_factory): + # no rsync output, no output dir + htimporter = HathiImporter(["hvd.1234", "nyp.334455"]) + assert htimporter.rsync_output is False + assert htimporter.output_dir is None + + # rsync output requested with no output dir + with pytest.raises(ValueError, match="output_dir is required"): + HathiImporter(rsync_output=True) + + # rsync output requested with non-existent output dir + with 
pytest.raises(ValueError, match="not an existing directory"): + # rsync output requested with invalid output dir + HathiImporter(rsync_output=True, output_dir="/tmp/foo/bar") + + # with valid output dir + tmpdir = tmp_path_factory.mktemp("output") + assert HathiImporter(rsync_output=True, output_dir=str(tmpdir)) + + class TestGaleImporter(TestCase): @patch("ppa.archive.import_util.GaleAPI") def test_add_items_noop(self, mock_gale_api): From 4cdfeaf7943249a4f82fb0de284fdc1d89519d6a Mon Sep 17 00:00:00 2001 From: Rebecca Sutton Koeser Date: Thu, 7 Mar 2024 17:43:59 -0500 Subject: [PATCH 16/71] Feature/excerpt revisions (#616) * When suppressing excerpt, only delete data if the last from that volume fixes #591 * Enable admin save as new to copy record #591 * Add help text to clarify when page count is calculated * Add help text for page count field --- ppa/archive/admin.py | 3 ++ ...0020_digitizedwork_page_count_help_text.py | 21 +++++++++++ ppa/archive/models.py | 36 ++++++++++++++----- ppa/archive/tests/test_models.py | 26 ++++++++++++++ 4 files changed, 77 insertions(+), 9 deletions(-) create mode 100644 ppa/archive/migrations/0020_digitizedwork_page_count_help_text.py diff --git a/ppa/archive/admin.py b/ppa/archive/admin.py index b9ac2fbd..6849fbe9 100644 --- a/ppa/archive/admin.py +++ b/ppa/archive/admin.py @@ -76,6 +76,9 @@ def get_queryset(self): class DigitizedWorkAdmin(ExportActionMixin, ExportMixin, admin.ModelAdmin): resource_class = DigitizedWorkResource # resource for export + # enable "save as new" button to copy and create a new record + save_as = True + list_display = ( "display_title", "subtitle", diff --git a/ppa/archive/migrations/0020_digitizedwork_page_count_help_text.py b/ppa/archive/migrations/0020_digitizedwork_page_count_help_text.py new file mode 100644 index 00000000..bc750bd6 --- /dev/null +++ b/ppa/archive/migrations/0020_digitizedwork_page_count_help_text.py @@ -0,0 +1,21 @@ +# Generated by Django 5.0.2 on 2024-03-07 22:15 + +from django.db import migrations, models + + +class Migration(migrations.Migration): + dependencies = [ + ("archive", "0019_alter_cluster_options"), + ] + + operations = [ + migrations.AlterField( + model_name="digitizedwork", + name="page_count", + field=models.PositiveIntegerField( + blank=True, + help_text="Automatically calculated on import; recalculated on save when digital page range changes", + null=True, + ), + ), + ] diff --git a/ppa/archive/models.py b/ppa/archive/models.py index f7959722..c311ff59 100644 --- a/ppa/archive/models.py +++ b/ppa/archive/models.py @@ -380,8 +380,13 @@ class DigitizedWork(ModelIndexable, TrackChangesModel): publisher = models.TextField(blank=True) # Needs to be integer to allow aggregating max/min, filtering by date pub_date = models.PositiveIntegerField("Publication Date", null=True, blank=True) - #: number of pages in the work - page_count = models.PositiveIntegerField(null=True, blank=True) + #: number of pages in the work (or page range, for an excerpt) + page_count = models.PositiveIntegerField( + null=True, + blank=True, + help_text="Automatically calculated on import; " + + "recalculated on save when digital page range changes", + ) #: public notes field for this work public_notes = models.TextField( blank=True, @@ -559,12 +564,25 @@ def hathi(self): def save(self, *args, **kwargs): # if status has changed so that object is now suppressed, # do some cleanup - if self.has_changed("status") and self.status == self.SUPPRESSED: - # remove indexed page content from Solr - 
self.solr.update.delete_by_query('source_id:"%s"' % self.source_id) + if self.has_changed("status") and self.status == DigitizedWork.SUPPRESSED: + # remove indexed page content from Solr using index id + # (i.e., if excerpt, should only remove content for this excerpt, + # not all excerpts in this volume) + self.solr.update.delete_by_query('group_id_s:"%s"' % self.index_id()) # if this is a HathiTrust item, remove pairtree data if self.source == DigitizedWork.HATHI: - self.hathi.delete_pairtree_data() + # if this is a full work (not excerpted), remove + # if this is an excerpt, should only remove if there are no other + # public excerpts from this volume + if ( + self.item_type == DigitizedWork.FULL + or not DigitizedWork.objects.filter( + status=DigitizedWork.PUBLIC, source_id=self.source_id + ) + .exclude(pk=self.pk) + .exists() + ): + self.hathi.delete_pairtree_data() # Solr identifier is based on combination of source id and first page; # if either changes, remove the old record from Solr before saving @@ -582,10 +600,10 @@ def save(self, *args, **kwargs): self.pages_digital = new_pages_digital if self.has_changed("pages_digital"): - # if there is a page range set now, update page count and index + # update the page count if possible (i.e., not a Gale record) + self.page_count = self.count_pages() + # if there is a page range set, update page count and index if self.pages_digital: - # recalculate page total based on current range - self.page_count = self.count_pages() # update index to remove all pages that are no longer in range self.solr.update.delete_by_query( 'source_id:"%s" AND item_type:page NOT order:(%s)' diff --git a/ppa/archive/tests/test_models.py b/ppa/archive/tests/test_models.py index 2aaec020..926532c4 100644 --- a/ppa/archive/tests/test_models.py +++ b/ppa/archive/tests/test_models.py @@ -689,6 +689,32 @@ def test_save_suppress(self): work.save() mock_hathiobj.delete_pairtree_data.assert_not_called() + def test_save_suppress_excerpt(self): + work = DigitizedWork(source_id="chi.79279237", item_type=DigitizedWork.EXCERPT) + with patch.object(work, "hathi") as mock_hathiobj: + # no change in status - nothing should happen + work.save() + mock_hathiobj.delete_pairtree_data.assert_not_called() + + # change status to suppressed, no other excerpts in this volume + # - data should be deleted + work.status = work.SUPPRESSED + work.save() + assert mock_hathiobj.delete_pairtree_data.call_count == 1 + + # second public excerpt from the same valoume + DigitizedWork.objects.create( + source_id="chi.79279237", + item_type=DigitizedWork.EXCERPT, + pages_orig="3-5", + pages_digital="5-7", + ) + # reset mock so we can check it is not called + mock_hathiobj.delete_pairtree_data.reset_mock() + work.status = work.SUPPRESSED + work.save() + assert mock_hathiobj.delete_pairtree_data.call_count == 0 + def test_save_sourceid(self): # if source_id changes, old id should be removed from solr index work = DigitizedWork.objects.create( From 99c7059fa9f247c383d287a09d905d5e7ba339ec Mon Sep 17 00:00:00 2001 From: Rebecca Sutton Koeser Date: Fri, 8 Mar 2024 11:49:37 -0500 Subject: [PATCH 17/71] Update ppa/archive/management/commands/hathi_rsync.py Co-authored-by: Laure Thompson <602628+laurejt@users.noreply.github.com> --- ppa/archive/management/commands/hathi_rsync.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/ppa/archive/management/commands/hathi_rsync.py b/ppa/archive/management/commands/hathi_rsync.py index 4b9532ee..5183ae69 100644 --- 
a/ppa/archive/management/commands/hathi_rsync.py +++ b/ppa/archive/management/commands/hathi_rsync.py @@ -83,7 +83,7 @@ def handle(self, *args, **kwargs): parts = line.split() # last element is the filename that was updated filename = parts[-1] - # itemized info flags preced the filename + # itemized info flags precede the filename flags = parts[-2] # we only care about zip files and mets.xml files From d4e714516c232dfc7d9c2127726b0108933fdb04 Mon Sep 17 00:00:00 2001 From: Rebecca Sutton Koeser Date: Fri, 8 Mar 2024 11:50:01 -0500 Subject: [PATCH 18/71] Update ppa/archive/import_util.py Co-authored-by: Laure Thompson <602628+laurejt@users.noreply.github.com> --- ppa/archive/import_util.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/ppa/archive/import_util.py b/ppa/archive/import_util.py index f22376aa..fa046a90 100644 --- a/ppa/archive/import_util.py +++ b/ppa/archive/import_util.py @@ -198,7 +198,7 @@ def __init__(self, source_ids=None, rsync_output=False, output_dir=None): raise ValueError("output_dir is required when rsync_output is enabled") elif not os.path.isdir(output_dir): raise ValueError( - "rsync output dir %s is not an existing directory", output_dir + f"rsync output dir {output_dir} is not an existing directory" ) self.output_dir = output_dir From b758fa22ddeaa9ab8956b6540d3543e05fb20070 Mon Sep 17 00:00:00 2001 From: Rebecca Sutton Koeser Date: Fri, 8 Mar 2024 11:50:20 -0500 Subject: [PATCH 19/71] Update ppa/archive/tests/test_import_util.py Co-authored-by: Laure Thompson <602628+laurejt@users.noreply.github.com> --- ppa/archive/tests/test_import_util.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/ppa/archive/tests/test_import_util.py b/ppa/archive/tests/test_import_util.py index 86d37fcc..6ea089ba 100644 --- a/ppa/archive/tests/test_import_util.py +++ b/ppa/archive/tests/test_import_util.py @@ -281,7 +281,7 @@ def test_hathiimporter_init(tmp_path_factory): # rsync output requested with non-existent output dir with pytest.raises(ValueError, match="not an existing directory"): # rsync output requested with invalid output dir - HathiImporter(rsync_output=True, output_dir="/tmp/foo/bar") + HathiImporter(rsync_output=True, output_dir="/foo/bar/baz") # with valid output dir tmpdir = tmp_path_factory.mktemp("output") From 380e04b61f7cb5af157502e533599e340b2f2101 Mon Sep 17 00:00:00 2001 From: rlskoeser Date: Fri, 8 Mar 2024 12:40:33 -0500 Subject: [PATCH 20/71] Improve readability and formatting based on suggestions from @laurejt Co-authored-by: Laure Thompson <602628+laurejt@users.noreply.github.com> --- ppa/archive/management/commands/hathi_rsync.py | 15 ++++++++++----- 1 file changed, 10 insertions(+), 5 deletions(-) diff --git a/ppa/archive/management/commands/hathi_rsync.py b/ppa/archive/management/commands/hathi_rsync.py index 5183ae69..1dad0f05 100644 --- a/ppa/archive/management/commands/hathi_rsync.py +++ b/ppa/archive/management/commands/hathi_rsync.py @@ -4,6 +4,7 @@ from datetime import datetime from django.core.management.base import BaseCommand +from django.template.defaultfilters import pluralize from pairtree import path2id from ppa.archive.import_util import HathiImporter @@ -52,17 +53,21 @@ def handle(self, *args, **kwargs): if skipped_htids: self.stdout.write( self.style.NOTICE( - "Some ids not found in public HathiTrust volumes; skipping %s" - % " ".join(skipped_htids) + f"{len(skipped_htids)} id{pluralize(skipped_htids)} " + + "not found in public HathiTrust volumes; " + + f"skipping {' '.join(skipped_htids)}" ) ) # 
bail out if there's nothing to do # (e.g., explicit htids only and none valid) if not working_htids: + self.stdout.write("No records to synchronize; stopping") return - self.stdout.write("Synchronizing data for %d records" % len(working_htids)) + self.stdout.write( + f"Synchronizing data for {len(working_htids)} record{pluralize(working_htids)}" + ) # create a tempdir for rsync logfile; will automatically be cleaned up output_dir = tempfile.TemporaryDirectory(prefix="ppa-rsync_") @@ -111,8 +116,8 @@ def handle(self, *args, **kwargs): # should this behavior only be when updating all? # if specific htids are specified on the command line, maybe report on them only? if updated_files: - outfilename = "ppa_rsync_changes_%s.csv" % datetime.now().strftime( - "%Y%m%d-%H%M%S" + outfilename = "ppa_rsync_changes_{time}.csv".format( + time=datetime.now().strftime("%Y%m%d-%H%M%S") ) fields = updated_files[0].keys() print(fields) From 11a2815a7ba2c8a3d4153e486811b12bc115563d Mon Sep 17 00:00:00 2001 From: rlskoeser Date: Fri, 8 Mar 2024 12:45:15 -0500 Subject: [PATCH 21/71] Remove debug print statement; document how csv header row is populated --- ppa/archive/management/commands/hathi_rsync.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/ppa/archive/management/commands/hathi_rsync.py b/ppa/archive/management/commands/hathi_rsync.py index 1dad0f05..bbf66cf2 100644 --- a/ppa/archive/management/commands/hathi_rsync.py +++ b/ppa/archive/management/commands/hathi_rsync.py @@ -119,8 +119,8 @@ def handle(self, *args, **kwargs): outfilename = "ppa_rsync_changes_{time}.csv".format( time=datetime.now().strftime("%Y%m%d-%H%M%S") ) + # use keys from the first row to populate csv header row fields = updated_files[0].keys() - print(fields) with open(outfilename, "w") as outfile: csvwriter = csv.DictWriter(outfile, fieldnames=fields) csvwriter.writeheader() From 50d67b08fbc0883716319fb713525e242afc393f Mon Sep 17 00:00:00 2001 From: rlskoeser Date: Fri, 8 Mar 2024 13:58:07 -0500 Subject: [PATCH 22/71] Remove extraneous tabs in page count script summary output --- ppa/archive/management/commands/update_hathi_pagecounts.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/ppa/archive/management/commands/update_hathi_pagecounts.py b/ppa/archive/management/commands/update_hathi_pagecounts.py index c1b172a0..0bf945cf 100644 --- a/ppa/archive/management/commands/update_hathi_pagecounts.py +++ b/ppa/archive/management/commands/update_hathi_pagecounts.py @@ -82,6 +82,6 @@ def handle(self, *args, **kwargs): if self.verbosity >= self.v_normal: self.stdout.write( f"Volumes with updated page count: {stats['updated']:,}" - + f"\n\tPage count unchanged: {stats['unchanged']:,}" - + f"\n\tMissing pairtree data: {stats['missing_data']:,}" + + f"\nPage count unchanged: {stats['unchanged']:,}" + + f"\nMissing pairtree data: {stats['missing_data']:,}" ) From 3538f5db81cbc2edaac03c33572c6d171f71cdb3 Mon Sep 17 00:00:00 2001 From: Rebecca Sutton Koeser Date: Tue, 19 Mar 2024 12:45:20 -0400 Subject: [PATCH 23/71] Implement and test 303 redirect for multiple cluster params (#621) * Implement and test 303 redirect for multiple cluster params fixes #619 * Fix incorrect mock patch target --- ppa/archive/tests/test_views.py | 14 +++++++++++++- ppa/archive/views.py | 28 ++++++++++++++++------------ 2 files changed, 29 insertions(+), 13 deletions(-) diff --git a/ppa/archive/tests/test_views.py b/ppa/archive/tests/test_views.py index 2d5ec96a..8cfef2b0 100644 --- a/ppa/archive/tests/test_views.py +++ 
b/ppa/archive/tests/test_views.py @@ -329,7 +329,7 @@ def test_nonhathi_display(self): self.assertNotContains(response, "View external record") # search term should be ignored for items without fulltext - with patch("ppa.archive.views.SolrQuerySet") as mock_solrq: + with patch("ppa.archive.views.PageSearchQuerySet") as mock_solrq: response = self.client.get(thesis.get_absolute_url(), {"query": "lady"}) # not called at all assert mock_solrq.call_count == 0 @@ -1193,6 +1193,18 @@ def test_get_queryset(self): mock_qs.order_by.assert_called_with("sort_title") # default sort mock_qs.work_filter.assert_called_with(author="Robert") + def test_too_many_clusters(self): + archive_list_url = reverse("archive:list") + response = self.client.get(archive_list_url, {"cluster": ["one", "two"]}) + # if there is more than one cluster param, + # should redirect to archive search with a 303 See Other status code + assert response.status_code == 303 + assert response["Location"] == archive_list_url + # single cluster should be fine + assert self.client.get(archive_list_url, {"cluster": "one"}).status_code == 200 + # no cluster should also be fine + assert self.client.get(archive_list_url).status_code == 200 + class TestImportView(TestCase): superuser = {"username": "super", "password": str(uuid.uuid4())} diff --git a/ppa/archive/views.py b/ppa/archive/views.py index a9f37a4c..4bbf965a 100644 --- a/ppa/archive/views.py +++ b/ppa/archive/views.py @@ -1,9 +1,4 @@ -import csv import logging -from collections import OrderedDict, defaultdict -from http import HTTPStatus -from json.decoder import JSONDecodeError -from pprint import pprint import requests from django.contrib import messages @@ -12,18 +7,14 @@ from django.core.paginator import Paginator from django.http import ( Http404, - HttpResponse, HttpResponsePermanentRedirect, - HttpResponseRedirect, ) from django.shortcuts import get_object_or_404, redirect, render from django.urls import reverse from django.utils.http import urlencode -from django.utils.timezone import now from django.views.generic import DetailView, ListView from django.views.generic.base import RedirectView, TemplateView from django.views.generic.edit import FormView -from parasolr.django import SolrQuerySet from parasolr.django.views import SolrLastModifiedMixin from ppa.archive.forms import ( @@ -59,6 +50,18 @@ class DigitizedWorkListView(AjaxTemplateMixin, SolrLastModifiedMixin, ListView): # keyword query; assume no search terms unless set query = None + def get(self, *args, **kwargs): + # a bug used to allow aggregation of multiple cluster params, + # which is not supported; if detected, redirect to archive search + cluster_param = self.request.GET.getlist("cluster") + if cluster_param and len(cluster_param) > 1: + response = HttpResponsePermanentRedirect(reverse("archive:list")) + response.status_code = 303 # See other + return response + + # otherwise, process response normally + return super(DigitizedWorkListView, self).get(*args, **kwargs) + def get_queryset(self, **kwargs): form_opts = self.request.GET.copy() # if relevance sort is requested but there is no keyword search @@ -243,8 +246,8 @@ def get_context_data(self, **kwargs): # or an error status set on the response context["error"] = "Something went wrong." 
- page_groups_keys = set(page_groups.keys()) - page_highlights_keys = set(page_highlights.keys()) + set(page_groups.keys()) + set(page_highlights.keys()) context.update( { "search_form": self.form, @@ -354,7 +357,8 @@ def get_context_data(self, **kwargs): # only return fields needed for page result display, # configure highlighting on page text content solr_pageq = ( - PageSearchQuerySet() # NOTE: Addition of an aliased queryset changes the _s keys below + # NOTE: Addition of an aliased queryset changes the _s keys below + PageSearchQuerySet() .search(content="(%s)" % query) .filter(group_id='"%s"' % digwork.index_id(), item_type="page") .highlight("content", snippets=3, method="unified") From 468496e41af3c02a7575d5250531b238c4fdc7cb Mon Sep 17 00:00:00 2001 From: rlskoeser Date: Tue, 19 Mar 2024 13:55:19 -0400 Subject: [PATCH 24/71] Fix 1-based indexing when checking excerpt page ranges Provide output to notify filename of report --- ppa/archive/management/commands/check_hathi_excerpts.py | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/ppa/archive/management/commands/check_hathi_excerpts.py b/ppa/archive/management/commands/check_hathi_excerpts.py index ede456d4..5838566b 100644 --- a/ppa/archive/management/commands/check_hathi_excerpts.py +++ b/ppa/archive/management/commands/check_hathi_excerpts.py @@ -40,7 +40,8 @@ def handle(self, *args, **kwargs): "notes", ] - with open("ppa-excerpt-pagecheck.csv", "w") as csvfile: + report_filename = "ppa-excerpt-pagecheck.csv" + with open(report_filename, "w") as csvfile: csvwriter = csv.DictWriter(csvfile, fieldnames=output_fields) csvwriter.writeheader() @@ -78,8 +79,9 @@ def handle(self, *args, **kwargs): # use digital page range to get the first page in the mets # that would be included with current digital range (1-based index) + try: - excerpt_first_page = page_info[digwork.first_page_digital() + 1] + excerpt_first_page = page_info[digwork.first_page_digital() - 1] except IndexError: if digwork.first_page_digital() >= len(page_info): excerpt_first_page[-1] @@ -126,3 +128,5 @@ def handle(self, *args, **kwargs): # either way, write out the info csvwriter.writerow(info) + + self.stdout.write(f"Excerpt page check report available in {report_filename}") From 0fdb842f1f1c6724d656b9f9a1dc7009ceebf70e Mon Sep 17 00:00:00 2001 From: Laure Thompson <602628+laurejt@users.noreply.github.com> Date: Thu, 21 Mar 2024 16:22:53 -0400 Subject: [PATCH 25/71] Feature/collect version labels (#624) Adds script for collecting version labels of HathiTrust records --------- Co-authored-by: Rebecca Sutton Koeser --- scripts/README.md | 24 + scripts/get_version_labels.py | 94 ++++ scripts/ht-excerpts-2023-09-20.txt | 517 +++++++++++++++++ .../version-labels-2024-03-20.tsv | 518 ++++++++++++++++++ .../version-labels-2024-03-21.tsv | 518 ++++++++++++++++++ 5 files changed, 1671 insertions(+) create mode 100644 scripts/README.md create mode 100644 scripts/get_version_labels.py create mode 100644 scripts/ht-excerpts-2023-09-20.txt create mode 100644 scripts/version-labels/version-labels-2024-03-20.tsv create mode 100644 scripts/version-labels/version-labels-2024-03-21.tsv diff --git a/scripts/README.md b/scripts/README.md new file mode 100644 index 00000000..8a999eb6 --- /dev/null +++ b/scripts/README.md @@ -0,0 +1,24 @@ +# PPA Scripts + +This directory contains stand-alone scripts associated with the Princeton +Prosody Archive that are not part of the web application proper. + +At this time, these scripts do not have any additional requirements. 
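+
+As a quick usage sketch for the script described in the next section (assuming the
+`scripts/` directory is on the Python path and that `requests` is importable), the
+single-record helper can be called directly from a Python shell; the htid below is
+copied from the default input list:
+
+```python
+# minimal sketch: look up the version timestamp for a single HathiTrust volume
+from get_version_labels import get_version_label
+
+# htid taken from ht-excerpts-2023-09-20.txt; prints a timestamp string,
+# or None if the request fails or no versionLabel is found in the page
+print(get_version_label("hvd.32044090278565"))
+```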
+ +## HathiTrust "Version" Timestamps +This script extracts and saves the version timestamp information from the +public HathiTrust interface for a set of HathiTrust volumes. By default, +the set of volumes corresponds to PPA excerpt records (based on an exported +report). + +- `get_version_labels.py`: The script to run. This script extracts HathiTrust +volume identifiers (htids) from a text file containing one htid per line. By +default, the input file is `ht-excerpts-2023-09-20.txt`, but an alternative file +can be specified as input. It writes its output as a tsv +with columns corresponding to htids and their extracted version timestamps. + - input: Input `.txt` file. If none specified, + `ht-excerpts-2023-09-20.txt`. + - output: `version-labels/version-labels-[current date].tsv`. If this file + already exists, then the output file corresponds to a new (non-existing) + file `version-labels/version-labels-[current date]-[i].tsv` where `i` + is the smallest possible, postive integer. diff --git a/scripts/get_version_labels.py b/scripts/get_version_labels.py new file mode 100644 index 00000000..4c10a878 --- /dev/null +++ b/scripts/get_version_labels.py @@ -0,0 +1,94 @@ +""" +Extract version labels from HathiTrust volume pages. +""" +import sys +import os.path +import re +import time +import datetime +import requests + + +def get_version_label(htid): + """ + Extract the HathiTrust "version label" from a record's catalog page. + Returns the corresponding timestamp, returns None if the HTTP request fails. + """ + # a script block in HT record page includes a number of HT.params including version timestamp + re_pattern = r'HT.params.versionLabel = "([^"]+)";' + catalog_url = f"https://hdl.handle.net/2027/{htid}" + try: + r = requests.get(catalog_url, timeout=5) + except requests.exceptions.Timeout: + # Handle timeouts gracefully (catch and continue) + print(f"Warning: request timed out for '{htid}'") + return + if r.status_code == requests.codes['ok']: + # Extract version_label from response text + version_label = re.findall(re_pattern, r.text) + if version_label: + return version_label[0] + else: + print(f"Warning: {htid} missing versionLabel!") + else: + print(f"Warning: bad/unexpected response for '{htid}'") + + +def get_version_labels(htids, wait_time=1): + """ + Extracts the HathiTrust "version label" for each record within htids. + Returns a list of the extracted htid-timestamp pairs. 
+ """ + version_pairs = [] + n_skipped = 0 + n_htids = len(htids) + for i, htid in enumerate(htids): + if i: + # Wait wait_time seconds between requests + time.sleep(wait_time) + # show progress + if i % 10 == 0: + print(f"Progress: {i}/{n_htids}") + version_label = get_version_label(htid) + if version_label: + version_pairs.append((htid, version_label)) + else: + n_skipped += 1 + if n_skipped: + print(f"Warning: Failed to gather versions for {n_skipped} volumes") + return version_pairs + + +if __name__ == "__main__": + if len(sys.argv) not in [1,2]: + print("Usage: ([htids list])") + sys.exit(1) + + # Check if an input file has been provided + in_tsv = "ht-excerpts-2023-09-20.txt" # Default value + if len(sys.argv) == 2: + in_tsv = sys.argv[1] + + # Determine output file + out_pfx = os.path.join("version-labels", + f"version-labels-{datetime.date.today()}") + out_tsv = f"{out_pfx}.tsv" + i = 0 + while os.path.isfile(out_tsv): + # File exists, so update increment and add index + i += 1 + out_tsv = f"{out_pfx}-{i}.tsv" + + # Get htids + htids = [] + with open(in_tsv) as reader: + for line in reader: + htid = line.strip() + htids.append(htid) + version_pairs = get_version_labels(htids) + + # Write version labels to file + with open(out_tsv, mode='w') as writer: + writer.write(f"htid\tversion_label\n") + for htid, version_label in version_pairs: + writer.write(f"{htid}\t{version_label}\n") diff --git a/scripts/ht-excerpts-2023-09-20.txt b/scripts/ht-excerpts-2023-09-20.txt new file mode 100644 index 00000000..1a5f096b --- /dev/null +++ b/scripts/ht-excerpts-2023-09-20.txt @@ -0,0 +1,517 @@ +hvd.32044090278565 +nyp.33433081683744 +uc1.b3924132 +mdp.39015026482151 +uiug.30112106245936 +hvd.32044009576562 +nyp.33433067294433 +coo.31924065856167 +uc1.ax0002627784 +wu.89001946482 +uc1.b3311895 +hvd.32044048963128 +njp.32101076199213 +coo.31924051399685 +njp.32101076472800 +njp.32101076472859 +nyp.33433074380688 +hvd.32044019842491 +uc1.32106001559381 +mdp.39015024071642 +hvd.hx28d7 +uva.x002111617 +uc1.$b161790 +uc1.$b683534 +nyp.33433076066723 +miun.ajd7522.0001.001 +hvd.hxv9b7 +coo.31924065856167 +uc1.$b275098 +nyp.33433081676979 +coo.31924066177589 +hvd.32044011432754 +njp.32101076384609 +pst.000020068974 +mdp.39015087700681 +njp.32101076403078 +njp.32101075673655 +udel.31741113248746 +hvd.hn34f5 +inu.30000066028642 +uc1.b3794203 +uc1.b3794204 +inu.30000099671491 +hvd.hnqbsu +mdp.39015020696541 +njp.32101075672541 +mdp.39015060441675 +hvd.32044031554363 +hvd.32044048963185 +njp.32101076403300 +coo.31924057525382 +coo.31924065580551 +mdp.39015008833884 +njp.32101010945275 +njp.32101076201159 +inu.30000099671632 +yale.39002004065844 +hvd.32044098627870 +inu.30000092253941 +mdp.39015060429308 +mdp.39015060429464 +uc1.b3385165 +uiug.30112042710548 +uc1.b3548551 +uc1.b3850894 +hvd.hnqbts +mdp.39015024071824 +hvd.32044050827351 +coo.31924065585840 +mdp.39015048893195 +mdp.39015053252139 +mdp.39015059489032 +uc1.b3385486 +inu.30000099671665 +mdp.39015060425942 +coo.31924066146733 +uc1.b2905408 +hvd.32044010335081 +coo.31924057525861 +njp.32101076201167 +hvd.32044014419220 +mdp.39015053262393 +mdp.39015060429746 +uc1.b2972967 +umn.31951000742933f +hvd.hwqu51 +mdp.39015060425751 +uiuo.ark:/13960/t4qk01n82 +njp.32101063578718 +mdp.39015009286215 +nyp.33433082488895 +hvd.32044098628217 +hvd.32044043851013 +njp.32101076199171 +mdp.39015060426742 +nyp.33433082488911 +njp.32101075672806 +mdp.39015033845549 +nyp.33433076055809 +njp.32101076425980 +coo.31924057522082 +mdp.39015004858224 +coo.31924065856167 
+uiug.30112042290434 +uc1.b3011277 +mdp.39015003346247 +mdp.39015049192902 +njp.32101076379989 +njp.32101076533932 +chi.78323978 +chi.55229744 +njp.32101073758805 +uc2.ark:/13960/t4bp05b0f +mdp.39015059488877 +chi.78323841 +mdp.39015005484020 +hvd.32044010332070 +njp.32101047468010 +uc1.b3627386 +hvd.32044092658095 +hvd.32044014692362 +nyp.33433000183008 +njp.32101080222720 +njp.32101074834787 +uc1.31158010000023 +nyp.33433067366678 +hvd.hwa2b7 +hvd.32044103001111 +umn.31951t00020309x +mdp.39015008880067 +chi.78023993 +hvd.32044092738798 +nyp.33433081658886 +mdp.39015043572422 +njp.32101007684655 +uc1.32106020079791 +hvd.32044092797216 +mdp.39015059846678 +mdp.39015036664038 +chi.78013704 +mdp.39015059395619 +uc1.b3627386 +hvd.32044048963011 +nyp.33433081646642 +nyp.33433076055809 +mdp.39015060429415 +hvd.hxkepr +uiug.30112001676896 +mdp.39015060429423 +chi.12153205 +mdp.39015026482151 +mdp.39015063933546 +inu.30000099860342 +njp.32101076425485 +umn.31951p00293997r +wu.89001946482 +uc1.b3311895 +hvd.32044092624287 +uc1.b3627386 +uc1.$b31654 +inu.30000099671764 +hvd.32044094024825 +njp.32101020794176 +hvd.32044012913166 +njp.32101076451556 +njp.32101076472966 +mdp.39015060429597 +coo.31924066328299 +coo.31924066328299 +njp.32101076180460 +mdp.39015012991363 +njp.32101075673754 +njp.32101075672665 +msu.31293018462196 +njp.32101077262895 +hvd.32044092797182 +mdp.39015043572588 +nyp.33433081659017 +uc1.b2900825 +mdp.39015060424127 +njp.32101075672905 +hvd.32044010396893 +mdp.39015010329087 +njp.32101076890142 +mdp.39015010328121 +uc1.b2972398 +hvd.ah3rmv +mdp.39015048909868 +njp.32101074443415 +mdp.39015078153817 +hvd.32044040731473 +pst.000068744151 +njp.32101065266668 +njp.32101065266668 +hvd.32044086720679 +coo.31924057531109 +mdp.39015060429332 +chi.19606141 +hvd.32044010495000 +hvd.32044024285587 +njp.32101076201175 +uva.x030453236 +mdp.39015043572661 +uc1.b2905409 +mdp.39015060429555 +mdp.39015060429498 +mdp.39015008601109 +nyp.33433074853965 +njp.32101068158847 +hvd.hnlia3 +loc.ark:/13960/t3gx4tr12 +hvd.tz1l4c +mdp.39015060430108 +hvd.tz1l4w +njp.32101010945226 +mdp.39015031048211 +njp.32101077276523 +njp.32101076782703 +hvd.32044050831999 +mdp.39015012330885 +uc1.b3546679 +njp.32101075673481 +mdp.39015020441104 +coo.31924106553286 +njp.32101076472792 +mdp.39015043572372 +hvd.32044050827351 +mdp.39015063944592 +njp.32101063578650 +njp.32101076433125 +mdp.39015048893252 +coo.31924065856167 +njp.32101075716934 +coo.31924057525606 +coo.31924062186204 +uc1.b3385173 +mdp.39015059402357 +njp.32101075672624 +hvd.ah3kfk +njp.32101076795333 +njp.32101023869397 +uc1.$b312189 +njp.32101007684614 +hvd.32044092624352 +uva.x000240890 +nyp.33433067294433 +nyp.33433067294433 +coo.31924007186517 +njp.32101047468002 +hvd.32044009907841 +hvd.32044021008149 +njp.32101076201183 +njp.32101076457744 +mdp.39015060430082 +uc1.b3385513 +mdp.39015053252139 +inu.30000092253925 +pst.000008820648 +mdp.39015008570205 +coo.31924075116701 +nyp.33433089908747 +hvd.32044086791217 +njp.32101072577347 +njp.32101071985772 +mdp.39015056480562 +umn.31951002792969k +nyp.33433087345637 +mdp.39015022469087 +inu.30000084048762 +njp.32101023869397 +mdp.39015008095047 +njp.32101077260600 +njp.32101077260618 +nyp.33433074380704 +inu.30000104007657 +njp.32101076457702 +mdp.39015043800013 +nyp.33433004518415 +hvd.32044038399135 +njp.32101077288247 +njp.32101076199130 +njp.32101076530979 +uc1.c2608792 +njp.32101076530979 +nyp.33433074380662 +hvd.32044011856838 +uc1.b3919785 +wu.89001946482 +uc1.b3311895 +uc2.ark:/13960/t8w95458t 
+mdp.39015013094217 +mdp.39015008305289 +njp.32101076530979 +hvd.hwp8ba +njp.32101076457066 +coo1.ark:/13960/t4bp0n867 +njp.32101076530979 +uc1.b2972949 +njp.32101045352828 +njp.32101047467988 +hvd.32044086759800 +uc1.b3885866 +mdp.39015060429357 +njp.32101077288569 +aeu.ark:/13960/t1pg22p71 +nyp.33433082488887 +njp.32101064475831 +uc1.31175035197097 +njp.32101076880150 +coo.31924008821047 +nyp.33433074380720 +hvd.32044038400958 +mdp.39015030932753 +nyp.33433074380696 +njp.32101076889508 +njp.32101077288239 +hvd.32044098627268 +hvd.32044092634013 +hvd.32044014683114 +hvd.32044058190059 +umn.31951002804000l +uc1.$b661479 +hvd.32044092645134 +mdp.39015060429357 +mdp.39015074687149 +mdp.39015035805772 +pst.000068744458 +mdp.39015016898432 +uc1.b3924130 +uc1.b3924129 +njp.32101075672509 +uc1.b3293449 +loc.ark:/13960/t1fj37n7j +loc.ark:/13960/t9280zf6z +njp.32101076200664 +njp.32101076403425 +mdp.39015060429340 +mdp.39015003348201 +mdp.39015067091739 +mdp.39015059896285 +mdp.39015060430397 +inu.30000099860565 +njp.32101073025528 +hvd.32044090276395 +hvd.hwilnp +dul1.ark:/13960/t6d23816n +njp.32101075716934 +njp.32101075716934 +njp.32101076199239 +hvd.hnqbsv +nyp.33433081647616 +njp.32101037023239 +njp.32101037601646 +njp.32101063578791 +hvd.32044012418034 +ucm.5326809190 +mdp.39015054289338 +njp.32101074443332 +njp.32101074443399 +njp.32101074443415 +njp.32101021580343 +inu.30000099860326 +njp.32101076041084 +njp.32101075672749 +njp.32101075729960 +hvd.32044098627433 +uc1.b3885859 +mdp.39015060429530 +njp.32101077288213 +mdp.39076000323746 +inu.32000000683138 +mdp.39015027588287 +mdp.39015073107768 +coo.31924057525671 +hvd.32044092711480 +uc1.b3924126 +hvd.32044038400958 +hvd.32044092797232 +njp.32101076384435 +njp.32101076378189 +hvd.32044009957044 +njp.32101076378536 +hvd.hnqbsx +hvd.32044012418034 +nyp.33433081756896 +chi.55220547 +njp.32101076889979 +chi.79213384 +uiuo.ark:/13960/t4qk01n82 +hvd.32044048962955 +hvd.32044092677376 +uc1.32106001646766 +uc1.32106015528877 +nyp.33433081672853 +njp.32101076426079 +njp.32101076040946 +mdp.39015048893823 +uc1.b2974316 +uc2.ark:/13960/t8ff3wr3q +hvd.hxkepr +njp.32101076457728 +coo.31924069259624 +hvd.32044092640663 +njp.32101075673622 +mdp.39015049192910 +hvd.32044092796093 +mdp.39015030866506 +hvd.32044092797190 +chi.12755443 +hvd.32044048963029 +hvd.hnqbsr +mdp.39015048893831 +hvd.32044048963136 +hvd.32044103001129 +hvd.hxe6bx +uiug.30112001676896 +inu.30000099671525 +inu.30000099671624 +uiug.30112046384886 +hvd.hnqbtn +njp.32101065266304 +njp.32101076201183 +mdp.39015060429381 +njp.32101075672871 +hvd.32044010396893 +uc1.b3385477 +nyp.33433082219621 +njp.32101064467036 +hvd.32044031571342 +hvd.32044054989868 +inu.30000099671566 +mdp.39015060429399 +mdp.39015033845689 +hvd.32044098628274 +mdp.39015059397953 +nyp.33433075914071 +mdp.39015008095104 +njp.32101065270892 +hvd.32044092711431 +coo.31924066518758 +uc1.ax0003129954 +njp.32101076472909 +mdp.39015060429480 +mdp.39015030936325 +hvd.32044098641632 +njp.32101075672855 +chi.79279237 +hvd.32044011590692 +njp.32101076472917 +mdp.39015059402340 +umn.31951002792970z +njp.32101076472958 +mdp.39015060430116 +njp.32101076472933 +mdp.39015060430058 +mdp.39015060430371 +mdp.39015014523602 +hvd.32044098641343 +njp.32101007893256 +njp.32101063551608 +hvd.32044092754100 +mdp.39015041879613 +nyp.33433074829270 +njp.32101075672608 +hvd.hnqbst +hvd.hxe6bz +hvd.32044010396893 +nyp.33433082219902 +coo1.ark:/13960/t3st84m4q +mdp.39015036664079 +loc.ark:/13960/t0xp7hp6s +uc1.c2641998 +hvd.32044092796085 
+hvd.32044092797208 +mdp.39015008095153 +mdp.39015060429506 +uc1.$b272656 +hvd.hnle8h +nyp.33433000182992 +inu.30000099671723 +mdp.39015060429589 +hvd.hnqbtj +mdp.39015060429522 +hvd.hxe6c3 +nyp.33433074894126 +njp.32101075672632 +mdp.39015043572539 +hvd.hx2hrd +njp.32101047467996 +njp.32101047468002 +coo1.ark:/13960/t70v9287j +njp.32101077879508 +mdp.39015060429449 +mdp.39015087701341 +hvd.32044074313453 +njp.32101076457785 +mdp.39015060429548 +chi.78013677 +mdp.39015015383279 +nyp.33433076071004 +hvd.hnqbtr +inu.30000099671541 +coo1.ark:/13960/t9n30f16x +mdp.39015073107529 +njp.32101063578627 +coo.31924062189661 +njp.32101076471414 +coo.31924066146733 +mdp.39015060429431 +mdp.39015049192894 +uc1.b2972410 +mdp.39015062280055 +inu.30000099671558 +inu.30000104005750 +njp.32101077262788 +mdp.39015049192928 +njp.32101075672616 diff --git a/scripts/version-labels/version-labels-2024-03-20.tsv b/scripts/version-labels/version-labels-2024-03-20.tsv new file mode 100644 index 00000000..9cc72944 --- /dev/null +++ b/scripts/version-labels/version-labels-2024-03-20.tsv @@ -0,0 +1,518 @@ +htid version_label +hvd.32044090278565 2022-12-04 12:12 UTC +nyp.33433081683744 2022-11-10 15:09 UTC +uc1.b3924132 2022-06-16 11:33 UTC +mdp.39015026482151 2023-07-22 18:55 UTC +uiug.30112106245936 2024-02-28 15:57 UTC +hvd.32044009576562 2023-10-07 22:52 UTC +nyp.33433067294433 2023-05-14 08:02 UTC +coo.31924065856167 2024-02-18 07:26 UTC +uc1.ax0002627784 2023-10-27 18:39 UTC +wu.89001946482 2023-07-21 14:01 UTC +uc1.b3311895 2023-05-24 06:50 UTC +hvd.32044048963128 2023-10-08 22:39 UTC +njp.32101076199213 2022-05-17 22:07 UTC +coo.31924051399685 2023-08-01 22:52 UTC +njp.32101076472800 2023-04-21 11:37 UTC +njp.32101076472859 2022-05-18 00:07 UTC +nyp.33433074380688 2023-07-08 15:15 UTC +hvd.32044019842491 2023-03-22 10:56 UTC +uc1.32106001559381 2022-07-11 01:01 UTC +mdp.39015024071642 2023-06-27 17:47 UTC +hvd.hx28d7 2023-03-01 03:52 UTC +uva.x002111617 2022-11-11 10:19 UTC +uc1.$b161790 2022-10-17 00:45 UTC +uc1.$b683534 2023-05-04 12:10 UTC +nyp.33433076066723 2022-10-02 13:14 UTC +miun.ajd7522.0001.001 2012-07-26 23:46 UTC +hvd.hxv9b7 2023-01-01 14:29 UTC +coo.31924065856167 2024-02-18 07:26 UTC +uc1.$b275098 2022-09-12 12:04 UTC +nyp.33433081676979 2024-03-01 09:07 UTC +coo.31924066177589 2023-10-03 14:18 UTC +hvd.32044011432754 2023-01-14 16:05 UTC +njp.32101076384609 2023-04-21 10:48 UTC +pst.000020068974 2023-08-10 06:15 UTC +mdp.39015087700681 2021-02-17 20:42 UTC +njp.32101076403078 2022-05-17 23:09 UTC +njp.32101075673655 2023-06-02 07:36 UTC +udel.31741113248746 2016-07-08 20:19 UTC +hvd.hn34f5 2022-12-05 08:54 UTC +inu.30000066028642 2019-05-02 19:27 UTC +uc1.b3794203 2023-05-31 10:25 UTC +uc1.b3794204 2022-06-03 23:44 UTC +inu.30000099671491 2022-12-16 15:44 UTC +hvd.hnqbsu 2023-03-19 17:15 UTC +mdp.39015020696541 2023-07-05 07:31 UTC +njp.32101075672541 2023-04-21 04:15 UTC +mdp.39015060441675 2023-04-15 03:09 UTC +hvd.32044031554363 2022-12-17 13:26 UTC +hvd.32044048963185 2023-10-08 22:38 UTC +njp.32101076403300 2022-10-25 02:18 UTC +coo.31924057525382 2024-02-10 13:52 UTC +coo.31924065580551 2023-09-11 05:00 UTC +mdp.39015008833884 2023-07-25 00:32 UTC +njp.32101010945275 2023-04-20 07:48 UTC +njp.32101076201159 2024-03-19 13:08 UTC +inu.30000099671632 2023-05-30 09:46 UTC +yale.39002004065844 2011-06-01 03:25 UTC +hvd.32044098627870 2022-12-29 10:25 UTC +inu.30000092253941 2022-12-20 16:29 UTC +mdp.39015060429308 2023-04-15 03:56 UTC +mdp.39015060429464 2023-04-06 06:50 UTC +uc1.b3385165 
2023-12-18 07:01 UTC +uiug.30112042710548 2024-02-17 18:52 UTC +uc1.b3548551 2022-10-09 05:09 UTC +uc1.b3850894 2022-09-29 04:50 UTC +hvd.hnqbts 2023-03-19 17:16 UTC +mdp.39015024071824 2023-03-12 01:33 UTC +hvd.32044050827351 2023-03-22 13:37 UTC +coo.31924065585840 2023-10-03 14:21 UTC +mdp.39015048893195 2023-06-27 08:17 UTC +mdp.39015053252139 2022-12-16 11:29 UTC +mdp.39015059489032 2023-07-08 05:46 UTC +uc1.b3385486 2023-06-12 02:37 UTC +inu.30000099671665 2022-12-16 15:45 UTC +mdp.39015060425942 2022-10-30 02:47 UTC +coo.31924066146733 2023-08-11 17:47 UTC +uc1.b2905408 2022-10-17 15:18 UTC +hvd.32044010335081 2022-12-28 23:29 UTC +coo.31924057525861 2024-02-10 13:52 UTC +njp.32101076201167 2024-03-19 13:08 UTC +hvd.32044014419220 2022-12-09 18:23 UTC +mdp.39015053262393 2022-12-15 12:43 UTC +mdp.39015060429746 2023-04-18 12:44 UTC +uc1.b2972967 2023-08-31 14:52 UTC +umn.31951000742933f 2023-09-29 21:37 UTC +hvd.hwqu51 2023-02-28 12:31 UTC +mdp.39015060425751 2023-07-06 13:33 UTC +uiuo.ark:/13960/t4qk01n82 2014-06-04 10:01 UTC +njp.32101063578718 2022-07-16 03:13 UTC +mdp.39015009286215 2023-07-29 07:08 UTC +nyp.33433082488895 2024-03-03 11:56 UTC +hvd.32044098628217 2022-12-29 10:25 UTC +hvd.32044043851013 2023-07-09 10:01 UTC +njp.32101076199171 2024-03-19 13:11 UTC +mdp.39015060426742 2023-07-06 13:32 UTC +nyp.33433082488911 2023-09-22 15:27 UTC +njp.32101075672806 2023-04-21 04:17 UTC +mdp.39015033845549 2023-11-26 06:21 UTC +nyp.33433076055809 2022-10-05 13:13 UTC +njp.32101076425980 2023-04-21 11:02 UTC +coo.31924057522082 2023-08-05 16:39 UTC +mdp.39015004858224 2023-11-08 13:59 UTC +coo.31924065856167 2024-02-18 07:26 UTC +uiug.30112042290434 2024-02-20 04:47 UTC +uc1.b3011277 2023-05-26 14:44 UTC +mdp.39015003346247 2023-07-22 14:29 UTC +mdp.39015049192902 2023-07-22 21:13 UTC +njp.32101076379989 2022-05-17 22:44 UTC +njp.32101076533932 2022-05-18 00:35 UTC +chi.78323978 2023-08-07 17:05 UTC +chi.55229744 2024-02-26 06:56 UTC +njp.32101073758805 2023-03-06 23:56 UTC +uc2.ark:/13960/t4bp05b0f 2018-12-08 13:43 UTC +mdp.39015059488877 2023-07-08 05:46 UTC +chi.78323841 2023-11-30 13:22 UTC +mdp.39015005484020 2022-10-02 20:16 UTC +hvd.32044010332070 2023-02-25 03:53 UTC +njp.32101047468010 2023-09-22 21:04 UTC +uc1.b3627386 2022-09-23 11:58 UTC +hvd.32044092658095 2023-01-16 13:14 UTC +hvd.32044014692362 2023-03-21 10:43 UTC +nyp.33433000183008 2023-05-18 23:09 UTC +njp.32101080222720 2023-10-28 19:10 UTC +njp.32101074834787 2023-06-02 09:40 UTC +uc1.31158010000023 2023-08-24 03:53 UTC +nyp.33433067366678 2024-03-04 02:01 UTC +hvd.hwa2b7 2023-03-20 06:00 UTC +hvd.32044103001111 2023-01-17 20:33 UTC +umn.31951t00020309x 2023-10-01 17:01 UTC +mdp.39015008880067 2022-10-12 19:31 UTC +chi.78023993 2023-08-04 04:30 UTC +hvd.32044092738798 2023-01-16 14:43 UTC +nyp.33433081658886 2023-09-21 08:38 UTC +mdp.39015043572422 2024-01-12 15:17 UTC +njp.32101007684655 2024-03-04 21:45 UTC +uc1.32106020079791 2022-10-16 18:19 UTC +hvd.32044092797216 2023-01-16 15:02 UTC +mdp.39015059846678 2022-11-22 09:32 UTC +mdp.39015036664038 2022-10-06 15:36 UTC +chi.78013704 2023-08-04 04:31 UTC +mdp.39015059395619 2023-07-07 08:28 UTC +uc1.b3627386 2022-09-23 11:58 UTC +hvd.32044048963011 2023-10-08 22:39 UTC +nyp.33433081646642 2022-05-01 03:24 UTC +nyp.33433076055809 2022-10-05 13:13 UTC +mdp.39015060429415 2023-04-06 06:50 UTC +hvd.hxkepr 2023-03-03 05:40 UTC +uiug.30112001676896 2024-02-28 02:20 UTC +mdp.39015060429423 2023-04-18 12:44 UTC +chi.12153205 2024-02-12 15:40 UTC +mdp.39015026482151 
2023-07-22 18:55 UTC +mdp.39015063933546 2023-12-31 21:55 UTC +inu.30000099860342 2022-12-20 16:30 UTC +njp.32101076425485 2023-07-06 01:56 UTC +umn.31951p00293997r 2023-01-14 04:12 UTC +wu.89001946482 2023-07-21 14:01 UTC +uc1.b3311895 2023-05-24 06:50 UTC +hvd.32044092624287 2023-01-16 15:26 UTC +uc1.b3627386 2022-09-23 11:58 UTC +uc1.$b31654 2022-05-29 17:18 UTC +inu.30000099671764 2023-01-31 17:30 UTC +hvd.32044094024825 2023-01-16 21:26 UTC +njp.32101020794176 2023-11-01 10:30 UTC +hvd.32044012913166 2022-12-09 14:08 UTC +njp.32101076451556 2023-10-26 04:46 UTC +njp.32101076472966 2022-11-10 16:08 UTC +mdp.39015060429597 2023-04-18 12:44 UTC +coo.31924066328299 2023-12-02 03:03 UTC +coo.31924066328299 2023-12-02 03:03 UTC +njp.32101076180460 2023-07-14 12:36 UTC +mdp.39015012991363 2023-04-05 13:23 UTC +njp.32101075673754 2023-04-21 04:19 UTC +njp.32101075672665 2022-11-05 21:32 UTC +msu.31293018462196 2022-05-28 02:13 UTC +njp.32101077262895 2023-06-30 10:52 UTC +hvd.32044092797182 2023-01-16 15:02 UTC +mdp.39015043572588 2023-08-15 11:16 UTC +nyp.33433081659017 2024-03-05 00:04 UTC +uc1.b2900825 2022-10-17 13:59 UTC +mdp.39015060424127 2023-04-15 03:55 UTC +njp.32101075672905 2022-11-05 23:42 UTC +hvd.32044010396893 2022-12-09 08:13 UTC +mdp.39015010329087 2023-03-31 15:46 UTC +njp.32101076890142 2022-05-18 01:07 UTC +mdp.39015010328121 2023-09-04 14:39 UTC +uc1.b2972398 2023-08-31 14:51 UTC +hvd.ah3rmv 2023-03-18 08:21 UTC +mdp.39015048909868 2022-07-27 21:17 UTC +njp.32101074443415 2023-07-06 19:28 UTC +mdp.39015078153817 2022-12-15 18:56 UTC +hvd.32044040731473 2023-10-08 19:59 UTC +pst.000068744151 2023-12-10 02:20 UTC +njp.32101065266668 2022-09-04 00:16 UTC +njp.32101065266668 2022-09-04 00:16 UTC +hvd.32044086720679 2022-12-10 15:06 UTC +coo.31924057531109 2024-02-18 05:18 UTC +mdp.39015060429332 2023-04-18 12:44 UTC +chi.19606141 2024-02-27 20:53 UTC +hvd.32044010495000 2022-12-31 00:36 UTC +hvd.32044024285587 2022-12-31 02:07 UTC +njp.32101076201175 2024-03-19 13:08 UTC +uva.x030453236 2022-08-26 14:39 UTC +mdp.39015043572661 2022-09-30 17:45 UTC +uc1.b2905409 2022-10-05 17:01 UTC +mdp.39015060429555 2023-04-18 12:44 UTC +mdp.39015060429498 2023-04-15 03:56 UTC +mdp.39015008601109 2022-11-23 18:27 UTC +nyp.33433074853965 2022-12-24 03:37 UTC +njp.32101068158847 2023-05-20 00:32 UTC +hvd.hnlia3 2023-03-19 14:38 UTC +loc.ark:/13960/t3gx4tr12 2011-03-17 01:33 UTC +hvd.tz1l4c 2023-03-21 11:20 UTC +mdp.39015060430108 2023-04-15 03:57 UTC +hvd.tz1l4w 2023-05-31 12:29 UTC +njp.32101010945226 2023-07-05 23:53 UTC +mdp.39015031048211 2023-04-10 09:02 UTC +njp.32101077276523 2023-04-21 17:28 UTC +njp.32101076782703 2023-06-19 12:20 UTC +hvd.32044050831999 2023-03-22 13:37 UTC +mdp.39015012330885 2023-07-05 05:13 UTC +uc1.b3546679 2023-06-11 10:35 UTC +njp.32101075673481 2023-04-21 04:18 UTC +mdp.39015020441104 2022-12-21 00:42 UTC +coo.31924106553286 2023-12-06 22:25 UTC +njp.32101076472792 2023-04-21 11:37 UTC +mdp.39015043572372 2022-09-30 17:45 UTC +hvd.32044050827351 2023-03-22 13:37 UTC +mdp.39015063944592 2022-11-19 07:56 UTC +njp.32101063578650 2023-04-20 09:35 UTC +njp.32101076433125 2023-09-26 15:14 UTC +mdp.39015048893252 2022-12-15 12:05 UTC +coo.31924065856167 2024-02-18 07:26 UTC +njp.32101075716934 2024-03-19 12:06 UTC +coo.31924057525606 2024-02-10 15:13 UTC +coo.31924062186204 2023-12-01 18:53 UTC +uc1.b3385173 2023-06-12 02:44 UTC +mdp.39015059402357 2022-07-27 23:27 UTC +njp.32101075672624 2023-04-21 04:16 UTC +hvd.ah3kfk 2023-03-18 07:39 UTC +njp.32101076795333 
2024-03-19 15:01 UTC +njp.32101023869397 2023-04-20 08:03 UTC +uc1.$b312189 2022-09-12 05:57 UTC +njp.32101007684614 2024-03-04 21:45 UTC +hvd.32044092624352 2023-01-16 13:00 UTC +uva.x000240890 2022-09-18 04:24 UTC +nyp.33433067294433 2023-05-14 08:02 UTC +nyp.33433067294433 2023-05-14 08:02 UTC +coo.31924007186517 2023-11-30 00:54 UTC +njp.32101047468002 2023-09-22 21:04 UTC +hvd.32044009907841 2023-10-07 23:22 UTC +hvd.32044021008149 2023-10-08 11:00 UTC +njp.32101076201183 2023-07-03 04:24 UTC +njp.32101076457744 2023-04-21 11:15 UTC +mdp.39015060430082 2023-07-06 13:29 UTC +uc1.b3385513 2023-12-18 06:59 UTC +mdp.39015053252139 2022-12-16 11:29 UTC +inu.30000092253925 2023-04-10 00:21 UTC +pst.000008820648 2023-08-07 19:42 UTC +mdp.39015008570205 2023-06-05 00:18 UTC +coo.31924075116701 2023-09-27 19:07 UTC +nyp.33433089908747 2023-08-08 14:35 UTC +hvd.32044086791217 2022-12-09 17:53 UTC +njp.32101072577347 2024-03-17 02:19 UTC +njp.32101071985772 2022-12-25 02:44 UTC +mdp.39015056480562 2022-11-24 12:09 UTC +umn.31951002792969k 2022-09-14 23:40 UTC +nyp.33433087345637 2023-05-26 11:58 UTC +mdp.39015022469087 2023-08-06 16:32 UTC +inu.30000084048762 2023-04-21 01:25 UTC +njp.32101023869397 2023-04-20 08:03 UTC +mdp.39015008095047 2022-12-15 21:50 UTC +njp.32101077260600 2022-05-18 01:32 UTC +njp.32101077260618 2022-05-18 01:32 UTC +nyp.33433074380704 2023-07-05 06:03 UTC +inu.30000104007657 2023-03-30 18:06 UTC +njp.32101076457702 2023-04-21 11:15 UTC +mdp.39015043800013 2023-04-13 10:29 UTC +nyp.33433004518415 2024-03-08 20:02 UTC +hvd.32044038399135 2023-01-14 23:49 UTC +njp.32101077288247 2023-04-21 15:12 UTC +njp.32101076199130 2023-07-11 11:08 UTC +njp.32101076530979 2023-05-20 05:04 UTC +uc1.c2608792 2023-09-27 23:40 UTC +njp.32101076530979 2023-05-20 05:04 UTC +nyp.33433074380662 2023-05-31 03:09 UTC +hvd.32044011856838 2022-12-31 00:39 UTC +uc1.b3919785 2022-09-30 00:16 UTC +wu.89001946482 2023-07-21 14:01 UTC +uc1.b3311895 2023-05-24 06:50 UTC +uc2.ark:/13960/t8w95458t 2010-05-03 22:15 UTC +mdp.39015013094217 2023-07-22 15:04 UTC +mdp.39015008305289 2023-06-15 15:36 UTC +njp.32101076530979 2023-05-20 05:04 UTC +hvd.hwp8ba 2023-04-07 15:28 UTC +njp.32101076457066 2023-04-05 18:58 UTC +coo1.ark:/13960/t4bp0n867 2022-11-07 18:14 UTC +njp.32101076530979 2023-05-20 05:04 UTC +uc1.b2972949 2023-05-26 05:19 UTC +njp.32101045352828 2023-09-22 21:20 UTC +njp.32101047467988 2023-09-22 21:04 UTC +hvd.32044086759800 2022-12-10 14:16 UTC +uc1.b3885866 2022-09-29 21:15 UTC +mdp.39015060429357 2023-04-18 12:44 UTC +njp.32101077288569 2022-05-18 02:20 UTC +aeu.ark:/13960/t1pg22p71 2014-09-15 07:08 UTC +nyp.33433082488887 2023-01-24 13:51 UTC +njp.32101064475831 2024-03-16 21:24 UTC +uc1.31175035197097 2022-05-24 06:26 UTC +njp.32101076880150 2023-05-13 23:22 UTC +coo.31924008821047 2022-11-25 23:34 UTC +nyp.33433074380720 2023-06-01 19:48 UTC +hvd.32044038400958 2023-10-08 18:51 UTC +mdp.39015030932753 2022-12-15 07:29 UTC +nyp.33433074380696 2023-07-11 10:18 UTC +njp.32101076889508 2023-10-26 04:28 UTC +njp.32101077288239 2023-04-21 15:12 UTC +hvd.32044098627268 2022-12-29 10:23 UTC +hvd.32044092634013 2023-01-15 12:33 UTC +hvd.32044014683114 2023-10-08 04:35 UTC +hvd.32044058190059 2023-01-15 05:02 UTC +umn.31951002804000l 2024-02-02 16:44 UTC +uc1.$b661479 2023-05-03 21:36 UTC +hvd.32044092645134 2023-01-15 11:07 UTC +mdp.39015060429357 2023-04-18 12:44 UTC +mdp.39015074687149 2023-07-20 18:19 UTC +mdp.39015035805772 2023-01-29 00:15 UTC +pst.000068744458 2023-11-01 15:38 UTC 
+mdp.39015016898432 2023-01-13 01:34 UTC +uc1.b3924130 2022-06-10 04:29 UTC +uc1.b3924129 2022-06-16 11:52 UTC +njp.32101075672509 2023-04-21 04:16 UTC +uc1.b3293449 2022-08-31 05:38 UTC +loc.ark:/13960/t1fj37n7j 2011-03-12 18:03 UTC +loc.ark:/13960/t9280zf6z 2011-03-19 03:14 UTC +njp.32101076200664 2023-04-04 23:21 UTC +njp.32101076403425 2022-05-17 23:10 UTC +mdp.39015060429340 2023-04-18 12:44 UTC +mdp.39015003348201 2023-11-24 05:38 UTC +mdp.39015067091739 2023-04-10 13:54 UTC +mdp.39015059896285 2023-08-06 17:42 UTC +mdp.39015060430397 2023-04-18 12:45 UTC +inu.30000099860565 2022-12-16 15:46 UTC +njp.32101073025528 2023-10-25 16:37 UTC +hvd.32044090276395 2023-06-22 04:37 UTC +hvd.hwilnp 2023-03-20 08:08 UTC +dul1.ark:/13960/t6d23816n 2014-01-10 17:15 UTC +njp.32101075716934 2024-03-19 12:06 UTC +njp.32101075716934 2024-03-19 12:06 UTC +njp.32101076199239 2023-07-04 12:30 UTC +hvd.hnqbsv 2023-03-19 17:15 UTC +nyp.33433081647616 2024-03-01 04:11 UTC +njp.32101037023239 2023-10-28 04:07 UTC +njp.32101037601646 2023-11-01 09:45 UTC +njp.32101063578791 2024-03-06 13:16 UTC +hvd.32044012418034 2023-10-08 02:17 UTC +ucm.5326809190 2022-05-22 09:08 UTC +mdp.39015054289338 2022-11-24 12:41 UTC +njp.32101074443332 2024-03-19 10:54 UTC +njp.32101074443399 2023-07-12 14:30 UTC +njp.32101074443415 2023-07-06 19:28 UTC +njp.32101021580343 2023-04-20 08:03 UTC +inu.30000099860326 2022-12-16 15:47 UTC +njp.32101076041084 2023-08-02 17:35 UTC +njp.32101075672749 2023-04-21 04:17 UTC +njp.32101075729960 2024-03-19 11:48 UTC +hvd.32044098627433 2022-12-29 10:24 UTC +uc1.b3885859 2023-06-02 19:17 UTC +mdp.39015060429530 2023-04-18 12:44 UTC +njp.32101077288213 2023-04-08 12:24 UTC +mdp.39076000323746 2023-08-13 21:22 UTC +inu.32000000683138 2024-01-19 09:06 UTC +mdp.39015027588287 2024-01-25 09:54 UTC +mdp.39015073107768 2024-01-05 21:48 UTC +coo.31924057525671 2024-02-10 13:52 UTC +hvd.32044092711480 2023-01-16 15:29 UTC +uc1.b3924126 2022-06-05 01:58 UTC +hvd.32044038400958 2023-10-08 18:51 UTC +hvd.32044092797232 2023-01-16 15:42 UTC +njp.32101076384435 2023-03-26 09:14 UTC +njp.32101076378189 2023-10-25 19:32 UTC +hvd.32044009957044 2023-10-07 23:28 UTC +njp.32101076378536 2023-10-25 19:32 UTC +hvd.hnqbsx 2023-03-26 02:32 UTC +hvd.32044012418034 2023-10-08 02:17 UTC +nyp.33433081756896 2023-01-18 18:29 UTC +chi.55220547 2024-02-12 15:35 UTC +njp.32101076889979 2023-10-27 10:54 UTC +chi.79213384 2024-02-11 10:28 UTC +uiuo.ark:/13960/t4qk01n82 2014-06-04 10:01 UTC +hvd.32044048962955 2023-02-26 13:35 UTC +hvd.32044092677376 2023-10-10 01:49 UTC +uc1.32106001646766 2023-05-10 19:57 UTC +uc1.32106015528877 2022-06-21 04:43 UTC +nyp.33433081672853 2023-09-21 13:26 UTC +njp.32101076426079 2023-04-21 11:04 UTC +njp.32101076040946 2023-09-26 18:29 UTC +mdp.39015048893823 2023-07-01 02:53 UTC +uc1.b2974316 2023-06-07 20:53 UTC +uc2.ark:/13960/t8ff3wr3q 2010-04-29 11:05 UTC +hvd.hxkepr 2023-03-03 05:40 UTC +njp.32101076457728 2023-04-21 11:15 UTC +coo.31924069259624 2023-10-14 20:07 UTC +hvd.32044092640663 2023-01-17 00:26 UTC +njp.32101075673622 2024-03-19 10:31 UTC +mdp.39015049192910 2023-07-03 08:47 UTC +hvd.32044092796093 2023-01-16 15:01 UTC +mdp.39015030866506 2023-11-08 17:54 UTC +hvd.32044092797190 2023-01-16 15:02 UTC +chi.12755443 2024-02-27 20:53 UTC +hvd.32044048963029 2024-01-03 09:15 UTC +hvd.hnqbsr 2023-03-19 17:15 UTC +mdp.39015048893831 2023-06-27 22:57 UTC +hvd.32044048963136 2024-01-03 09:15 UTC +hvd.32044103001129 2023-01-17 20:00 UTC +hvd.hxe6bx 2023-03-26 04:48 UTC +uiug.30112001676896 
2024-02-28 02:20 UTC +inu.30000099671525 2023-07-04 02:42 UTC +inu.30000099671624 2023-05-30 09:46 UTC +uiug.30112046384886 2022-05-19 10:57 UTC +hvd.hnqbtn 2023-03-19 17:16 UTC +njp.32101065266304 2022-09-03 13:08 UTC +njp.32101076201183 2023-07-03 04:24 UTC +mdp.39015060429381 2023-04-09 19:32 UTC +njp.32101075672871 2023-04-21 04:17 UTC +hvd.32044010396893 2022-12-09 08:13 UTC +uc1.b3385477 2023-06-12 02:37 UTC +nyp.33433082219621 2023-06-01 06:45 UTC +njp.32101064467036 2023-11-02 08:51 UTC +hvd.32044031571342 2023-01-14 20:51 UTC +hvd.32044054989868 2023-10-09 01:31 UTC +inu.30000099671566 2023-05-30 09:46 UTC +mdp.39015060429399 2023-04-18 12:44 UTC +mdp.39015033845689 2023-11-26 10:54 UTC +hvd.32044098628274 2022-12-29 11:19 UTC +mdp.39015059397953 2023-04-13 23:26 UTC +nyp.33433075914071 2023-07-03 02:44 UTC +mdp.39015008095104 2023-07-01 22:59 UTC +njp.32101065270892 2023-07-15 08:12 UTC +hvd.32044092711431 2023-01-16 14:37 UTC +coo.31924066518758 2023-11-19 14:39 UTC +uc1.ax0003129954 2022-07-17 21:13 UTC +njp.32101076472909 2023-07-06 01:55 UTC +mdp.39015060429480 2023-04-15 03:08 UTC +mdp.39015030936325 2023-04-06 02:07 UTC +hvd.32044098641632 2022-12-29 10:48 UTC +njp.32101075672855 2023-07-05 05:32 UTC +chi.79279237 2022-08-29 08:22 UTC +hvd.32044011590692 2022-12-31 01:51 UTC +njp.32101076472917 2022-09-10 13:56 UTC +mdp.39015059402340 2023-02-01 16:00 UTC +umn.31951002792970z 2024-01-25 03:01 UTC +njp.32101076472958 2023-07-11 16:57 UTC +mdp.39015060430116 2023-04-18 12:45 UTC +njp.32101076472933 2022-11-10 16:07 UTC +mdp.39015060430058 2023-04-18 12:44 UTC +mdp.39015060430371 2023-04-18 12:45 UTC +mdp.39015014523602 2022-12-15 04:38 UTC +hvd.32044098641343 2022-12-29 11:26 UTC +njp.32101007893256 2023-08-21 10:56 UTC +njp.32101063551608 2023-03-06 12:40 UTC +hvd.32044092754100 2023-01-16 14:48 UTC +mdp.39015041879613 2008-06-05 03:42 UTC +nyp.33433074829270 2022-05-01 00:20 UTC +njp.32101075672608 2023-04-21 05:02 UTC +hvd.hnqbst 2023-03-19 17:15 UTC +hvd.hxe6bz 2023-03-26 05:00 UTC +hvd.32044010396893 2022-12-09 08:13 UTC +nyp.33433082219902 2023-09-21 16:01 UTC +coo1.ark:/13960/t3st84m4q 2015-07-18 06:04 UTC +mdp.39015036664079 2023-01-27 20:38 UTC +loc.ark:/13960/t0xp7hp6s 2011-03-12 12:24 UTC +uc1.c2641998 2023-12-22 22:52 UTC +hvd.32044092796085 2023-10-10 02:10 UTC +hvd.32044092797208 2023-01-16 15:02 UTC +mdp.39015008095153 2023-01-31 19:47 UTC +mdp.39015060429506 2023-04-09 19:32 UTC +uc1.$b272656 2022-09-14 05:41 UTC +hvd.hnle8h 2023-03-19 15:06 UTC +nyp.33433000182992 2023-05-18 20:49 UTC +inu.30000099671723 2023-05-30 09:46 UTC +mdp.39015060429589 2023-04-18 12:44 UTC +hvd.hnqbtj 2023-03-19 17:15 UTC +mdp.39015060429522 2023-04-09 19:32 UTC +hvd.hxe6c3 2023-03-26 05:00 UTC +nyp.33433074894126 2023-03-06 04:28 UTC +njp.32101075672632 2023-04-21 04:16 UTC +mdp.39015043572539 2022-12-26 14:27 UTC +hvd.hx2hrd 2023-04-04 14:47 UTC +njp.32101047467996 2023-09-22 21:04 UTC +njp.32101047468002 2023-09-22 21:04 UTC +coo1.ark:/13960/t70v9287j 2015-07-18 13:54 UTC +njp.32101077879508 2022-09-07 18:55 UTC +mdp.39015060429449 2023-04-18 12:44 UTC +mdp.39015087701341 2024-03-10 19:04 UTC +hvd.32044074313453 2022-12-09 16:18 UTC +njp.32101076457785 2023-04-21 11:15 UTC +mdp.39015060429548 2023-04-15 03:56 UTC +chi.78013677 2023-08-04 04:31 UTC +mdp.39015015383279 2023-08-04 09:23 UTC +nyp.33433076071004 2022-10-05 14:13 UTC +hvd.hnqbtr 2023-03-26 02:32 UTC +inu.30000099671541 2022-12-16 15:45 UTC +coo1.ark:/13960/t9n30f16x 2015-07-18 20:16 UTC +mdp.39015073107529 2024-01-05 
20:27 UTC +njp.32101063578627 2024-03-06 13:16 UTC +coo.31924062189661 2023-07-08 12:30 UTC +njp.32101076471414 2023-12-24 13:04 UTC +coo.31924066146733 2023-08-11 17:47 UTC +mdp.39015060429431 2023-04-09 19:32 UTC +mdp.39015049192894 2023-08-07 09:02 UTC +uc1.b2972410 2022-10-01 23:20 UTC +mdp.39015062280055 2022-11-26 21:01 UTC +inu.30000099671558 2023-07-10 04:42 UTC +inu.30000104005750 2023-03-30 18:06 UTC +njp.32101077262788 2023-06-02 10:52 UTC +mdp.39015049192928 2023-04-18 07:10 UTC +njp.32101075672616 2023-04-21 04:16 UTC diff --git a/scripts/version-labels/version-labels-2024-03-21.tsv b/scripts/version-labels/version-labels-2024-03-21.tsv new file mode 100644 index 00000000..4c937b6d --- /dev/null +++ b/scripts/version-labels/version-labels-2024-03-21.tsv @@ -0,0 +1,518 @@ +htid version_label +hvd.32044090278565 2022-12-04 12:12 UTC +nyp.33433081683744 2022-11-10 15:09 UTC +uc1.b3924132 2022-06-16 11:33 UTC +mdp.39015026482151 2023-07-22 18:55 UTC +uiug.30112106245936 2024-02-28 15:57 UTC +hvd.32044009576562 2023-10-07 22:52 UTC +nyp.33433067294433 2023-05-14 08:02 UTC +coo.31924065856167 2024-02-18 07:26 UTC +uc1.ax0002627784 2023-10-27 18:39 UTC +wu.89001946482 2023-07-21 14:01 UTC +uc1.b3311895 2023-05-24 06:50 UTC +hvd.32044048963128 2023-10-08 22:39 UTC +njp.32101076199213 2024-03-20 20:42 UTC +coo.31924051399685 2023-08-01 22:52 UTC +njp.32101076472800 2023-04-21 11:37 UTC +njp.32101076472859 2024-03-20 21:48 UTC +nyp.33433074380688 2023-07-08 15:15 UTC +hvd.32044019842491 2023-03-22 10:56 UTC +uc1.32106001559381 2022-07-11 01:01 UTC +mdp.39015024071642 2023-06-27 17:47 UTC +hvd.hx28d7 2023-03-01 03:52 UTC +uva.x002111617 2022-11-11 10:19 UTC +uc1.$b161790 2022-10-17 00:45 UTC +uc1.$b683534 2023-05-04 12:10 UTC +nyp.33433076066723 2022-10-02 13:14 UTC +miun.ajd7522.0001.001 2012-07-26 23:46 UTC +hvd.hxv9b7 2023-01-01 14:29 UTC +coo.31924065856167 2024-02-18 07:26 UTC +uc1.$b275098 2022-09-12 12:04 UTC +nyp.33433081676979 2024-03-01 09:07 UTC +coo.31924066177589 2023-10-03 14:18 UTC +hvd.32044011432754 2023-01-14 16:05 UTC +njp.32101076384609 2023-04-21 10:48 UTC +pst.000020068974 2023-08-10 06:15 UTC +mdp.39015087700681 2021-02-17 20:42 UTC +njp.32101076403078 2024-03-20 22:55 UTC +njp.32101075673655 2023-06-02 07:36 UTC +udel.31741113248746 2016-07-08 20:19 UTC +hvd.hn34f5 2022-12-05 08:54 UTC +inu.30000066028642 2019-05-02 19:27 UTC +uc1.b3794203 2023-05-31 10:25 UTC +uc1.b3794204 2022-06-03 23:44 UTC +inu.30000099671491 2022-12-16 15:44 UTC +hvd.hnqbsu 2023-03-19 17:15 UTC +mdp.39015020696541 2023-07-05 07:31 UTC +njp.32101075672541 2023-04-21 04:15 UTC +mdp.39015060441675 2023-04-15 03:09 UTC +hvd.32044031554363 2022-12-17 13:26 UTC +hvd.32044048963185 2023-10-08 22:38 UTC +njp.32101076403300 2022-10-25 02:18 UTC +coo.31924057525382 2024-02-10 13:52 UTC +coo.31924065580551 2023-09-11 05:00 UTC +mdp.39015008833884 2023-07-25 00:32 UTC +njp.32101010945275 2023-04-20 07:48 UTC +njp.32101076201159 2024-03-19 13:08 UTC +inu.30000099671632 2023-05-30 09:46 UTC +yale.39002004065844 2011-06-01 03:25 UTC +hvd.32044098627870 2022-12-29 10:25 UTC +inu.30000092253941 2022-12-20 16:29 UTC +mdp.39015060429308 2023-04-15 03:56 UTC +mdp.39015060429464 2023-04-06 06:50 UTC +uc1.b3385165 2023-12-18 07:01 UTC +uiug.30112042710548 2024-02-17 18:52 UTC +uc1.b3548551 2022-10-09 05:09 UTC +uc1.b3850894 2022-09-29 04:50 UTC +hvd.hnqbts 2023-03-19 17:16 UTC +mdp.39015024071824 2023-03-12 01:33 UTC +hvd.32044050827351 2023-03-22 13:37 UTC +coo.31924065585840 2023-10-03 14:21 UTC 
+mdp.39015048893195 2023-06-27 08:17 UTC +mdp.39015053252139 2022-12-16 11:29 UTC +mdp.39015059489032 2023-07-08 05:46 UTC +uc1.b3385486 2023-06-12 02:37 UTC +inu.30000099671665 2022-12-16 15:45 UTC +mdp.39015060425942 2022-10-30 02:47 UTC +coo.31924066146733 2023-08-11 17:47 UTC +uc1.b2905408 2022-10-17 15:18 UTC +hvd.32044010335081 2022-12-28 23:29 UTC +coo.31924057525861 2024-02-10 13:52 UTC +njp.32101076201167 2024-03-19 13:08 UTC +hvd.32044014419220 2022-12-09 18:23 UTC +mdp.39015053262393 2022-12-15 12:43 UTC +mdp.39015060429746 2023-04-18 12:44 UTC +uc1.b2972967 2023-08-31 14:52 UTC +umn.31951000742933f 2023-09-29 21:37 UTC +hvd.hwqu51 2023-02-28 12:31 UTC +mdp.39015060425751 2023-07-06 13:33 UTC +uiuo.ark:/13960/t4qk01n82 2014-06-04 10:01 UTC +njp.32101063578718 2022-07-16 03:13 UTC +mdp.39015009286215 2023-07-29 07:08 UTC +nyp.33433082488895 2024-03-03 11:56 UTC +hvd.32044098628217 2022-12-29 10:25 UTC +hvd.32044043851013 2023-07-09 10:01 UTC +njp.32101076199171 2024-03-19 13:11 UTC +mdp.39015060426742 2023-07-06 13:32 UTC +nyp.33433082488911 2023-09-22 15:27 UTC +njp.32101075672806 2023-04-21 04:17 UTC +mdp.39015033845549 2023-11-26 06:21 UTC +nyp.33433076055809 2022-10-05 13:13 UTC +njp.32101076425980 2023-04-21 11:02 UTC +coo.31924057522082 2023-08-05 16:39 UTC +mdp.39015004858224 2023-11-08 13:59 UTC +coo.31924065856167 2024-02-18 07:26 UTC +uiug.30112042290434 2024-02-20 04:47 UTC +uc1.b3011277 2023-05-26 14:44 UTC +mdp.39015003346247 2023-07-22 14:29 UTC +mdp.39015049192902 2023-07-22 21:13 UTC +njp.32101076379989 2024-03-20 19:59 UTC +njp.32101076533932 2024-03-21 00:58 UTC +chi.78323978 2023-08-07 17:05 UTC +chi.55229744 2024-02-26 06:56 UTC +njp.32101073758805 2023-03-06 23:56 UTC +uc2.ark:/13960/t4bp05b0f 2018-12-08 13:43 UTC +mdp.39015059488877 2023-07-08 05:46 UTC +chi.78323841 2023-11-30 13:22 UTC +mdp.39015005484020 2022-10-02 20:16 UTC +hvd.32044010332070 2023-02-25 03:53 UTC +njp.32101047468010 2023-09-22 21:04 UTC +uc1.b3627386 2022-09-23 11:58 UTC +hvd.32044092658095 2023-01-16 13:14 UTC +hvd.32044014692362 2023-03-21 10:43 UTC +nyp.33433000183008 2023-05-18 23:09 UTC +njp.32101080222720 2023-10-28 19:10 UTC +njp.32101074834787 2023-06-02 09:40 UTC +uc1.31158010000023 2023-08-24 03:53 UTC +nyp.33433067366678 2024-03-04 02:01 UTC +hvd.hwa2b7 2023-03-20 06:00 UTC +hvd.32044103001111 2023-01-17 20:33 UTC +umn.31951t00020309x 2023-10-01 17:01 UTC +mdp.39015008880067 2022-10-12 19:31 UTC +chi.78023993 2023-08-04 04:30 UTC +hvd.32044092738798 2023-01-16 14:43 UTC +nyp.33433081658886 2023-09-21 08:38 UTC +mdp.39015043572422 2024-01-12 15:17 UTC +njp.32101007684655 2024-03-04 21:45 UTC +uc1.32106020079791 2022-10-16 18:19 UTC +hvd.32044092797216 2023-01-16 15:02 UTC +mdp.39015059846678 2022-11-22 09:32 UTC +mdp.39015036664038 2022-10-06 15:36 UTC +chi.78013704 2023-08-04 04:31 UTC +mdp.39015059395619 2023-07-07 08:28 UTC +uc1.b3627386 2022-09-23 11:58 UTC +hvd.32044048963011 2023-10-08 22:39 UTC +nyp.33433081646642 2022-05-01 03:24 UTC +nyp.33433076055809 2022-10-05 13:13 UTC +mdp.39015060429415 2023-04-06 06:50 UTC +hvd.hxkepr 2023-03-03 05:40 UTC +uiug.30112001676896 2024-02-28 02:20 UTC +mdp.39015060429423 2023-04-18 12:44 UTC +chi.12153205 2024-02-12 15:40 UTC +mdp.39015026482151 2023-07-22 18:55 UTC +mdp.39015063933546 2023-12-31 21:55 UTC +inu.30000099860342 2022-12-20 16:30 UTC +njp.32101076425485 2023-07-06 01:56 UTC +umn.31951p00293997r 2023-01-14 04:12 UTC +wu.89001946482 2023-07-21 14:01 UTC +uc1.b3311895 2023-05-24 06:50 UTC +hvd.32044092624287 2023-01-16 
15:26 UTC +uc1.b3627386 2022-09-23 11:58 UTC +uc1.$b31654 2022-05-29 17:18 UTC +inu.30000099671764 2023-01-31 17:30 UTC +hvd.32044094024825 2023-01-16 21:26 UTC +njp.32101020794176 2023-11-01 10:30 UTC +hvd.32044012913166 2022-12-09 14:08 UTC +njp.32101076451556 2023-10-26 04:46 UTC +njp.32101076472966 2022-11-10 16:08 UTC +mdp.39015060429597 2023-04-18 12:44 UTC +coo.31924066328299 2023-12-02 03:03 UTC +coo.31924066328299 2023-12-02 03:03 UTC +njp.32101076180460 2023-07-14 12:36 UTC +mdp.39015012991363 2023-04-05 13:23 UTC +njp.32101075673754 2023-04-21 04:19 UTC +njp.32101075672665 2022-11-05 21:32 UTC +msu.31293018462196 2022-05-28 02:13 UTC +njp.32101077262895 2023-06-30 10:52 UTC +hvd.32044092797182 2023-01-16 15:02 UTC +mdp.39015043572588 2023-08-15 11:16 UTC +nyp.33433081659017 2024-03-05 00:04 UTC +uc1.b2900825 2022-10-17 13:59 UTC +mdp.39015060424127 2023-04-15 03:55 UTC +njp.32101075672905 2022-11-05 23:42 UTC +hvd.32044010396893 2022-12-09 08:13 UTC +mdp.39015010329087 2023-03-31 15:46 UTC +njp.32101076890142 2024-03-21 00:18 UTC +mdp.39015010328121 2023-09-04 14:39 UTC +uc1.b2972398 2023-08-31 14:51 UTC +hvd.ah3rmv 2023-03-18 08:21 UTC +mdp.39015048909868 2022-07-27 21:17 UTC +njp.32101074443415 2023-07-06 19:28 UTC +mdp.39015078153817 2022-12-15 18:56 UTC +hvd.32044040731473 2023-10-08 19:59 UTC +pst.000068744151 2023-12-10 02:20 UTC +njp.32101065266668 2022-09-04 00:16 UTC +njp.32101065266668 2022-09-04 00:16 UTC +hvd.32044086720679 2022-12-10 15:06 UTC +coo.31924057531109 2024-02-18 05:18 UTC +mdp.39015060429332 2023-04-18 12:44 UTC +chi.19606141 2024-02-27 20:53 UTC +hvd.32044010495000 2022-12-31 00:36 UTC +hvd.32044024285587 2022-12-31 02:07 UTC +njp.32101076201175 2024-03-19 13:08 UTC +uva.x030453236 2022-08-26 14:39 UTC +mdp.39015043572661 2022-09-30 17:45 UTC +uc1.b2905409 2022-10-05 17:01 UTC +mdp.39015060429555 2023-04-18 12:44 UTC +mdp.39015060429498 2023-04-15 03:56 UTC +mdp.39015008601109 2022-11-23 18:27 UTC +nyp.33433074853965 2022-12-24 03:37 UTC +njp.32101068158847 2023-05-20 00:32 UTC +hvd.hnlia3 2023-03-19 14:38 UTC +loc.ark:/13960/t3gx4tr12 2011-03-17 01:33 UTC +hvd.tz1l4c 2023-03-21 11:20 UTC +mdp.39015060430108 2023-04-15 03:57 UTC +hvd.tz1l4w 2023-05-31 12:29 UTC +njp.32101010945226 2023-07-05 23:53 UTC +mdp.39015031048211 2023-04-10 09:02 UTC +njp.32101077276523 2023-04-21 17:28 UTC +njp.32101076782703 2023-06-19 12:20 UTC +hvd.32044050831999 2023-03-22 13:37 UTC +mdp.39015012330885 2023-07-05 05:13 UTC +uc1.b3546679 2023-06-11 10:35 UTC +njp.32101075673481 2023-04-21 04:18 UTC +mdp.39015020441104 2022-12-21 00:42 UTC +coo.31924106553286 2023-12-06 22:25 UTC +njp.32101076472792 2023-04-21 11:37 UTC +mdp.39015043572372 2022-09-30 17:45 UTC +hvd.32044050827351 2023-03-22 13:37 UTC +mdp.39015063944592 2022-11-19 07:56 UTC +njp.32101063578650 2023-04-20 09:35 UTC +njp.32101076433125 2023-09-26 15:14 UTC +mdp.39015048893252 2022-12-15 12:05 UTC +coo.31924065856167 2024-02-18 07:26 UTC +njp.32101075716934 2024-03-19 12:06 UTC +coo.31924057525606 2024-02-10 15:13 UTC +coo.31924062186204 2023-12-01 18:53 UTC +uc1.b3385173 2023-06-12 02:44 UTC +mdp.39015059402357 2022-07-27 23:27 UTC +njp.32101075672624 2023-04-21 04:16 UTC +hvd.ah3kfk 2023-03-18 07:39 UTC +njp.32101076795333 2024-03-19 15:01 UTC +njp.32101023869397 2023-04-20 08:03 UTC +uc1.$b312189 2022-09-12 05:57 UTC +njp.32101007684614 2024-03-04 21:45 UTC +hvd.32044092624352 2023-01-16 13:00 UTC +uva.x000240890 2022-09-18 04:24 UTC +nyp.33433067294433 2023-05-14 08:02 UTC +nyp.33433067294433 2023-05-14 
08:02 UTC +coo.31924007186517 2023-11-30 00:54 UTC +njp.32101047468002 2023-09-22 21:04 UTC +hvd.32044009907841 2023-10-07 23:22 UTC +hvd.32044021008149 2023-10-08 11:00 UTC +njp.32101076201183 2023-07-03 04:24 UTC +njp.32101076457744 2023-04-21 11:15 UTC +mdp.39015060430082 2023-07-06 13:29 UTC +uc1.b3385513 2023-12-18 06:59 UTC +mdp.39015053252139 2022-12-16 11:29 UTC +inu.30000092253925 2023-04-10 00:21 UTC +pst.000008820648 2023-08-07 19:42 UTC +mdp.39015008570205 2023-06-05 00:18 UTC +coo.31924075116701 2023-09-27 19:07 UTC +nyp.33433089908747 2023-08-08 14:35 UTC +hvd.32044086791217 2022-12-09 17:53 UTC +njp.32101072577347 2024-03-17 02:19 UTC +njp.32101071985772 2022-12-25 02:44 UTC +mdp.39015056480562 2022-11-24 12:09 UTC +umn.31951002792969k 2022-09-14 23:40 UTC +nyp.33433087345637 2023-05-26 11:58 UTC +mdp.39015022469087 2023-08-06 16:32 UTC +inu.30000084048762 2023-04-21 01:25 UTC +njp.32101023869397 2023-04-20 08:03 UTC +mdp.39015008095047 2022-12-15 21:50 UTC +njp.32101077260600 2024-03-20 23:52 UTC +njp.32101077260618 2024-03-20 23:52 UTC +nyp.33433074380704 2023-07-05 06:03 UTC +inu.30000104007657 2023-03-30 18:06 UTC +njp.32101076457702 2023-04-21 11:15 UTC +mdp.39015043800013 2023-04-13 10:29 UTC +nyp.33433004518415 2024-03-08 20:02 UTC +hvd.32044038399135 2023-01-14 23:49 UTC +njp.32101077288247 2023-04-21 15:12 UTC +njp.32101076199130 2023-07-11 11:08 UTC +njp.32101076530979 2023-05-20 05:04 UTC +uc1.c2608792 2023-09-27 23:40 UTC +njp.32101076530979 2023-05-20 05:04 UTC +nyp.33433074380662 2023-05-31 03:09 UTC +hvd.32044011856838 2022-12-31 00:39 UTC +uc1.b3919785 2022-09-30 00:16 UTC +wu.89001946482 2023-07-21 14:01 UTC +uc1.b3311895 2023-05-24 06:50 UTC +uc2.ark:/13960/t8w95458t 2010-05-03 22:15 UTC +mdp.39015013094217 2023-07-22 15:04 UTC +mdp.39015008305289 2023-06-15 15:36 UTC +njp.32101076530979 2023-05-20 05:04 UTC +hvd.hwp8ba 2023-04-07 15:28 UTC +njp.32101076457066 2023-04-05 18:58 UTC +coo1.ark:/13960/t4bp0n867 2022-11-07 18:14 UTC +njp.32101076530979 2023-05-20 05:04 UTC +uc1.b2972949 2023-05-26 05:19 UTC +njp.32101045352828 2023-09-22 21:20 UTC +njp.32101047467988 2023-09-22 21:04 UTC +hvd.32044086759800 2022-12-10 14:16 UTC +uc1.b3885866 2022-09-29 21:15 UTC +mdp.39015060429357 2023-04-18 12:44 UTC +njp.32101077288569 2024-03-20 23:10 UTC +aeu.ark:/13960/t1pg22p71 2014-09-15 07:08 UTC +nyp.33433082488887 2023-01-24 13:51 UTC +njp.32101064475831 2024-03-16 21:24 UTC +uc1.31175035197097 2022-05-24 06:26 UTC +njp.32101076880150 2023-05-13 23:22 UTC +coo.31924008821047 2022-11-25 23:34 UTC +nyp.33433074380720 2023-06-01 19:48 UTC +hvd.32044038400958 2023-10-08 18:51 UTC +mdp.39015030932753 2022-12-15 07:29 UTC +nyp.33433074380696 2023-07-11 10:18 UTC +njp.32101076889508 2023-10-26 04:28 UTC +njp.32101077288239 2023-04-21 15:12 UTC +hvd.32044098627268 2022-12-29 10:23 UTC +hvd.32044092634013 2023-01-15 12:33 UTC +hvd.32044014683114 2023-10-08 04:35 UTC +hvd.32044058190059 2023-01-15 05:02 UTC +umn.31951002804000l 2024-02-02 16:44 UTC +uc1.$b661479 2023-05-03 21:36 UTC +hvd.32044092645134 2023-01-15 11:07 UTC +mdp.39015060429357 2023-04-18 12:44 UTC +mdp.39015074687149 2023-07-20 18:19 UTC +mdp.39015035805772 2023-01-29 00:15 UTC +pst.000068744458 2023-11-01 15:38 UTC +mdp.39015016898432 2023-01-13 01:34 UTC +uc1.b3924130 2022-06-10 04:29 UTC +uc1.b3924129 2022-06-16 11:52 UTC +njp.32101075672509 2023-04-21 04:16 UTC +uc1.b3293449 2022-08-31 05:38 UTC +loc.ark:/13960/t1fj37n7j 2011-03-12 18:03 UTC +loc.ark:/13960/t9280zf6z 2011-03-19 03:14 UTC +njp.32101076200664 
2023-04-04 23:21 UTC +njp.32101076403425 2024-03-20 22:53 UTC +mdp.39015060429340 2023-04-18 12:44 UTC +mdp.39015003348201 2023-11-24 05:38 UTC +mdp.39015067091739 2023-04-10 13:54 UTC +mdp.39015059896285 2023-08-06 17:42 UTC +mdp.39015060430397 2023-04-18 12:45 UTC +inu.30000099860565 2022-12-16 15:46 UTC +njp.32101073025528 2023-10-25 16:37 UTC +hvd.32044090276395 2023-06-22 04:37 UTC +hvd.hwilnp 2023-03-20 08:08 UTC +dul1.ark:/13960/t6d23816n 2014-01-10 17:15 UTC +njp.32101075716934 2024-03-19 12:06 UTC +njp.32101075716934 2024-03-19 12:06 UTC +njp.32101076199239 2023-07-04 12:30 UTC +hvd.hnqbsv 2023-03-19 17:15 UTC +nyp.33433081647616 2024-03-01 04:11 UTC +njp.32101037023239 2023-10-28 04:07 UTC +njp.32101037601646 2023-11-01 09:45 UTC +njp.32101063578791 2024-03-06 13:16 UTC +hvd.32044012418034 2023-10-08 02:17 UTC +ucm.5326809190 2022-05-22 09:08 UTC +mdp.39015054289338 2022-11-24 12:41 UTC +njp.32101074443332 2024-03-19 10:54 UTC +njp.32101074443399 2023-07-12 14:30 UTC +njp.32101074443415 2023-07-06 19:28 UTC +njp.32101021580343 2023-04-20 08:03 UTC +inu.30000099860326 2022-12-16 15:47 UTC +njp.32101076041084 2023-08-02 17:35 UTC +njp.32101075672749 2023-04-21 04:17 UTC +njp.32101075729960 2024-03-19 11:48 UTC +hvd.32044098627433 2022-12-29 10:24 UTC +uc1.b3885859 2023-06-02 19:17 UTC +mdp.39015060429530 2023-04-18 12:44 UTC +njp.32101077288213 2023-04-08 12:24 UTC +mdp.39076000323746 2023-08-13 21:22 UTC +inu.32000000683138 2024-01-19 09:06 UTC +mdp.39015027588287 2024-01-25 09:54 UTC +mdp.39015073107768 2024-01-05 21:48 UTC +coo.31924057525671 2024-02-10 13:52 UTC +hvd.32044092711480 2023-01-16 15:29 UTC +uc1.b3924126 2022-06-05 01:58 UTC +hvd.32044038400958 2023-10-08 18:51 UTC +hvd.32044092797232 2023-01-16 15:42 UTC +njp.32101076384435 2023-03-26 09:14 UTC +njp.32101076378189 2023-10-25 19:32 UTC +hvd.32044009957044 2023-10-07 23:28 UTC +njp.32101076378536 2023-10-25 19:32 UTC +hvd.hnqbsx 2023-03-26 02:32 UTC +hvd.32044012418034 2023-10-08 02:17 UTC +nyp.33433081756896 2023-01-18 18:29 UTC +chi.55220547 2024-02-12 15:35 UTC +njp.32101076889979 2023-10-27 10:54 UTC +chi.79213384 2024-02-11 10:28 UTC +uiuo.ark:/13960/t4qk01n82 2014-06-04 10:01 UTC +hvd.32044048962955 2023-02-26 13:35 UTC +hvd.32044092677376 2023-10-10 01:49 UTC +uc1.32106001646766 2023-05-10 19:57 UTC +uc1.32106015528877 2022-06-21 04:43 UTC +nyp.33433081672853 2023-09-21 13:26 UTC +njp.32101076426079 2023-04-21 11:04 UTC +njp.32101076040946 2023-09-26 18:29 UTC +mdp.39015048893823 2023-07-01 02:53 UTC +uc1.b2974316 2023-06-07 20:53 UTC +uc2.ark:/13960/t8ff3wr3q 2010-04-29 11:05 UTC +hvd.hxkepr 2023-03-03 05:40 UTC +njp.32101076457728 2023-04-21 11:15 UTC +coo.31924069259624 2023-10-14 20:07 UTC +hvd.32044092640663 2023-01-17 00:26 UTC +njp.32101075673622 2024-03-19 10:31 UTC +mdp.39015049192910 2023-07-03 08:47 UTC +hvd.32044092796093 2023-01-16 15:01 UTC +mdp.39015030866506 2023-11-08 17:54 UTC +hvd.32044092797190 2023-01-16 15:02 UTC +chi.12755443 2024-02-27 20:53 UTC +hvd.32044048963029 2024-01-03 09:15 UTC +hvd.hnqbsr 2023-03-19 17:15 UTC +mdp.39015048893831 2023-06-27 22:57 UTC +hvd.32044048963136 2024-01-03 09:15 UTC +hvd.32044103001129 2023-01-17 20:00 UTC +hvd.hxe6bx 2023-03-26 04:48 UTC +uiug.30112001676896 2024-02-28 02:20 UTC +inu.30000099671525 2023-07-04 02:42 UTC +inu.30000099671624 2023-05-30 09:46 UTC +uiug.30112046384886 2022-05-19 10:57 UTC +hvd.hnqbtn 2023-03-19 17:16 UTC +njp.32101065266304 2022-09-03 13:08 UTC +njp.32101076201183 2023-07-03 04:24 UTC +mdp.39015060429381 2023-04-09 19:32 UTC 
+njp.32101075672871 2023-04-21 04:17 UTC +hvd.32044010396893 2022-12-09 08:13 UTC +uc1.b3385477 2023-06-12 02:37 UTC +nyp.33433082219621 2023-06-01 06:45 UTC +njp.32101064467036 2023-11-02 08:51 UTC +hvd.32044031571342 2023-01-14 20:51 UTC +hvd.32044054989868 2023-10-09 01:31 UTC +inu.30000099671566 2023-05-30 09:46 UTC +mdp.39015060429399 2023-04-18 12:44 UTC +mdp.39015033845689 2023-11-26 10:54 UTC +hvd.32044098628274 2022-12-29 11:19 UTC +mdp.39015059397953 2023-04-13 23:26 UTC +nyp.33433075914071 2023-07-03 02:44 UTC +mdp.39015008095104 2023-07-01 22:59 UTC +njp.32101065270892 2023-07-15 08:12 UTC +hvd.32044092711431 2023-01-16 14:37 UTC +coo.31924066518758 2023-11-19 14:39 UTC +uc1.ax0003129954 2022-07-17 21:13 UTC +njp.32101076472909 2023-07-06 01:55 UTC +mdp.39015060429480 2023-04-15 03:08 UTC +mdp.39015030936325 2023-04-06 02:07 UTC +hvd.32044098641632 2022-12-29 10:48 UTC +njp.32101075672855 2023-07-05 05:32 UTC +chi.79279237 2022-08-29 08:22 UTC +hvd.32044011590692 2022-12-31 01:51 UTC +njp.32101076472917 2022-09-10 13:56 UTC +mdp.39015059402340 2023-02-01 16:00 UTC +umn.31951002792970z 2024-01-25 03:01 UTC +njp.32101076472958 2023-07-11 16:57 UTC +mdp.39015060430116 2023-04-18 12:45 UTC +njp.32101076472933 2022-11-10 16:07 UTC +mdp.39015060430058 2023-04-18 12:44 UTC +mdp.39015060430371 2023-04-18 12:45 UTC +mdp.39015014523602 2022-12-15 04:38 UTC +hvd.32044098641343 2022-12-29 11:26 UTC +njp.32101007893256 2023-08-21 10:56 UTC +njp.32101063551608 2023-03-06 12:40 UTC +hvd.32044092754100 2023-01-16 14:48 UTC +mdp.39015041879613 2008-06-05 03:42 UTC +nyp.33433074829270 2022-05-01 00:20 UTC +njp.32101075672608 2023-04-21 05:02 UTC +hvd.hnqbst 2023-03-19 17:15 UTC +hvd.hxe6bz 2023-03-26 05:00 UTC +hvd.32044010396893 2022-12-09 08:13 UTC +nyp.33433082219902 2023-09-21 16:01 UTC +coo1.ark:/13960/t3st84m4q 2015-07-18 06:04 UTC +mdp.39015036664079 2023-01-27 20:38 UTC +loc.ark:/13960/t0xp7hp6s 2011-03-12 12:24 UTC +uc1.c2641998 2023-12-22 22:52 UTC +hvd.32044092796085 2023-10-10 02:10 UTC +hvd.32044092797208 2023-01-16 15:02 UTC +mdp.39015008095153 2023-01-31 19:47 UTC +mdp.39015060429506 2023-04-09 19:32 UTC +uc1.$b272656 2022-09-14 05:41 UTC +hvd.hnle8h 2023-03-19 15:06 UTC +nyp.33433000182992 2023-05-18 20:49 UTC +inu.30000099671723 2023-05-30 09:46 UTC +mdp.39015060429589 2023-04-18 12:44 UTC +hvd.hnqbtj 2023-03-19 17:15 UTC +mdp.39015060429522 2023-04-09 19:32 UTC +hvd.hxe6c3 2023-03-26 05:00 UTC +nyp.33433074894126 2023-03-06 04:28 UTC +njp.32101075672632 2023-04-21 04:16 UTC +mdp.39015043572539 2022-12-26 14:27 UTC +hvd.hx2hrd 2023-04-04 14:47 UTC +njp.32101047467996 2023-09-22 21:04 UTC +njp.32101047468002 2023-09-22 21:04 UTC +coo1.ark:/13960/t70v9287j 2015-07-18 13:54 UTC +njp.32101077879508 2022-09-07 18:55 UTC +mdp.39015060429449 2023-04-18 12:44 UTC +mdp.39015087701341 2024-03-10 19:04 UTC +hvd.32044074313453 2022-12-09 16:18 UTC +njp.32101076457785 2023-04-21 11:15 UTC +mdp.39015060429548 2023-04-15 03:56 UTC +chi.78013677 2023-08-04 04:31 UTC +mdp.39015015383279 2023-08-04 09:23 UTC +nyp.33433076071004 2022-10-05 14:13 UTC +hvd.hnqbtr 2023-03-26 02:32 UTC +inu.30000099671541 2022-12-16 15:45 UTC +coo1.ark:/13960/t9n30f16x 2015-07-18 20:16 UTC +mdp.39015073107529 2024-01-05 20:27 UTC +njp.32101063578627 2024-03-06 13:16 UTC +coo.31924062189661 2023-07-08 12:30 UTC +njp.32101076471414 2023-12-24 13:04 UTC +coo.31924066146733 2023-08-11 17:47 UTC +mdp.39015060429431 2023-04-09 19:32 UTC +mdp.39015049192894 2023-08-07 09:02 UTC +uc1.b2972410 2022-10-01 23:20 UTC 
+mdp.39015062280055 2022-11-26 21:01 UTC +inu.30000099671558 2023-07-10 04:42 UTC +inu.30000104005750 2023-03-30 18:06 UTC +njp.32101077262788 2023-06-02 10:52 UTC +mdp.39015049192928 2023-04-18 07:10 UTC +njp.32101075672616 2023-04-21 04:16 UTC From bd7a6b1c6f1c54debc54daf979d5a60780a9d017 Mon Sep 17 00:00:00 2001 From: rlskoeser Date: Thu, 21 Mar 2024 18:19:24 -0400 Subject: [PATCH 26/71] Customize fields when saving as new #591 --- ppa/archive/admin.py | 45 ++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 45 insertions(+) diff --git a/ppa/archive/admin.py b/ppa/archive/admin.py index 6849fbe9..e7931162 100644 --- a/ppa/archive/admin.py +++ b/ppa/archive/admin.py @@ -177,9 +177,54 @@ def source_link(self, obj): source_link.short_description = "Source id" source_link.admin_order_field = "source_id" + def change_view(self, request, object_id, form_url="", extra_context=None): + # customize behavior when copying a record and saving as new + if request.POST.get("_saveasnew"): + # if source is unset, this means we are loading the "save as new" + # form for a hathitrust record + if not request.POST.get("source"): + # customize save as new field contents + instance = DigitizedWork.objects.get(pk=object_id) + # make a copy of the querydict so we can update it + post_params = request.POST.copy() + # read-only fields should be preserved + post_params["source"] = instance.source + post_params["source_id"] = instance.source_id + post_params["source_url"] = instance.source_url + post_params["record_id"] = instance.record_id + # clear out fields that should be changed when excerpting + clear_fields = [ + "title", + "sort_title", + "author", + "pages_orig", + "pages_digital", + # "page_count", # read-only, does not automatically propagate + "notes", + "public_notes", + "collections", + "cluster", + ] + for field in clear_fields: + try: + del post_params[field] + except KeyError: + pass + + # update request with our modified post parameters + request.POST = post_params + + return super().change_view( + request, + object_id, + form_url, + extra_context=extra_context, + ) + def save_model(self, request, obj, form, change): """Note any fields in the protected list that have been changed in the admin and preserve in database.""" + # If new object, created from scratch, nothing to track and preserve # or if item is not a HathiTrust item, save and return if not change or obj.source != DigitizedWork.HATHI: From c34c92db5a85de3de377c2aafca9856286493f5e Mon Sep 17 00:00:00 2001 From: rlskoeser Date: Thu, 21 Mar 2024 18:20:06 -0400 Subject: [PATCH 27/71] Handle excerpt page count & indexing when saving new records #591 --- ppa/archive/models.py | 15 +++++++++++---- 1 file changed, 11 insertions(+), 4 deletions(-) diff --git a/ppa/archive/models.py b/ppa/archive/models.py index 365ff996..f8c87939 100644 --- a/ppa/archive/models.py +++ b/ppa/archive/models.py @@ -598,18 +598,25 @@ def save(self, *args, **kwargs): self.source_id = new_source_id self.pages_digital = new_pages_digital - if self.has_changed("pages_digital"): + # if excerpt page range has changed + # OR this is a new record with a page range + if self.has_changed("pages_digital") or ( + self.pk is None and self.pages_digital + ): # update the page count if possible (i.e., not a Gale record) self.page_count = self.count_pages() - # if there is a page range set, update page count and index - if self.pages_digital: + # if page range changed on existing record, clear out old index + if self.pages_digital and self.pk is not None: # update index 
to remove all pages that are no longer in range self.solr.update.delete_by_query( 'source_id:"%s" AND item_type:page NOT order:(%s)' % (self.source_id, " OR ".join(str(p) for p in self.page_span)) ) # any page range change requires reindexing (potentially slow) - logger.debug("Reindexing pages for %s after change to page range", self) + if self.pk is None: + logger.debug("Indexing pages for new excerpt %s", self) + else: + logger.debug("Reindexing pages for %s after change to page range", self) self.index_items(Page.page_index_data(self)) # NOTE: removing a page range may not work as expected # (does not recalculate page count; cannot recalculate for Gale items) From 191e84b9535823c9d1bd57afaaa7d2c5d24b2db5 Mon Sep 17 00:00:00 2001 From: rlskoeser Date: Fri, 22 Mar 2024 11:18:07 -0400 Subject: [PATCH 28/71] Update unit tests and remove redundant excerpt logic --- .../management/commands/hathi_excerpt.py | 19 +++++++------------ ppa/archive/tests/test_views.py | 4 +++- 2 files changed, 10 insertions(+), 13 deletions(-) diff --git a/ppa/archive/management/commands/hathi_excerpt.py b/ppa/archive/management/commands/hathi_excerpt.py index ea1fb458..38aa9585 100644 --- a/ppa/archive/management/commands/hathi_excerpt.py +++ b/ppa/archive/management/commands/hathi_excerpt.py @@ -41,7 +41,7 @@ from django.core.management.base import BaseCommand, CommandError from parasolr.django.signals import IndexableSignalHandler -from ppa.archive.models import Collection, DigitizedWork, Page +from ppa.archive.models import Collection, DigitizedWork logger = logging.getLogger(__name__) @@ -83,7 +83,8 @@ def handle(self, *args, **kwargs): self.excerpt(row) self.stdout.write( - "\nExcerpted {excerpted:,d} existing records; created {created:,d} new excerpts. {error:,d} errors.".format_map( + "\nExcerpted {excerpted:,d} existing records; " + + "created {created:,d} new excerpts. {error:,d} errors.".format_map( self.stats ) ) @@ -146,12 +147,9 @@ def excerpt(self, row): digwork.public_notes = row.get("Public Notes", "") try: - # Calculate & save number of pages based on page range. - # (automatically calculated on save for excerpt but not - # for newly created items) - # Could trigger parse error if page span is invalid. - digwork.page_count = digwork.count_pages() # save to create or update in the database + # page count is automatically calculated on save for excerpts + # Could trigger parse error if page span is invalid. digwork.save() except intspan.ParseError as err: self.stderr.write( @@ -173,14 +171,11 @@ def excerpt(self, row): if created: self.stats["created"] += 1 - # any page range change requires reindexing (potentially slow) - logger.debug("Indexing pages for new excerpt %s", digwork) - DigitizedWork.index_items(Page.page_index_data(digwork)) - + # pages are automatically indexed when saving a new excerpt else: + self.stats["excerpted"] += 1 # Indexed pages are automatically updated for existing records on save # when page range has changed. 
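A minimal sketch of the save-time behaviour these comments rely on, introduced by the DigitizedWork.save() change in the previous patch (the page range shown is illustrative, and the sketch assumes the volume's page data is available locally so count_pages() can succeed):

    # sketch only: with the updated save(), adjusting the digital page range
    # is enough; page_count is recalculated and the excerpt's pages are
    # reindexed without any explicit indexing call from this command
    digwork.pages_digital = "12-22"
    digwork.save()
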
- self.stats["excerpted"] += 1 DigitizedWork.index_items([digwork]) diff --git a/ppa/archive/tests/test_views.py b/ppa/archive/tests/test_views.py index 8cfef2b0..18683a59 100644 --- a/ppa/archive/tests/test_views.py +++ b/ppa/archive/tests/test_views.py @@ -170,8 +170,10 @@ def test_anonymous_display_excerpt_hathi(self): response, hathi_page_url(excerpt.source_id, excerpt.first_page()) ) - def test_anonymous_display_excerpt_gale(self): + @patch("ppa.archive.models.DigitizedWork.index_items") + def test_anonymous_display_excerpt_gale(self, mock_index_items): # create a gale excerpt to test link logic + # patch index_items to skip attempting to index pages excerpt = DigitizedWork.objects.create( source_id="abc.1234", source_url="https://hdl.example.co/9823/abc.1234", From c82fd2f95a5e2ec34fb57cd8580a789c497fce8d Mon Sep 17 00:00:00 2001 From: rlskoeser Date: Fri, 22 Mar 2024 11:51:02 -0400 Subject: [PATCH 29/71] Update tests so creating test excerpts works without hathi data --- ppa/archive/management/commands/hathi_excerpt.py | 7 +++---- ppa/archive/tests/test_models.py | 3 ++- ppa/archive/tests/test_views.py | 9 ++++++--- 3 files changed, 11 insertions(+), 8 deletions(-) diff --git a/ppa/archive/management/commands/hathi_excerpt.py b/ppa/archive/management/commands/hathi_excerpt.py index 38aa9585..8b7bec24 100644 --- a/ppa/archive/management/commands/hathi_excerpt.py +++ b/ppa/archive/management/commands/hathi_excerpt.py @@ -83,10 +83,9 @@ def handle(self, *args, **kwargs): self.excerpt(row) self.stdout.write( - "\nExcerpted {excerpted:,d} existing records; " - + "created {created:,d} new excerpts. {error:,d} errors.".format_map( - self.stats - ) + f"\nExcerpted {self.stats['excerpted']:,d} existing records; " + + f"created {self.stats['created']:,d} new excerpts. " + + f"{self.stats['error']:,d} errors." ) def load_collections(self): diff --git a/ppa/archive/tests/test_models.py b/ppa/archive/tests/test_models.py index cdc57fde..42abb1b0 100644 --- a/ppa/archive/tests/test_models.py +++ b/ppa/archive/tests/test_models.py @@ -689,7 +689,8 @@ def test_save_suppress(self): work.save() mock_hathiobj.delete_pairtree_data.assert_not_called() - def test_save_suppress_excerpt(self): + @patch("ppa.archive.models.DigitizedWork.index_items") + def test_save_suppress_excerpt(self, mock_index_items): work = DigitizedWork(source_id="chi.79279237", item_type=DigitizedWork.EXCERPT) with patch.object(work, "hathi") as mock_hathiobj: # no change in status - nothing should happen diff --git a/ppa/archive/tests/test_views.py b/ppa/archive/tests/test_views.py index 18683a59..afc4d41c 100644 --- a/ppa/archive/tests/test_views.py +++ b/ppa/archive/tests/test_views.py @@ -150,7 +150,8 @@ def test_anonymous_display_no_volume(self): msg_prefix="Volume metadata should not display if no enumcron", ) - def test_anonymous_display_excerpt_hathi(self): + @patch("ppa.archive.models.DigitizedWork.index_items") + def test_anonymous_display_excerpt_hathi(self, mock_index_items): # create an excerpt excerpt = DigitizedWork.objects.create( source_id="abc.1234", @@ -192,7 +193,8 @@ def test_anonymous_display_excerpt_gale(self, mock_index_items): ), ) - def test_anonymous_display_article_hathi(self): + @patch("ppa.archive.models.DigitizedWork.index_items") + def test_anonymous_display_article_hathi(self, mock_index_items): # create an article article = DigitizedWork.objects.create( source_id="abc.1234", @@ -426,7 +428,8 @@ def test_search_within_ajax(self): # should have pagination self.assertContains(response, '
Date: Mon, 25 Mar 2024 11:11:39 -0400 Subject: [PATCH 30/71] Include protected_fields in read-only fields copied when saving as new ref #591 --- ppa/archive/admin.py | 1 + 1 file changed, 1 insertion(+) diff --git a/ppa/archive/admin.py b/ppa/archive/admin.py index e7931162..c8a34104 100644 --- a/ppa/archive/admin.py +++ b/ppa/archive/admin.py @@ -192,6 +192,7 @@ def change_view(self, request, object_id, form_url="", extra_context=None): post_params["source_id"] = instance.source_id post_params["source_url"] = instance.source_url post_params["record_id"] = instance.record_id + post_params["protected_fields"] = instance.protected_fields # clear out fields that should be changed when excerpting clear_fields = [ "title", From f586df1c2309fc4ca8c637ce7d3289b140c21789 Mon Sep 17 00:00:00 2001 From: rlskoeser Date: Mon, 25 Mar 2024 13:15:05 -0400 Subject: [PATCH 31/71] Make protected_fields editable so we can copy when using save as new ref #591 --- ppa/archive/admin.py | 13 ++++++++++++- ppa/archive/tests/test_admin.py | 8 +++++++- 2 files changed, 19 insertions(+), 2 deletions(-) diff --git a/ppa/archive/admin.py b/ppa/archive/admin.py index c8a34104..a8670cec 100644 --- a/ppa/archive/admin.py +++ b/ppa/archive/admin.py @@ -158,6 +158,13 @@ def get_readonly_fields(self, request, obj=None): """ if obj and obj.source == DigitizedWork.HATHI: return self.hathi_readonly_fields + self.readonly_fields + + print(request.POST) + if request.POST.get("_saveasnew"): + # protected fields must not be read-only in order + # to preserve/copy when saving as new + return ("added", "updated") + return self.readonly_fields def list_collections(self, obj): @@ -192,7 +199,11 @@ def change_view(self, request, object_id, form_url="", extra_context=None): post_params["source_id"] = instance.source_id post_params["source_url"] = instance.source_url post_params["record_id"] = instance.record_id - post_params["protected_fields"] = instance.protected_fields + # copy protected wield flags in simple string format + post_params[ + "protected_fields" + ] = instance.protected_fields.to_simple_str() + # clear out fields that should be changed when excerpting clear_fields = [ "title", diff --git a/ppa/archive/tests/test_admin.py b/ppa/archive/tests/test_admin.py index 860a4a80..d3d6c067 100644 --- a/ppa/archive/tests/test_admin.py +++ b/ppa/archive/tests/test_admin.py @@ -56,7 +56,13 @@ def test_readonly_fields(self): site = AdminSite() digadmin = DigitizedWorkAdmin(DigitizedWork, site) - assert digadmin.get_readonly_fields(Mock()) == digadmin.readonly_fields + assert digadmin.get_readonly_fields(Mock(POST={})) == digadmin.readonly_fields + + # when using 'save as new', protected fields should not be read only + assert digadmin.get_readonly_fields(Mock(POST={"_saveasnew": 1})) == ( + "added", + "updated", + ) # hathi record hathi_work = DigitizedWork.objects.first() From 07fe596b86c8adadd58555b0d456565807e8a652 Mon Sep 17 00:00:00 2001 From: rlskoeser Date: Mon, 25 Mar 2024 15:00:26 -0400 Subject: [PATCH 32/71] Make protected_fields not required when editing in admin --- ...21_alter_digitizedwork_protected_fields.py | 22 +++++++++++++++++++ ppa/archive/models.py | 7 ++++-- 2 files changed, 27 insertions(+), 2 deletions(-) create mode 100644 ppa/archive/migrations/0021_alter_digitizedwork_protected_fields.py diff --git a/ppa/archive/migrations/0021_alter_digitizedwork_protected_fields.py b/ppa/archive/migrations/0021_alter_digitizedwork_protected_fields.py new file mode 100644 index 00000000..887f3aed --- /dev/null +++ 
b/ppa/archive/migrations/0021_alter_digitizedwork_protected_fields.py @@ -0,0 +1,22 @@ +# Generated by Django 5.0.2 on 2024-03-25 18:52 + +import ppa.archive.models +from django.db import migrations + + +class Migration(migrations.Migration): + dependencies = [ + ("archive", "0020_digitizedwork_page_count_help_text"), + ] + + operations = [ + migrations.AlterField( + model_name="digitizedwork", + name="protected_fields", + field=ppa.archive.models.ProtectedWorkField( + blank=True, + default=ppa.archive.models.ProtectedWorkFieldFlags, + help_text="Fields protected from HathiTrust bulk update because they have been manually edited in the Django admin.", + ), + ), + ] diff --git a/ppa/archive/models.py b/ppa/archive/models.py index f8c87939..4bcbc0bd 100644 --- a/ppa/archive/models.py +++ b/ppa/archive/models.py @@ -184,8 +184,10 @@ class ProtectedWorkField(models.Field): ) def __init__(self, verbose_name=None, name=None, **kwargs): - """Make the field unnullable and not allowed to be blank.""" - super().__init__(verbose_name, name, blank=False, null=False, **kwargs) + """Make the field unnullable; by default, not allowed to be blank.""" + if "blank" not in kwargs: + kwargs["blank"] = False + super().__init__(verbose_name, name, null=False, **kwargs) def from_db_value(self, value, expression, connection): """Always return an instance of :class:`ProtectedWorkFieldFlags`""" @@ -403,6 +405,7 @@ class DigitizedWork(ModelIndexable, TrackChangesModel): #: modified in Django admin. protected_fields = ProtectedWorkField( default=ProtectedWorkFieldFlags, + blank=True, # required for save as new, where we make editable to copy help_text="Fields protected from HathiTrust bulk " "update because they have been manually edited in the " "Django admin.", From 36f6065bf12cf6b586cd2030a04fcac68e4405e8 Mon Sep 17 00:00:00 2001 From: rlskoeser Date: Mon, 25 Mar 2024 16:36:40 -0400 Subject: [PATCH 33/71] Handle empty string when converting value to protected field flag --- ppa/archive/models.py | 2 ++ ppa/archive/tests/test_models.py | 8 ++++++++ 2 files changed, 10 insertions(+) diff --git a/ppa/archive/models.py b/ppa/archive/models.py index 4bcbc0bd..1a9cead6 100644 --- a/ppa/archive/models.py +++ b/ppa/archive/models.py @@ -198,6 +198,8 @@ def get_internal_type(self): return "PositiveSmallIntegerField" def get_prep_value(self, value): + if value == "": + return 0 return int(value) def to_python(self, value): diff --git a/ppa/archive/tests/test_models.py b/ppa/archive/tests/test_models.py index 42abb1b0..e21e688a 100644 --- a/ppa/archive/tests/test_models.py +++ b/ppa/archive/tests/test_models.py @@ -28,6 +28,7 @@ DigitizedWork, Page, ProtectedWorkFieldFlags, + ProtectedWorkField, ) FIXTURES_PATH = os.path.join(settings.BASE_DIR, "ppa", "archive", "fixtures") @@ -49,6 +50,13 @@ def test_str(self): assert str(fields) == "enumcron, sort_title, title" +class TestProtectedWorkField(TestCase): + def test_get_prep_value(self): + assert ProtectedWorkField().get_prep_value("1") == 1 + # handle empty string + assert ProtectedWorkField().get_prep_value("") == 0 + + @pytest.mark.django_db class TestSignalHandlers: @patch.object(ModelIndexable, "index_items") From 9954fcc9f5b3c7e330325e10f0ec0816bfc3bdf2 Mon Sep 17 00:00:00 2001 From: rlskoeser Date: Mon, 25 Mar 2024 16:36:40 -0400 Subject: [PATCH 34/71] Remove debug print statement in admin view --- ppa/archive/admin.py | 1 - 1 file changed, 1 deletion(-) diff --git a/ppa/archive/admin.py b/ppa/archive/admin.py index a8670cec..e9266364 100644 --- a/ppa/archive/admin.py 
+++ b/ppa/archive/admin.py @@ -159,7 +159,6 @@ def get_readonly_fields(self, request, obj=None): if obj and obj.source == DigitizedWork.HATHI: return self.hathi_readonly_fields + self.readonly_fields - print(request.POST) if request.POST.get("_saveasnew"): # protected fields must not be read-only in order # to preserve/copy when saving as new From c07effcd57315db4ce6072239f69b83e7dbd5da7 Mon Sep 17 00:00:00 2001 From: rlskoeser Date: Thu, 28 Mar 2024 12:32:04 -0400 Subject: [PATCH 35/71] Warn on missing page count; improve output for negative work/page diff resolves #596 --- .../management/commands/index_pages.py | 37 +++++++++++++++++-- 1 file changed, 34 insertions(+), 3 deletions(-) diff --git a/ppa/archive/management/commands/index_pages.py b/ppa/archive/management/commands/index_pages.py index a5c49a77..7682c6d2 100644 --- a/ppa/archive/management/commands/index_pages.py +++ b/ppa/archive/management/commands/index_pages.py @@ -117,9 +117,27 @@ def handle(self, *args, **kwargs): if self.verbosity >= self.v_normal: if work_diff: - self.stdout.write(f"{work_diff:,} works not indexed in Solr") + # negative = more works in solr than database + if work_diff < 0: + self.stdout.write( + self.style.WARNING( + f"{abs(work_diff):,} extra works indexed in Solr; " + + " may need to clear old data" + ) + ) + else: + self.stdout.write(f"{work_diff:,} works not indexed in Solr") if page_diff: - self.stdout.write(f"{page_diff:,} pages not indexed in Solr") + # negative = more pages in solr than expected + if work_diff < 0: + self.stdout.write( + self.style.WARNING( + f"{abs(page_diff):,} more pages indexed in Solr than expected" + ) + ) + + else: + self.stdout.write(f"{page_diff:,} pages not indexed in Solr") if kwargs.get("expedite"): # find works with missing pages @@ -133,7 +151,20 @@ def handle(self, *args, **kwargs): pages_per_work = facets.facet_fields["group_id"] for digwork in DigitizedWork.items_to_index(): solr_page_count = pages_per_work.get(digwork.index_id(), 0) - if digwork.page_count != solr_page_count: + # it indicates an error, but page count could be null; + # if so, assume page count mismatch + if digwork.page_count is None: + # add to list of works to index + mismatches.append(digwork) + # warn about the missing page count + if self.verbosity >= self.v_normal: + self.stdout.write( + self.style.WARNING( + f"Warning: {digwork} page count is not set in database" + ) + ) + + elif digwork.page_count != solr_page_count: # add to list of works to index mismatches.append(digwork) From 203c682812300a0d5a3b9f9dc648be6fa5b25e5a Mon Sep 17 00:00:00 2001 From: rlskoeser Date: Thu, 28 Mar 2024 14:15:14 -0400 Subject: [PATCH 36/71] New manage command to update excerpt digital page range adapted from hathi_excerpt manage command resolves #625 --- .../management/commands/adjust_excerpts.py | 136 ++++++++++++++++++ 1 file changed, 136 insertions(+) create mode 100644 ppa/archive/management/commands/adjust_excerpts.py diff --git a/ppa/archive/management/commands/adjust_excerpts.py b/ppa/archive/management/commands/adjust_excerpts.py new file mode 100644 index 00000000..5f99b9de --- /dev/null +++ b/ppa/archive/management/commands/adjust_excerpts.py @@ -0,0 +1,136 @@ +""" +**adjust_excerpts** is a custom manage command to update +the digital page range for excerpts or articles. It requires a CSV file +with source id and original page range (to identify the correct record), +and the new digital page range. 
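For illustration, a minimal input file might look like the following (the file name and values are examples only, mirroring the unit tests added later in this patch series):

    source_id,pages_orig,new_pages_digital
    abc.13245089,10-20,15-25

and would be applied with:

    python manage.py adjust_excerpts excerpt_updates.csv
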
+ +The CSV must include: + * source_id + * pages_orig + * new_pages_digital + +Updated records are automatically indexed in Solr. +""" + +import csv +import logging + +import intspan +from django.conf import settings +from django.contrib.admin.models import CHANGE, LogEntry +from django.contrib.auth.models import User +from django.contrib.contenttypes.models import ContentType +from django.core.management.base import BaseCommand, CommandError + +from ppa.archive.models import DigitizedWork + +logger = logging.getLogger(__name__) + + +class Command(BaseCommand): + """Update digital page range for excerpted works.""" + + help = __doc__ + + #: normal verbosity level + v_normal = 1 + verbosity = v_normal + + def add_arguments(self, parser): + parser.add_argument("csv", help="CSV file with updated page ranges") + + def handle(self, *args, **kwargs): + self.verbosity = kwargs.get("verbosity", self.verbosity) + + # load csv file and check required fields + excerpt_info = self.load_csv(kwargs["csv"]) + + self.stats = {"error": 0, "notfound": 0, "updated": 0} + + # get script user and digwork content type for creating log entries + self.script_user = User.objects.get(username=settings.SCRIPT_USERNAME) + self.digwork_contentype = ContentType.objects.get_for_model(DigitizedWork) + + for row in excerpt_info: + self.update_excerpt(row) + + self.stdout.write( + f"\nUpdated {self.stats['updated']:,d} records. " + + f"{self.stats['notfound']:,d} not found, " + + f"{self.stats['error']:,d} error{'s' if self.stats['error'] != 1 else ''}." + ) + + def update_excerpt(self, row): + """Process a row of the spreadsheet, find an existing excerpt + by source id and original page range, and update the digital + pages.""" + + # lookup by source id and original page range + digwork = DigitizedWork.objects.filter( + source_id=row["source_id"], pages_orig=row["pages_orig"] + ).first() + if not digwork: + self.stdout.write( + self.style.WARNING( + "No record found for source id %(source_id)s and pages_orig %(pages_orig)s" + % row + ) + ) + self.stats["notfound"] += 1 + return + + # update digital page range + digwork.pages_digital = row["new_pages_digital"] + # if this is not a change, do nothing + if not digwork.has_changed("pages_digital"): + return + + try: + # save in the database; + # should automatically recalculate page range and index page content + digwork.save() + self.stats["updated"] += 1 + except intspan.ParseError as err: + self.stderr.write( + self.style.WARNING("Error saving %s: %s" % (digwork, err)) + ) + self.stats["error"] += 1 + return + + # if changed and save succeeded, log the update + self.log_update(digwork) + + def log_update(self, digwork): + """Create a log entry to document digital page range change.""" + + # create log entry to record what was done + LogEntry.objects.log_action( + user_id=self.script_user.pk, + content_type_id=self.digwork_contentype.pk, + object_id=digwork.pk, + object_repr=str(digwork), + change_message="Updated pages_digital", + action_flag=CHANGE, + ) + + csv_required_fields = ["source_id", "pages_orig", "new_pages_digital"] + + def load_csv(self, path): + """Load a CSV file with information about excerpts to be updated.""" + try: + with open(path, encoding="utf-8-sig") as csvfile: + csvreader = csv.DictReader(csvfile) + data = [ + row for row in csvreader if any(row.values()) + ] # skip blank rows + except FileNotFoundError: + raise CommandError("Error loading the specified CSV file: %s" % path) + + csv_keys = set(data[0].keys()) + csv_key_diff = 
set(self.csv_required_fields).difference(csv_keys) + # if any required fields are not present, error and quit + if csv_key_diff: + raise CommandError( + "Missing required fields in CSV file: %s" % ", ".join(csv_key_diff) + ) + return data From 0379a08108f0f07faaa60d2a72286302fb76c7ec Mon Sep 17 00:00:00 2001 From: rlskoeser Date: Thu, 28 Mar 2024 14:57:26 -0400 Subject: [PATCH 37/71] Update adjust_excerpts to avoid duplication, add unit tests - extend hathi_excerpt management command to avoid duplicating load_csv --- .../management/commands/adjust_excerpts.py | 55 +++----- ppa/archive/tests/test_adjust_excerpts.py | 124 ++++++++++++++++++ 2 files changed, 145 insertions(+), 34 deletions(-) create mode 100644 ppa/archive/tests/test_adjust_excerpts.py diff --git a/ppa/archive/management/commands/adjust_excerpts.py b/ppa/archive/management/commands/adjust_excerpts.py index 5f99b9de..20373c8d 100644 --- a/ppa/archive/management/commands/adjust_excerpts.py +++ b/ppa/archive/management/commands/adjust_excerpts.py @@ -12,7 +12,6 @@ Updated records are automatically indexed in Solr. """ -import csv import logging import intspan @@ -20,44 +19,53 @@ from django.contrib.admin.models import CHANGE, LogEntry from django.contrib.auth.models import User from django.contrib.contenttypes.models import ContentType -from django.core.management.base import BaseCommand, CommandError +from django.template.defaultfilters import pluralize from ppa.archive.models import DigitizedWork +from ppa.archive.management.commands import hathi_excerpt logger = logging.getLogger(__name__) -class Command(BaseCommand): +class Command(hathi_excerpt.Command): """Update digital page range for excerpted works.""" help = __doc__ + # inherits csv loading & validation from hathi_excerpt command #: normal verbosity level v_normal = 1 verbosity = v_normal + #: override required fields + csv_required_fields = ["source_id", "pages_orig", "new_pages_digital"] def add_arguments(self, parser): parser.add_argument("csv", help="CSV file with updated page ranges") + def setup(self): + "common setup steps for running the script or testing" + + self.stats = {"error": 0, "notfound": 0, "updated": 0, "unchanged": 0} + self.script_user = User.objects.get(username=settings.SCRIPT_USERNAME) + self.digwork_contentype = ContentType.objects.get_for_model(DigitizedWork) + def handle(self, *args, **kwargs): self.verbosity = kwargs.get("verbosity", self.verbosity) # load csv file and check required fields excerpt_info = self.load_csv(kwargs["csv"]) - - self.stats = {"error": 0, "notfound": 0, "updated": 0} - - # get script user and digwork content type for creating log entries - self.script_user = User.objects.get(username=settings.SCRIPT_USERNAME) - self.digwork_contentype = ContentType.objects.get_for_model(DigitizedWork) + self.setup() for row in excerpt_info: self.update_excerpt(row) + # summarize what was done self.stdout.write( - f"\nUpdated {self.stats['updated']:,d} records. " + f"\nUpdated {self.stats['updated']:,d} " + + f"record{pluralize(self.stats['updated'])}. " + + f"{self.stats['unchanged']:,d} unchanged, " + f"{self.stats['notfound']:,d} not found, " - + f"{self.stats['error']:,d} error{'s' if self.stats['error'] != 1 else ''}." + + f"{self.stats['error']:,d} error{pluralize(self.stats['error'])}." 
) def update_excerpt(self, row): @@ -83,6 +91,7 @@ def update_excerpt(self, row): digwork.pages_digital = row["new_pages_digital"] # if this is not a change, do nothing if not digwork.has_changed("pages_digital"): + self.stats["unchanged"] += 1 return try: @@ -108,29 +117,7 @@ def log_update(self, digwork): user_id=self.script_user.pk, content_type_id=self.digwork_contentype.pk, object_id=digwork.pk, - object_repr=str(digwork), + object_repr=repr(digwork), change_message="Updated pages_digital", action_flag=CHANGE, ) - - csv_required_fields = ["source_id", "pages_orig", "new_pages_digital"] - - def load_csv(self, path): - """Load a CSV file with information about excerpts to be updated.""" - try: - with open(path, encoding="utf-8-sig") as csvfile: - csvreader = csv.DictReader(csvfile) - data = [ - row for row in csvreader if any(row.values()) - ] # skip blank rows - except FileNotFoundError: - raise CommandError("Error loading the specified CSV file: %s" % path) - - csv_keys = set(data[0].keys()) - csv_key_diff = set(self.csv_required_fields).difference(csv_keys) - # if any required fields are not present, error and quit - if csv_key_diff: - raise CommandError( - "Missing required fields in CSV file: %s" % ", ".join(csv_key_diff) - ) - return data diff --git a/ppa/archive/tests/test_adjust_excerpts.py b/ppa/archive/tests/test_adjust_excerpts.py new file mode 100644 index 00000000..6e11145a --- /dev/null +++ b/ppa/archive/tests/test_adjust_excerpts.py @@ -0,0 +1,124 @@ +from io import StringIO +from unittest.mock import patch + +import pytest +from django.contrib.admin.models import CHANGE, LogEntry +from django.core.management import call_command + +from ppa.archive.models import DigitizedWork +from ppa.archive.management.commands import adjust_excerpts + + +@pytest.mark.django_db +class TestAdjustExcerptsCommand: + @patch("ppa.archive.models.DigitizedWork.index_items") + def test_update_success(self, mock_index_items): + source_id = "abc.13245089" + pages_orig = "10-20" + pages_digital = "12-22" + work = DigitizedWork.objects.create( + source_id=source_id, pages_orig=pages_orig, pages_digital=pages_digital + ) + + cmd = adjust_excerpts.Command() + cmd.setup() # initialize stats dict + + # test with sample info coming from csv + update_info = { + "source_id": source_id, + "pages_orig": pages_orig, + "new_pages_digital": "15-25", + } + cmd.update_excerpt(update_info) + assert cmd.stats["updated"] == 1 + # inspect the newly-excerpted work; get a fresh copy from the db + excerpt = DigitizedWork.objects.get(pk=work.pk) + assert excerpt.pages_digital == update_info["new_pages_digital"] + + # check that log entry was created to document the change + log = LogEntry.objects.get(object_id=excerpt.pk) + assert log.action_flag == CHANGE + assert log.change_message == "Updated pages_digital" + assert log.user.username == "script" + + def test_not_found(self, capsys): + cmd = adjust_excerpts.Command() + cmd.setup() # initialize stats dict + + # test with sample info, no corresponding db record + update_info = { + "source_id": "abcs.123", + "pages_orig": "i-iii", + "new_pages_digital": "15-25", + } + cmd.update_excerpt(update_info) + assert cmd.stats["notfound"] == 1 + captured = capsys.readouterr() + assert "No record found" in captured.out + + def test_error(self, capsys): + source_id = "abc.13245089" + pages_orig = "10-20" + pages_digital = "12-22" + DigitizedWork.objects.create( + source_id=source_id, pages_orig=pages_orig, pages_digital=pages_digital + ) + + cmd = adjust_excerpts.Command() + 
cmd.setup() + # test with sample info coming from csv + update_info = { + "source_id": source_id, + "pages_orig": pages_orig, + "new_pages_digital": "BOGUS", + } + cmd.update_excerpt(update_info) + assert cmd.stats["error"] == 1 + # check captured output + captured = capsys.readouterr() + assert f"Error saving {source_id}" in captured.err + + def test_unchanged(self): + source_id = "abc.13245089" + pages_orig = "10-20" + pages_digital = "12-22" + DigitizedWork.objects.create( + source_id=source_id, pages_orig=pages_orig, pages_digital=pages_digital + ) + cmd = adjust_excerpts.Command() + cmd.setup() + + # test with sample info coming from csv + update_info = { + "source_id": source_id, + "pages_orig": pages_orig, + "new_pages_digital": pages_digital, + } + cmd.update_excerpt(update_info) + assert cmd.stats["unchanged"] == 1 + + @patch("ppa.archive.models.DigitizedWork.index_items") + def test_call_commmand(self, mock_index_items, tmp_path): + source_id = "abc.13245089" + pages_orig = "10-20" + pages_digital = "12-22" + DigitizedWork.objects.create( + source_id=source_id, pages_orig=pages_orig, pages_digital=pages_digital + ) + stdout = StringIO() + # create minimal valid CSV with all required fields + csvfile = tmp_path / "excerpt_updates.csv" + csvfile.write_text( + "\n".join( + [ + "source_id,pages_orig,new_pages_digital", + f"{source_id},{pages_orig},25-30", + ] + ) + ) + call_command("adjust_excerpts", csvfile, stdout=stdout) + output = stdout.getvalue() + assert "Updated 1 record." in output + assert "0 errors" in output + assert "0 not found" in output + assert "0 unchanged" in output From d296e0a1d9eeec71585e27c4235dc773fe0f5c46 Mon Sep 17 00:00:00 2001 From: rlskoeser Date: Thu, 28 Mar 2024 15:05:24 -0400 Subject: [PATCH 38/71] Update tests so they do not error without hathi data setting --- ppa/archive/tests/test_adjust_excerpts.py | 15 ++++++++++++--- 1 file changed, 12 insertions(+), 3 deletions(-) diff --git a/ppa/archive/tests/test_adjust_excerpts.py b/ppa/archive/tests/test_adjust_excerpts.py index 6e11145a..d7963e04 100644 --- a/ppa/archive/tests/test_adjust_excerpts.py +++ b/ppa/archive/tests/test_adjust_excerpts.py @@ -17,7 +17,10 @@ def test_update_success(self, mock_index_items): pages_orig = "10-20" pages_digital = "12-22" work = DigitizedWork.objects.create( - source_id=source_id, pages_orig=pages_orig, pages_digital=pages_digital + source_id=source_id, + pages_orig=pages_orig, + pages_digital=pages_digital, + source=DigitizedWork.OTHER, ) cmd = adjust_excerpts.Command() @@ -61,7 +64,10 @@ def test_error(self, capsys): pages_orig = "10-20" pages_digital = "12-22" DigitizedWork.objects.create( - source_id=source_id, pages_orig=pages_orig, pages_digital=pages_digital + source_id=source_id, + pages_orig=pages_orig, + pages_digital=pages_digital, + source=DigitizedWork.OTHER, ) cmd = adjust_excerpts.Command() @@ -83,7 +89,10 @@ def test_unchanged(self): pages_orig = "10-20" pages_digital = "12-22" DigitizedWork.objects.create( - source_id=source_id, pages_orig=pages_orig, pages_digital=pages_digital + source_id=source_id, + pages_orig=pages_orig, + pages_digital=pages_digital, + source=DigitizedWork.OTHER, ) cmd = adjust_excerpts.Command() cmd.setup() From 401f44c6798bfa922c4d984d5a4729d67a8711f8 Mon Sep 17 00:00:00 2001 From: rlskoeser Date: Thu, 28 Mar 2024 15:41:16 -0400 Subject: [PATCH 39/71] Check the correct variable for negative page count difference --- ppa/archive/management/commands/index_pages.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git 
a/ppa/archive/management/commands/index_pages.py b/ppa/archive/management/commands/index_pages.py index 7682c6d2..eff9a1a7 100644 --- a/ppa/archive/management/commands/index_pages.py +++ b/ppa/archive/management/commands/index_pages.py @@ -129,7 +129,7 @@ def handle(self, *args, **kwargs): self.stdout.write(f"{work_diff:,} works not indexed in Solr") if page_diff: # negative = more pages in solr than expected - if work_diff < 0: + if page_diff < 0: self.stdout.write( self.style.WARNING( f"{abs(page_diff):,} more pages indexed in Solr than expected" From ae124a01f6b6b1ce0b26ed70371f9907832e8d71 Mon Sep 17 00:00:00 2001 From: rlskoeser Date: Thu, 4 Apr 2024 09:20:24 -0400 Subject: [PATCH 40/71] Add old work id field to DigitizedWork --- .../0022_digitizedwork_old_workid.py | 22 +++++++++++++++++++ ppa/archive/models.py | 7 ++++++ 2 files changed, 29 insertions(+) create mode 100644 ppa/archive/migrations/0022_digitizedwork_old_workid.py diff --git a/ppa/archive/migrations/0022_digitizedwork_old_workid.py b/ppa/archive/migrations/0022_digitizedwork_old_workid.py new file mode 100644 index 00000000..5673147e --- /dev/null +++ b/ppa/archive/migrations/0022_digitizedwork_old_workid.py @@ -0,0 +1,22 @@ +# Generated by Django 5.0.2 on 2024-04-04 13:20 + +from django.db import migrations, models + + +class Migration(migrations.Migration): + dependencies = [ + ("archive", "0021_alter_digitizedwork_protected_fields"), + ] + + operations = [ + migrations.AddField( + model_name="digitizedwork", + name="old_workid", + field=models.CharField( + blank=True, + help_text="past work id; used for excerpts previously identified by start of digital page range", + max_length=255, + verbose_name="Old Work ID", + ), + ), + ] diff --git a/ppa/archive/models.py b/ppa/archive/models.py index 1a9cead6..6c526641 100644 --- a/ppa/archive/models.py +++ b/ppa/archive/models.py @@ -480,6 +480,13 @@ class DigitizedWork(ModelIndexable, TrackChangesModel): blank=True, validators=[validate_page_range], ) + old_workid = models.CharField( + "Old Work ID", + max_length=255, + help_text="past work id; used for excerpts previously " + + "identified by start of digital page range", + blank=True, + ) class Meta: ordering = ("sort_title",) From 94d22e0c9809885594f0591303e043fe240d0708 Mon Sep 17 00:00:00 2001 From: rlskoeser Date: Thu, 4 Apr 2024 10:01:34 -0400 Subject: [PATCH 41/71] Data migration to populate old work id for excerpt + first digital page --- .../0023_save_excerpt_old_workid.py | 31 +++++++++++++++++++ 1 file changed, 31 insertions(+) create mode 100644 ppa/archive/migrations/0023_save_excerpt_old_workid.py diff --git a/ppa/archive/migrations/0023_save_excerpt_old_workid.py b/ppa/archive/migrations/0023_save_excerpt_old_workid.py new file mode 100644 index 00000000..4a568884 --- /dev/null +++ b/ppa/archive/migrations/0023_save_excerpt_old_workid.py @@ -0,0 +1,31 @@ +# Generated by Django 5.0.2 on 2024-04-04 13:26 + +from django.db import migrations + +from intspan import intspan + + +def populate_excerpt_old_workid(apps, schema_editor): + DigitizedWork = apps.get_model("archive", "DigitizedWork") + # find all works with a digital page range + for digwork in DigitizedWork.objects.exclude(pages_digital=""): + # use logic similar to model code to parse the page range + # and get the number of the first page + first_digital_page = list(intspan(digwork.pages_digital))[0] + # should not be possible to save a record with a page range + # that can't be parsed by intspan + # previously, excerpt id was source_id-pN where N is 
first digital page + digwork.old_workid = f"{digwork.source_id}-p{first_digital_page}" + digwork.save() + + +class Migration(migrations.Migration): + dependencies = [ + ("archive", "0022_digitizedwork_old_workid"), + ] + + operations = [ + migrations.RunPython( + code=populate_excerpt_old_workid, reverse_code=migrations.RunPython.noop + ) + ] From df468e720d830b287244f2e13ad8208d2aae14b8 Mon Sep 17 00:00:00 2001 From: rlskoeser Date: Thu, 4 Apr 2024 10:47:43 -0400 Subject: [PATCH 42/71] Update digitized work url and view to use original start page previously used first page in digital pages, but that is not stable --- ppa/archive/models.py | 11 +++++------ ppa/archive/tests/test_models.py | 24 ++++++++++++++++++++++-- ppa/archive/tests/test_views.py | 16 +++++++++++++--- ppa/archive/urls.py | 3 ++- ppa/archive/views.py | 4 ++-- 5 files changed, 44 insertions(+), 14 deletions(-) diff --git a/ppa/archive/models.py b/ppa/archive/models.py index 6c526641..e36c7912 100644 --- a/ppa/archive/models.py +++ b/ppa/archive/models.py @@ -505,7 +505,7 @@ def get_absolute_url(self): """ url_opts = {"source_id": self.source_id} # start page must be specified if set but must not be included if empty - if self.pages_digital: + if self.pages_orig: url_opts["start_page"] = self.first_page() return reverse("archive:detail", kwargs=url_opts) @@ -846,10 +846,9 @@ def populate_from_bibdata(self, bibdata): } def first_page(self): - """Number of the first page in range, if this is an excerpt""" - # return digital page for now; may be switching to original - # or this method may be going away - return self.first_page_digital() + """Number of the first page in range, if this is an excerpt + (first of original page range, not digital)""" + return self.first_page_original() def first_page_digital(self): """Number of the first page in range (digital pages / page index), @@ -877,7 +876,7 @@ def index_id(self): """use source id + first page in range (if any) as solr identifier""" first_page = self.first_page() if first_page: - return "%s-p%d" % (self.source_id, first_page) + return "%s-p%s" % (self.source_id, first_page) return self.source_id @classmethod diff --git a/ppa/archive/tests/test_models.py b/ppa/archive/tests/test_models.py index e21e688a..d8bc764b 100644 --- a/ppa/archive/tests/test_models.py +++ b/ppa/archive/tests/test_models.py @@ -552,16 +552,25 @@ def test_index_data(self): assert index_data["id"] == digwork.source_id def test_get_absolute_url(self): - work = DigitizedWork.objects.first() + work = DigitizedWork.objects.filter(pages_orig="").first() + print(work) assert work.get_absolute_url() == reverse( "archive:detail", kwargs={"source_id": work.source_id} ) - work.pages_digital = "11-13" + work.pages_orig = "11-13" + print(work) + print(work.first_page()) + print(work.first_page_original()) assert work.get_absolute_url() == reverse( "archive:detail", kwargs={"source_id": work.source_id, "start_page": 11} ) + work.pages_orig = "iii-xi" + assert work.get_absolute_url() == reverse( + "archive:detail", kwargs={"source_id": work.source_id, "start_page": "iii"} + ) + @patch("ppa.archive.models.HathiBibliographicAPI") def test_get_metadata_hathi(self, mock_hathibib): work = DigitizedWork(source_id="ht:1234") @@ -672,6 +681,17 @@ def test_index_id(self): work = DigitizedWork(source_id="chi.79279237") assert work.index_id() == work.source_id + # for excerpts, index id includes first page from original page range + excerpt = DigitizedWork( + source_id="chi.89279238", pages_orig="3-5", pages_digital="5-7" + ) + 
assert excerpt.index_id() == f"{excerpt.source_id}-p3" + + excerpt = DigitizedWork( + source_id="abc.123459238", pages_orig="ii-iv", pages_digital="3-4" + ) + assert excerpt.index_id() == f"{excerpt.source_id}-pii" + def test_save_suppress(self): work = DigitizedWork(source_id="chi.79279237") with patch.object(work, "hathi") as mock_hathiobj: diff --git a/ppa/archive/tests/test_views.py b/ppa/archive/tests/test_views.py index afc4d41c..8704dd81 100644 --- a/ppa/archive/tests/test_views.py +++ b/ppa/archive/tests/test_views.py @@ -438,7 +438,7 @@ def test_get_queryset(self, mock_index_items): assert self.client.get(bogus_dial_excerpt_url).status_code == 404 # create and retrieve an excerpt; should return 200 ok with correct object dial_excerpt = DigitizedWork.objects.create( - source_id=self.dial.source_id, pages_digital="200-250" + source_id=self.dial.source_id, pages_orig="200-250", pages_digital="202-251" ) response = self.client.get(dial_excerpt.get_absolute_url()) assert response.status_code == 200 @@ -449,9 +449,17 @@ def test_get_queryset(self, mock_index_items): assert response.status_code == 200 assert response.context["object"] == self.dial + # confirm first page regex filter works propertly + dial_excerpt2 = DigitizedWork.objects.create( + source_id=self.dial.source_id, pages_orig="20-25", pages_digital="22-27" + ) + response = self.client.get(dial_excerpt2.get_absolute_url()) + # start page 20 should match 20 only and not 200 + assert response.context["object"] == dial_excerpt2 + # create excerpt where there is no existing work excerpt = DigitizedWork.objects.create( - source_id="abc.123456", pages_digital="10-20" + source_id="abc.123456", pages_orig="10-20", pages_digital="12-22" ) response = self.client.get(excerpt.get_absolute_url()) # retrieve url for source id with no start apge @@ -464,7 +472,9 @@ def test_get_queryset(self, mock_index_items): assert response["Location"] == excerpt.get_absolute_url() # if there are *TWO* excerpts for the same source, should 404 instead of redirecting - DigitizedWork.objects.create(source_id="abc.123456", pages_digital="30-45") + DigitizedWork.objects.create( + source_id="abc.123456", pages_orig="30-45", pages_digital="32-47" + ) assert self.client.get(nonexistent_source_url).status_code == 404 diff --git a/ppa/archive/urls.py b/ppa/archive/urls.py index 9b722be8..e3e6239c 100644 --- a/ppa/archive/urls.py +++ b/ppa/archive/urls.py @@ -21,8 +21,9 @@ views.DigitizedWorkByRecordId.as_view(), name="record-id", ), + # excerpt original page may be numeric or alpha (e.g., roman numerals) re_path( - r"^(?P[^-]+)-p(?P\d+)/", + r"^(?P[^-]+)-p(?P[\da-zA-Z]+)/", views.DigitizedWorkDetailView.as_view(), name="detail", ), diff --git a/ppa/archive/views.py b/ppa/archive/views.py index 4bbf965a..3cfff097 100644 --- a/ppa/archive/views.py +++ b/ppa/archive/views.py @@ -299,10 +299,10 @@ def get_queryset(self): start_page = self.kwargs.get("start_page") # if start page is specified, filter to get the correct excerpt if start_page: - qs = source_qs.filter(pages_digital__startswith=start_page) + qs = source_qs.filter(pages_orig__regex=f"^{start_page}([,-]|\b)") # if start page is NOT specified, ensure we do not retrieve an excerpt else: - qs = source_qs.filter(pages_digital__exact="") + qs = source_qs.filter(pages_orig__exact="") # if qs is empty and start page is not set, check if there is _one_ excerpt # for the source id; if there is, we want to return a permanent redirect From f8a493092bb5bb0017b49c7ceb6d97100e1e1fcd Mon Sep 17 00:00:00 2001 From: 
rlskoeser Date: Thu, 4 Apr 2024 10:48:58 -0400 Subject: [PATCH 43/71] Add unique constraint on source id + original page range --- ...digitizedwork_unique_sourceid_pages_orig.py | 18 ++++++++++++++++++ ppa/archive/models.py | 7 ++++++- 2 files changed, 24 insertions(+), 1 deletion(-) create mode 100644 ppa/archive/migrations/0024_digitizedwork_unique_sourceid_pages_orig.py diff --git a/ppa/archive/migrations/0024_digitizedwork_unique_sourceid_pages_orig.py b/ppa/archive/migrations/0024_digitizedwork_unique_sourceid_pages_orig.py new file mode 100644 index 00000000..328116aa --- /dev/null +++ b/ppa/archive/migrations/0024_digitizedwork_unique_sourceid_pages_orig.py @@ -0,0 +1,18 @@ +# Generated by Django 5.0.2 on 2024-04-04 14:48 + +from django.db import migrations, models + + +class Migration(migrations.Migration): + dependencies = [ + ("archive", "0023_save_excerpt_old_workid"), + ] + + operations = [ + migrations.AddConstraint( + model_name="digitizedwork", + constraint=models.UniqueConstraint( + fields=("source_id", "pages_orig"), name="unique_sourceid_pages_orig" + ), + ), + ] diff --git a/ppa/archive/models.py b/ppa/archive/models.py index e36c7912..204ca515 100644 --- a/ppa/archive/models.py +++ b/ppa/archive/models.py @@ -495,7 +495,12 @@ class Meta: constraints = [ models.UniqueConstraint( fields=["source_id", "pages_digital"], name="unique_sourceid_pagerange" - ) + ), + # we are now using original page range for unique id, + # so require source id + pages_orig to be unique + models.UniqueConstraint( + fields=["source_id", "pages_orig"], name="unique_sourceid_pages_orig" + ), ] def get_absolute_url(self): From 0d169c1225f28f9c0acffb0ae748031b623c82d1 Mon Sep 17 00:00:00 2001 From: rlskoeser Date: Thu, 4 Apr 2024 11:07:47 -0400 Subject: [PATCH 44/71] Add redirect based on old work id (first digital page) --- ppa/archive/tests/test_views.py | 22 ++++++++++++++++++++-- ppa/archive/views.py | 17 +++++++++++++---- 2 files changed, 33 insertions(+), 6 deletions(-) diff --git a/ppa/archive/tests/test_views.py b/ppa/archive/tests/test_views.py index 8704dd81..126f9fbf 100644 --- a/ppa/archive/tests/test_views.py +++ b/ppa/archive/tests/test_views.py @@ -457,9 +457,13 @@ def test_get_queryset(self, mock_index_items): # start page 20 should match 20 only and not 200 assert response.context["object"] == dial_excerpt2 - # create excerpt where there is no existing work + # create excerpt where there is no existing work; + # set old_workid based on first digital page excerpt = DigitizedWork.objects.create( - source_id="abc.123456", pages_orig="10-20", pages_digital="12-22" + source_id="abc.123456", + pages_orig="10-20", + pages_digital="12-22", + old_workid="abc.123456-p12", ) response = self.client.get(excerpt.get_absolute_url()) # retrieve url for source id with no start apge @@ -477,6 +481,20 @@ def test_get_queryset(self, mock_index_items): ) assert self.client.get(nonexistent_source_url).status_code == 404 + # if we try to find a work by the old id (first digital page), + # should redirect + response = self.client.get( + reverse( + "archive:detail", + kwargs={ + "source_id": excerpt.source_id, + "start_page": excerpt.first_page_digital(), + }, + ) + ) + assert response.status_code == 301 + assert response["Location"] == excerpt.get_absolute_url() + class TestDigitizedWorkListRequest(TestCase): fixtures = ["sample_digitized_works"] diff --git a/ppa/archive/views.py b/ppa/archive/views.py index 3cfff097..b4f845c2 100644 --- a/ppa/archive/views.py +++ b/ppa/archive/views.py @@ -304,11 +304,20 
@@ def get_queryset(self): else: qs = source_qs.filter(pages_orig__exact="") - # if qs is empty and start page is not set, check if there is _one_ excerpt - # for the source id; if there is, we want to return a permanent redirect - if not qs.exists() and not start_page: - if source_qs.count() == 1: + if not qs.exists(): + # if qs is empty and start page is not set, check if there is _one_ excerpt + # for the source id; if there is, we want to return a permanent redirect + if not start_page and source_qs.count() == 1: self.redirect_url = source_qs.first().get_absolute_url() + if start_page: + # if qs empty and start page _is_ set, check for an old id + # (previously excerpt ids were based on digital page range) + digwork_oldid = source_qs.filter( + old_workid="%(source_id)s-p%(start_page)s" % self.kwargs + ).first() + if digwork_oldid: + self.redirect_url = digwork_oldid.get_absolute_url() + # otherwise, return a 404 return qs From 1f1dba745b22de9804ba16b55ca82f796ee67c18 Mon Sep 17 00:00:00 2001 From: rlskoeser Date: Thu, 4 Apr 2024 11:24:54 -0400 Subject: [PATCH 45/71] Configure wagtail admin base url --- ppa/settings.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/ppa/settings.py b/ppa/settings.py index c6d14ebd..07b11eff 100644 --- a/ppa/settings.py +++ b/ppa/settings.py @@ -204,6 +204,8 @@ SITE_ID = 1 WAGTAIL_SITE_NAME = "Princeton Prosody Archive" +# needed by wagtail to generate URLs for notification emails +WAGTAILADMIN_BASE_URL = "https://prosody.princeton.edu/" WAGTAILEMBEDS_FINDERS = [ {"class": "wagtail.embeds.finders.oembed"}, From 93b42d8c6c8cf5eef20358254d261e708f5aea46 Mon Sep 17 00:00:00 2001 From: rlskoeser Date: Thu, 4 Apr 2024 11:31:19 -0400 Subject: [PATCH 46/71] Remove unsupported draftail feature 'document' --- ppa/pages/models.py | 1 - 1 file changed, 1 deletion(-) diff --git a/ppa/pages/models.py b/ppa/pages/models.py index 266373c0..83733a7e 100644 --- a/ppa/pages/models.py +++ b/ppa/pages/models.py @@ -248,7 +248,6 @@ class BodyContentBlock(blocks.StreamBlock): "ul", "hr", "blockquote", - "document", "superscript", "subscript", "strikethrough", From 1e286ebac5e5db58b1ee8fc2bfbdeb8296b9208d Mon Sep 17 00:00:00 2001 From: rlskoeser Date: Thu, 4 Apr 2024 12:55:12 -0400 Subject: [PATCH 47/71] Refactor django settings to use split-settings approach adapted from geniza project settings --- .github/workflows/unit-tests.yml | 7 +- README.rst | 2 +- ppa/settings/__init__.py | 14 +++ .../components/base.py} | 110 +++++------------- ppa/settings/components/debug.py | 12 ++ ppa/settings/environments/development.py | 16 +++ .../settings/environments/test.py | 39 +++---- ppa/{ => settings}/local_settings.py.sample | 0 requirements.txt | 1 + 9 files changed, 97 insertions(+), 104 deletions(-) create mode 100644 ppa/settings/__init__.py rename ppa/{settings.py => settings/components/base.py} (74%) create mode 100644 ppa/settings/components/debug.py create mode 100644 ppa/settings/environments/development.py rename ci/testsettings.py => ppa/settings/environments/test.py (55%) rename ppa/{ => settings}/local_settings.py.sample (100%) diff --git a/.github/workflows/unit-tests.yml b/.github/workflows/unit-tests.yml index 40922993..a9af212b 100644 --- a/.github/workflows/unit-tests.yml +++ b/.github/workflows/unit-tests.yml @@ -10,6 +10,7 @@ env: DB_NAME: ppa DB_USER: ppa DB_PASSWORD: ppa + DJANGO_ENV: test jobs: js-unit: @@ -89,12 +90,10 @@ jobs: pip install -r dev-requirements.txt - name: Setup local_settings.py - run: | - cp ci/testsettings.py ppa/local_settings.py -
python -c "import uuid; print('SECRET_KEY = \'%s\'' % uuid.uuid4())" >> ppa/local_settings.py + run: python -c "import uuid; print('SECRET_KEY = \'%s\'' % uuid.uuid4())" >> ppa/settings/local_settings.py - name: Run pytest - run: py.test --cov=./ --cov-report=xml + run: pytest --cov=./ --cov-report=xml - name: Upload test coverage to Codecov uses: codecov/codecov-action@v4 diff --git a/README.rst b/README.rst index a0698f60..0efa1dbb 100644 --- a/README.rst +++ b/README.rst @@ -55,7 +55,7 @@ Initial setup and installation: - Copy sample local settings and configure for your environment:: - cp ppa/local_settings.py.sample ppa/local_settings.py + cp ppa/settings/local_settings.py.sample ppa/settings/local_settings.py - Create a database, configure in local settings in the `DATABASES` dictionary, change `SECRET_KEY`, and run migrations:: diff --git a/ppa/settings/__init__.py b/ppa/settings/__init__.py new file mode 100644 index 00000000..6ed415b9 --- /dev/null +++ b/ppa/settings/__init__.py @@ -0,0 +1,14 @@ +from os import environ + +from split_settings.tools import include, optional + +ENV = environ.get("DJANGO_ENV") or "development" + +include( + "components/base.py", + "components/debug.py", + # optionally load environment-specific configuration + optional("environments/{0}.py".format(ENV)), + # for now, local settings is required + "local_settings.py", +) diff --git a/ppa/settings.py b/ppa/settings/components/base.py similarity index 74% rename from ppa/settings.py rename to ppa/settings/components/base.py index 07b11eff..06cdd50f 100644 --- a/ppa/settings.py +++ b/ppa/settings/components/base.py @@ -1,34 +1,22 @@ """ Django settings for ppa project. - -Generated by 'django-admin startproject' using Django 1.11.7. - -For more information on this file, see -https://docs.djangoproject.com/en/1.11/topics/settings/ - -For the full list of settings and their values, see -https://docs.djangoproject.com/en/1.11/ref/settings/ """ -import os +from pathlib import Path -# Quick-start development settings - unsuitable for production -# See https://docs.djangoproject.com/en/1.11/howto/deployment/checklist/ +# Build paths inside the project like this: BASE_DIR / 'subdir'. +# called from ppa-django/ppa/settings/__init__.py +# do NOT import this module directly, the path will be different +PROJECT_APP_PATH = Path(__file__).resolve().parent.parent +PROJECT_APP = PROJECT_APP_PATH.name +# base dir is one level up from that (ppa-django) +BASE_DIR = PROJECT_APP_PATH.parent # SECURITY WARNING: don't run with debug turned on in production! DEBUG = False ALLOWED_HOSTS = [] -######### -# PATHS # -######### - -# Full filesystem path to the project. -PROJECT_APP_PATH = os.path.dirname(os.path.abspath(__file__)) -PROJECT_APP = os.path.basename(PROJECT_APP_PATH) -PROJECT_ROOT = BASE_DIR = os.path.dirname(PROJECT_APP_PATH) - # Every cache key will get prefixed with this value - here we set it to # the name of the directory the project is in to try and use something # project specific. @@ -42,15 +30,15 @@ # Don't put anything in this directory yourself; store your static files # in apps' "static/" subdirectories and in STATICFILES_DIRS. # Example: "/home/media/media.lawrence.com/static/" -STATIC_ROOT = os.path.join(PROJECT_ROOT, STATIC_URL.strip("/")) +STATIC_ROOT = BASE_DIR / STATIC_URL.strip("/") # Additional locations of static files STATICFILES_DIRS = [ # Put strings here, like "/home/html/static" or "C:/www/django/static". # Always use forward slashes, even on Windows. 
# Don't forget to use absolute paths, not relative paths. - os.path.join(BASE_DIR, "sitemedia"), - os.path.join(BASE_DIR, "bundles"), + BASE_DIR / "sitemedia", + BASE_DIR / "bundles", ] # URL that handles the media served from MEDIA_ROOT. Make sure to use a @@ -60,7 +48,7 @@ # Absolute filesystem path to the directory that will hold user-uploaded files. # Example: "/home/media/media.lawrence.com/media/" -MEDIA_ROOT = os.path.join(PROJECT_ROOT, *MEDIA_URL.strip("/").split("/")) +MEDIA_ROOT = BASE_DIR / MEDIA_URL.strip("/") STATICFILES_FINDERS = ( "django.contrib.staticfiles.finders.FileSystemFinder", @@ -129,11 +117,10 @@ ROOT_URLCONF = "ppa.urls" - TEMPLATES = [ { "BACKEND": "django.template.backends.django.DjangoTemplates", - "DIRS": [os.path.join(BASE_DIR, "templates")], + "DIRS": [BASE_DIR / "templates"], "OPTIONS": { "context_processors": [ "django.template.context_processors.debug", @@ -154,18 +141,29 @@ WSGI_APPLICATION = "ppa.wsgi.application" - -# Database -# https://docs.djangoproject.com/en/1.11/ref/settings/#databases - DATABASES = { "default": { - "ENGINE": "django.db.backends.sqlite3", - "NAME": os.path.join(BASE_DIR, "db.sqlite3"), + "ENGINE": "django.db.backends.postgresql", + "NAME": "ppa", + "USER": "ppa", + "PASSWORD": "", + "HOST": "", # empty string for localhost + "PORT": "", # empty string for default + } +} + +SOLR_CONNECTIONS = { + "default": { + "URL": "http://localhost:8983/solr/", + "COLLECTION": "ppa", + "CONFIGSET": "ppa", + "TEST": { + # set aggressive commitWithin when testing + "COMMITWITHIN": 750, + }, } } -# preserve django 3.1 behavior DEFAULT_AUTO_FIELD = "django.db.models.AutoField" # Password validation @@ -212,12 +210,9 @@ {"class": "ppa.pages.embed_finders.GlitchEmbedFinder"}, ] -GRAPPELLI_ADMIN_TITLE = "Princeton Prosody Archive Admin" - # username for logging activity by local scripts SCRIPT_USERNAME = "script" - # PUCAS configuration for CAS/LDAP login and user provisioning. # Only includes non-sensitive configurations that do not change PUCAS_LDAP = { @@ -235,7 +230,7 @@ "DEFAULT": { "CACHE": True, "BUNDLE_DIR_NAME": "bundles/", # must end with slash - "STATS_FILE": os.path.join(BASE_DIR, "webpack-stats.json"), + "STATS_FILE": BASE_DIR / "webpack-stats.json", "POLL_INTERVAL": 0.1, "TIMEOUT": None, "IGNORE": [r".+\.hot-update.js", r".+\.map"], @@ -291,44 +286,3 @@ # load a manifest file CSP_MANIFEST_SRC = "'self'" - -################## -# LOCAL SETTINGS # -################## - -# (local settings import logic adapted from mezzanine) - -# Allow any settings to be defined in local_settings.py which should be -# ignored in your version control system allowing for settings to be -# defined per machine. - -# Instead of doing "from .local_settings import *", we use exec so that -# local_settings has full access to everything defined in this module. -# Also force into sys.modules so it's visible to Django's autoreload. 
- -f = os.path.join(BASE_DIR, "ppa", "local_settings.py") -if os.path.exists(f): - import imp - import sys - - module_name = "ppa.local_settings" - module = imp.new_module(module_name) - module.__file__ = f - sys.modules[module_name] = module - exec(open(f, "rb").read()) - -# if in debug mode and django-debug-toolbar is available, add to installed apps -if DEBUG: - try: - INSTALLED_APPS.append("debug_toolbar") - MIDDLEWARE.append("debug_toolbar.middleware.DebugToolbarMiddleware") - except ImportError: - pass - - # allow webpack dev server through CSP when in DEBUG - CSP_SCRIPT_SRC += ("http://localhost:3000", "'unsafe-eval'", "'unsafe-inline'") - CSP_STYLE_SRC += ("http://localhost:3000", "'unsafe-inline'") - CSP_CONNECT_SRC += ( - "http://localhost:3000", - "ws://localhost:3000", - ) diff --git a/ppa/settings/components/debug.py b/ppa/settings/components/debug.py new file mode 100644 index 00000000..cd2fde28 --- /dev/null +++ b/ppa/settings/components/debug.py @@ -0,0 +1,12 @@ +# if django-debug-toolbar is installed, enable it + +from ppa.settings.components.base import INSTALLED_APPS, MIDDLEWARE + +# Configure internal IPs for access to view debug toolbar +INTERNAL_IPS = ["127.0.0.1", "localhost"] + +try: + INSTALLED_APPS.append("debug_toolbar") + MIDDLEWARE += ("debug_toolbar.middleware.DebugToolbarMiddleware",) +except ImportError: + pass diff --git a/ppa/settings/environments/development.py b/ppa/settings/environments/development.py new file mode 100644 index 00000000..5d6f06f1 --- /dev/null +++ b/ppa/settings/environments/development.py @@ -0,0 +1,16 @@ +from ppa.settings import CSP_SCRIPT_SRC, CSP_STYLE_SRC, CSP_CONNECT_SRC + + +DEBUG = True + +# ALLOWED_HOSTS = ["*"] +CSP_REPORT_ONLY = True + +if DEBUG: + # allow webpack dev server through CSP when in DEBUG + CSP_SCRIPT_SRC += ("http://localhost:3000", "'unsafe-eval'", "'unsafe-inline'") + CSP_STYLE_SRC += ("http://localhost:3000", "'unsafe-inline'") + CSP_CONNECT_SRC += ( + "http://localhost:3000", + "ws://localhost:3000", + ) diff --git a/ci/testsettings.py b/ppa/settings/environments/test.py similarity index 55% rename from ci/testsettings.py rename to ppa/settings/environments/test.py index 8d22934f..fa8d90c4 100644 --- a/ci/testsettings.py +++ b/ppa/settings/environments/test.py @@ -1,14 +1,9 @@ -# This file is exec'd from settings.py, so it has access to and can -# modify all the variables in settings.py. +from ppa.settings import DATABASES, SOLR_CONNECTIONS -# If this file is changed in development, the development server will -# have to be manually restarted because changes will not be noticed -# immediately. - -DEBUG = False - -DATABASES = { - "default": { +# These settings correspond to the service container settings in the +# .github/workflow .yml files. 
+DATABASES["default"].update( + { "ENGINE": "django.db.backends.postgresql", "NAME": "ppa", "PASSWORD": "ppa", @@ -18,25 +13,27 @@ "TEST": { "CHARSET": "utf8", }, - }, -} - -# required by mezzanine for unit tests -ALLOWED_HOSTS = ["*"] + } +) -# required for integration tests that query Solr -SOLR_CONNECTIONS = { - "default": { +SOLR_CONNECTIONS["default"].update( + { "URL": "http://localhost:8983/solr/", "COLLECTION": "ppa", "CONFIGSET": "ppa", + # set aggressive commitWithin for test + "COMMITWITHIN": 750, "TEST": {"COMMITWITHIN": 100}, } -} +) + +# turn off debug so we see 404s when testing +DEBUG = False + +# required for tests when DEBUG = False +ALLOWED_HOSTS = ["*"] # use a fake webpack loader to ignore missing assets for unit tests WEBPACK_LOADER = { "DEFAULT": {"LOADER_CLASS": "webpack_loader.loaders.FakeWebpackLoader"} } - -# secret key added as a travis build step diff --git a/ppa/local_settings.py.sample b/ppa/settings/local_settings.py.sample similarity index 100% rename from ppa/local_settings.py.sample rename to ppa/settings/local_settings.py.sample diff --git a/requirements.txt b/requirements.txt index 92e81316..0984208a 100644 --- a/requirements.txt +++ b/requirements.txt @@ -29,5 +29,6 @@ django-adminlogentries django-import-export psycopg2-binary multiprocess +django-split-settings # only needed for the 'generate_textcorpus' manage command orjsonl \ No newline at end of file From f907a8fc92e9bb37e8f1ec997ff982b502987407 Mon Sep 17 00:00:00 2001 From: rlskoeser Date: Thu, 4 Apr 2024 12:56:50 -0400 Subject: [PATCH 48/71] Simplify unit test setup; only testing on postgresql, not mysql --- .github/workflows/unit-tests.yml | 10 ---------- 1 file changed, 10 deletions(-) diff --git a/.github/workflows/unit-tests.yml b/.github/workflows/unit-tests.yml index a9af212b..ae05b290 100644 --- a/.github/workflows/unit-tests.yml +++ b/.github/workflows/unit-tests.yml @@ -7,9 +7,6 @@ on: - cron: "0 16 * * 2" env: - DB_NAME: ppa - DB_USER: ppa - DB_PASSWORD: ppa DJANGO_ENV: test jobs: @@ -50,13 +47,6 @@ jobs: ports: - 8983:8983 steps: - # Set the value of DJANGO_DB_BACKEND which is used in ci/testsettings.py to - # configure django's ORM based on whether we're testing postgres or mysql - - name: Set django database backend adapter - env: - BACKEND: postgresql - run: echo "DJANGO_DB_BACKEND=$(echo "$BACKEND")" >> $GITHUB_ENV - - name: Checkout repository uses: actions/checkout@v4 From d918cb3ea9d492d5ae504b89e43b30ef5607a1b4 Mon Sep 17 00:00:00 2001 From: rlskoeser Date: Thu, 4 Apr 2024 13:02:08 -0400 Subject: [PATCH 49/71] Update sphinx workflow for local settings path change --- .github/workflows/sphinx_docs.yml | 2 +- sphinx-docs/conf.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/.github/workflows/sphinx_docs.yml b/.github/workflows/sphinx_docs.yml index 825ea71e..74b29d24 100644 --- a/.github/workflows/sphinx_docs.yml +++ b/.github/workflows/sphinx_docs.yml @@ -36,7 +36,7 @@ jobs: run: pip install -r dev-requirements.txt - name: Setup local_settings.py - run: python -c "import uuid; print('SECRET_KEY = \'%s\'' % uuid.uuid4())" >> ppa/local_settings.py + run: python -c "import uuid; print('SECRET_KEY = \'%s\'' % uuid.uuid4())" >> ppa/settings/local_settings.py - name: Build Sphinx docs run: cd sphinx-docs && make -b coverage html diff --git a/sphinx-docs/conf.py b/sphinx-docs/conf.py index 87692380..c64fdee6 100644 --- a/sphinx-docs/conf.py +++ b/sphinx-docs/conf.py @@ -63,7 +63,7 @@ # General information about the project. 
project = "Princeton Prosody Archive" -copyright = "2018, CDH @ Princeton University" +copyright = "2024, CDH @ Princeton University" author = "CDH @ Princeton University" # The version info for the project you're documenting, acts as replacement for From 05d2f3ec59cb5a407d2b429234bd85bb0d0421cc Mon Sep 17 00:00:00 2001 From: rlskoeser Date: Thu, 4 Apr 2024 14:11:44 -0400 Subject: [PATCH 50/71] Remove unused test requirements file (out of date, overlaps with dev) --- test-requirements.txt | 4 ---- 1 file changed, 4 deletions(-) delete mode 100644 test-requirements.txt diff --git a/test-requirements.txt b/test-requirements.txt deleted file mode 100644 index b461b066..00000000 --- a/test-requirements.txt +++ /dev/null @@ -1,4 +0,0 @@ -pytest>=3.6,<5.4 -pytest-django -pytest-cov -django-webpack-loader \ No newline at end of file From ebe3607495de016a02732315876951c9a790ef75 Mon Sep 17 00:00:00 2001 From: rlskoeser Date: Thu, 4 Apr 2024 14:17:13 -0400 Subject: [PATCH 51/71] Suppress deprecation warnings when running pytest - configure pythonpath in pytest.ini (requires pytest 7 or greater) - simplify pytest instructions in readme; add note about suppressed warnings --- DEPLOYNOTES.rst | 6 ++++++ README.rst | 19 ++++++------------- dev-requirements.txt | 2 +- pytest.ini | 6 ++++++ 4 files changed, 19 insertions(+), 14 deletions(-) diff --git a/DEPLOYNOTES.rst b/DEPLOYNOTES.rst index 8d59b1af..19773102 100644 --- a/DEPLOYNOTES.rst +++ b/DEPLOYNOTES.rst @@ -3,6 +3,12 @@ Deploy and Upgrade notes ======================== +3.12 +---- + +* Settings are now configured with django-split-settings as a module; + the path to local_settings.py is now ppa/settings/local_settings.py + 3.11.2 ------ diff --git a/README.rst b/README.rst index 0efa1dbb..76482cba 100644 --- a/README.rst +++ b/README.rst @@ -109,29 +109,22 @@ either set of assets frequently. These two processes are separate as well:: Tests ~~~~~ -Python unit tests are written with `py.test `_ but use +Python unit tests are written with `pytest `_ but use Django fixture loading and convenience testing methods when that makes things easier. To run them, first install development requirements:: pip install -r dev-requirements.txt -Run tests using py.test. Note that this currently requires the -top level project directory be included in your python path. You can -accomplish this either by calling pytest via python:: +To run all python unit tests, use: `pytest` - python -m pytest - -Or, if you wish to use the ``pytest`` command directly, simply add the -top-level project directory to your python path environment variable:: - - setenv PYTHONPATH . # csh - export PYTHONPATH=. # bash +Some deprecation warnings for dependencies have been suppressed in +pytest.ini; to see warnings, run with `pytest -Wd`. Make sure you configure a test solr connection and set up an empty Solr core using the same instructions as for the development core. -Note that python unit tests access a test server over HTTP, and therefore -expect static files to be compiled – see "Frontend development setup" above +Some python unit tests access rendered views, and therefore +expect static files to be compiled; see "Frontend development setup" above for how to do this. In a CI context, we use a fake webpack loader backend that ignores missing assets. 
diff --git a/dev-requirements.txt b/dev-requirements.txt index d47cc703..dffb9f5e 100644 --- a/dev-requirements.txt +++ b/dev-requirements.txt @@ -1,5 +1,5 @@ -r requirements.txt -pytest>=5.0 +pytest>=7.0 pytest-django>=4.5.2 pytest-cov django-debug-toolbar diff --git a/pytest.ini b/pytest.ini index c6bbe1ea..06c46753 100644 --- a/pytest.ini +++ b/pytest.ini @@ -1,4 +1,5 @@ [pytest] +pythonpath = . DJANGO_SETTINGS_MODULE=ppa.settings # look for tests in standard django test locations python_files = "ppa/**/tests.py" "ppa/**/tests/*.py" "ppa/tests.py" @@ -6,3 +7,8 @@ python_files = "ppa/**/tests.py" "ppa/**/tests/*.py" "ppa/tests.py" addopts = -p parasolr.django.disconnect_indexing # limit testpath to speed up collecting step testpaths = ppa +# suppress warnings (several coming up for dependencies as of 2024-04) +filterwarnings = + ignore::django.utils.deprecation.RemovedInDjango51Warning + ignore::django.utils.deprecation.RemovedInDjango60Warning + ignore::DeprecationWarning \ No newline at end of file From e3b482d329a0e86900f34f1d7691f709c3f305c2 Mon Sep 17 00:00:00 2001 From: rlskoeser Date: Thu, 4 Apr 2024 14:33:51 -0400 Subject: [PATCH 52/71] Update and clean up sample local settings --- ppa/settings/local_settings.py.sample | 81 ++++++--------------------- 1 file changed, 18 insertions(+), 63 deletions(-) diff --git a/ppa/settings/local_settings.py.sample b/ppa/settings/local_settings.py.sample index fcb2cce8..078f78b2 100644 --- a/ppa/settings/local_settings.py.sample +++ b/ppa/settings/local_settings.py.sample @@ -1,63 +1,36 @@ -# Sample local settings -# Copy to derrida/local_settings.py and configure -# includes sensitive configurations, should *not* be -# checked into version control +# configurations that should not be checked into version control +# Copy to ppa/settings/local_settings.py and configure import os -# Build paths inside the project like this: os.path.join(BASE_DIR, ...) -BASE_DIR = os.path.dirname(os.path.dirname(os.path.abspath(__file__))) - - -# SECURITY WARNING: don't run with debug turned on in production! -DEBUG = True - -# Configure internal IPs for access to view debug toolbar -# INTERNAL_IPS = ['127.0.0.1'] - -ALLOWED_HOSTS = [] # SECURITY WARNING: keep the secret key used in production secret! # http://www.miniwebtool.com/django-secret-key-generator/ SECRET_KEY = '' - # Email address for a technical contact. 
# If set, will be used in From header for HathiTrust API requests # TECHNICAL_CONTACT = '' - # Turn this on in test/QA site to show test banner -#SHOW_TEST_WARNING = True +# SHOW_TEST_WARNING = True # Database -# https://docs.djangoproject.com/en/1.10/ref/settings/#databases -DATABASES = { - # sqlite for development - 'default': { - 'ENGINE': 'django.db.backends.sqlite3', - 'NAME': os.path.join(BASE_DIR, 'db.sqlite3'), - } - # postgresql for qa/prod - # "default": { - # "ENGINE": "django.db.backends.postgresql", - # "NAME": "ppa", - # "USER": "ppa", - # "PASSWORD": "ppa", - # "HOST": "", # empty string for localhost - # "PORT": "", # empty string for default - # }, -} - -SOLR_CONNECTIONS = { - 'default': { - 'URL': 'http://localhost:8983/solr/', - 'COLLECTION': 'ppa', - 'CONFIGSET': 'ppa', - 'TEST': { - 'COMMITWITHIN': 100 - } +# override default database settings as needed +# default name and user are both "ppa" +# DATABASES["default"]["NAME"] = "" +# DATABASES["default"]["USER"] = "" +DATABASES["default"]["PASSWORD"] = "pass!@#$" + +# override default Solr configuration as needed +# default collection and configset are both "ppa" +SOLR_CONNECTIONS["default"].update( + { + "URL": "http://localhost:8983/solr/", +# "COLLECTION": "ppa", +# "CONFIGSET": "ppa", + "TEST": {"COMMITWITHIN": 100}, } -} +) # local path to hathi pairtree data provided via rsync HATHI_DATA = '/path/to/hathi_pairtree_root' @@ -81,13 +54,6 @@ PUCAS_LDAP.update({ 'SEARCH_FILTER': "(uid=%(user)s)", }) - -# Absolute path to the directory static files should be collected to. -# Don't put anything in this directory yourself; store your static files -# in apps' "static/" subdirectories and in STATICFILES_DIRS. -# Example: "/home/media/media.lawrence.com/static/" -STATIC_ROOT = os.path.join(BASE_DIR, STATIC_URL.strip("/")) - # Admin email configuration for error messages # ADMINS = [('name', 'email')] # SERVER_EMAIL = ' @@ -132,14 +98,3 @@ LOGGING = { }, } } - -# https://github.com/mozilla/django-csp -# Content security policy controls - see `settings.py` for policy settings. -# In development, leave both lines commented out to block & not report. -# In QA, set REPORT_ONLY to True and specify a "report-only" endpoint. -# In production, set REPORT_ONLY to False and specify an "enforced" endpoint. 
-# CSP_REPORT_ONLY = False -# CSP_REPORT_URI = '' - -# Turn off caching for static assets -WEBPACK_LOADER['DEFAULT']['CACHE'] = False From c611cbe48ac4919d0d64ae9b7f20849e499fa7a3 Mon Sep 17 00:00:00 2001 From: rlskoeser Date: Thu, 4 Apr 2024 11:17:38 -0400 Subject: [PATCH 53/71] Cleanup debug print statements in unit tests --- ppa/archive/tests/test_models.py | 4 ---- 1 file changed, 4 deletions(-) diff --git a/ppa/archive/tests/test_models.py b/ppa/archive/tests/test_models.py index d8bc764b..eaa2b7f1 100644 --- a/ppa/archive/tests/test_models.py +++ b/ppa/archive/tests/test_models.py @@ -553,15 +553,11 @@ def test_index_data(self): def test_get_absolute_url(self): work = DigitizedWork.objects.filter(pages_orig="").first() - print(work) assert work.get_absolute_url() == reverse( "archive:detail", kwargs={"source_id": work.source_id} ) work.pages_orig = "11-13" - print(work) - print(work.first_page()) - print(work.first_page_original()) assert work.get_absolute_url() == reverse( "archive:detail", kwargs={"source_id": work.source_id, "start_page": 11} ) From 37ec6e0333b449776f07e15d95a64d78a865b919 Mon Sep 17 00:00:00 2001 From: rlskoeser Date: Thu, 4 Apr 2024 14:55:42 -0400 Subject: [PATCH 54/71] Document order of steps (rsync, reindex, correct excerpts) for deploy --- DEPLOYNOTES.rst | 20 ++++++++++++++++++++ 1 file changed, 20 insertions(+) diff --git a/DEPLOYNOTES.rst b/DEPLOYNOTES.rst index 19773102..dac7c318 100644 --- a/DEPLOYNOTES.rst +++ b/DEPLOYNOTES.rst @@ -8,6 +8,26 @@ Deploy and Upgrade notes * Settings are now configured with django-split-settings as a module; the path to local_settings.py is now ppa/settings/local_settings.py +* Index ids for excerpts have changed; this requires reindexing works + and pages for excerpts and articles; pages should be indexed + after running rsync. To reindex works:: + + python manage.py index -i work + +* Local pairtree data should be updated for all HathiTrust works:: + + python manage.py hathi_rsync + +* After pairtree content has been updated, pages should be updated + in Solr:: + + python manage.py index_pages + +* Digital page ranges for HathiTrust excerpts should be corrected + using a CSV file provided by the project team:: + + python manage.py adjust_excerpts HT_excerpt_corrections.csv + 3.11.2 ------ From 3dd8e8b2b2e31de4a26f35bceadc638f5889c4f8 Mon Sep 17 00:00:00 2001 From: rlskoeser Date: Thu, 4 Apr 2024 15:04:48 -0400 Subject: [PATCH 55/71] Fix formatting in deploy notes; clarify local settings change [skip ci] --- DEPLOYNOTES.rst | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/DEPLOYNOTES.rst b/DEPLOYNOTES.rst index dac7c318..309ef291 100644 --- a/DEPLOYNOTES.rst +++ b/DEPLOYNOTES.rst @@ -7,26 +7,26 @@ Deploy and Upgrade notes ---- * Settings are now configured with django-split-settings as a module; - the path to local_settings.py is now ppa/settings/local_settings.py + local_settings.py must be moved to ppa/settings/local_settings.py * Index ids for excerpts have changed; this requires reindexing works and pages for excerpts and articles; pages should be indexed after running rsync. 
To reindex works:: - python manage.py index -i work + python manage.py index -i work * Local pairtree data should be updated for all HathiTrust works:: - python manage.py hathi_rsync + python manage.py hathi_rsync * After pairtree content has been updated, pages should be updated in Solr:: - python manage.py index_pages + python manage.py index_pages * Digital page ranges for HathiTrust excerpts should be corrected using a CSV file provided by the project team:: - python manage.py adjust_excerpts HT_excerpt_corrections.csv + python manage.py adjust_excerpts HT_excerpt_corrections.csv 3.11.2 From 12d1da978c7c11cce295cc5c3f7844703a1e4fe0 Mon Sep 17 00:00:00 2001 From: rlskoeser Date: Thu, 4 Apr 2024 15:22:01 -0400 Subject: [PATCH 56/71] Fix configuration for optional django-debug-toolbar --- ppa/settings/components/debug.py | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/ppa/settings/components/debug.py b/ppa/settings/components/debug.py index cd2fde28..622ad4dc 100644 --- a/ppa/settings/components/debug.py +++ b/ppa/settings/components/debug.py @@ -1,10 +1,9 @@ -# if django-debug-toolbar is installed, enable it - -from ppa.settings.components.base import INSTALLED_APPS, MIDDLEWARE +from ppa.settings import INSTALLED_APPS, MIDDLEWARE # Configure internal IPs for access to view debug toolbar INTERNAL_IPS = ["127.0.0.1", "localhost"] +# if django-debug-toolbar is installed, enable it try: INSTALLED_APPS.append("debug_toolbar") MIDDLEWARE += ("debug_toolbar.middleware.DebugToolbarMiddleware",) From 07ac7a538afbcf1ad78a0093c3c03e653b29e4c6 Mon Sep 17 00:00:00 2001 From: rlskoeser Date: Thu, 4 Apr 2024 15:22:01 -0400 Subject: [PATCH 57/71] Fix configuration for optional django-debug-toolbar --- ppa/urls.py | 23 ++++++++++++----------- 1 file changed, 12 insertions(+), 11 deletions(-) diff --git a/ppa/urls.py b/ppa/urls.py index 56aaa640..3543732e 100644 --- a/ppa/urls.py +++ b/ppa/urls.py @@ -58,16 +58,17 @@ # serve media content for development if settings.DEBUG: - import debug_toolbar - urlpatterns = [ - # include debug toolbar urls first to avoid getting caught by other urls - re_path(r"^__debug__/", include(debug_toolbar.urls)), - re_path( - r"^media/(?P.*)$", - serve, - { - "document_root": settings.MEDIA_ROOT, - }, - ), + re_path(r"^media/(?P.*)$", serve, {"document_root": settings.MEDIA_ROOT}), ] + urlpatterns + + try: + # include debug toolbar when available + import debug_toolbar + + urlpatterns = [ + # include debug toolbar urls first to avoid getting caught by other urls + re_path(r"^__debug__/", include(debug_toolbar.urls)), + ] + urlpatterns + except ImportError: + pass From ed0f4d8b69cbf416ab84a1a6a3c30cf4b8065eee Mon Sep 17 00:00:00 2001 From: rlskoeser Date: Thu, 4 Apr 2024 15:31:23 -0400 Subject: [PATCH 58/71] Actually commit import of debug_toolbar to check for import error - mark so ruff does not clean up unused import --- ppa/settings/components/debug.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/ppa/settings/components/debug.py b/ppa/settings/components/debug.py index 622ad4dc..39f55369 100644 --- a/ppa/settings/components/debug.py +++ b/ppa/settings/components/debug.py @@ -5,6 +5,8 @@ # if django-debug-toolbar is installed, enable it try: + import debug_toolbar # noqa: F401 (do not clean up unused import) + INSTALLED_APPS.append("debug_toolbar") MIDDLEWARE += ("debug_toolbar.middleware.DebugToolbarMiddleware",) except ImportError: From 00b8771d04a8d424d1dea5909dab8d65e76c9b7f Mon Sep 17 00:00:00 2001 From: rlskoeser Date: Thu, 4 Apr 2024 
15:59:26 -0400 Subject: [PATCH 59/71] Index work first page as string instead of integer in solr since it may now include non-numeric labels, e.g. roman numerals --- ppa/archive/models.py | 2 +- ppa/archive/solr.py | 48 +++++++++++++++++++------------------------ 2 files changed, 22 insertions(+), 28 deletions(-) diff --git a/ppa/archive/models.py b/ppa/archive/models.py index 204ca515..7b1af106 100644 --- a/ppa/archive/models.py +++ b/ppa/archive/models.py @@ -924,7 +924,7 @@ def index_data(self): return { "id": index_id, "source_id": self.source_id, - "first_page_i": self.first_page(), + "first_page_s": self.first_page(), "group_id_s": index_id, # for grouping pages by work or excerpt "source_t": self.get_source_display(), "source_url": self.source_url, diff --git a/ppa/archive/solr.py b/ppa/archive/solr.py index 5cb251e1..04e7961e 100644 --- a/ppa/archive/solr.py +++ b/ppa/archive/solr.py @@ -6,7 +6,6 @@ class ArchiveSearchQuerySet(AliasedSolrQuerySet): - # search title query field syntax # (query field configured in solr config; searches title & subtitle with # boosting) @@ -32,7 +31,7 @@ class ArchiveSearchQuerySet(AliasedSolrQuerySet): "collections", "source_t", "image_id_s", - "first_page_i", + "first_page_s", "source_url", "work_type_s", "book_journal_s", @@ -44,7 +43,7 @@ class ArchiveSearchQuerySet(AliasedSolrQuerySet): aliases = { "source_t": "source", "image_id_s": "image_id", - "first_page_i": "first_page", + "first_page_s": "first_page", "work_type_s": "work_type", "book_journal_s": "book_journal", "group_id_s": "group_id", @@ -55,8 +54,9 @@ class ArchiveSearchQuerySet(AliasedSolrQuerySet): within_cluster_id = None def __init__(self, solr=None): - # field aliases: keys return the fields that will be returned from Solr for search page; - # values provide an aliased name if it should be different than solr index field. + # field aliases: keys return the fields that will be returned + # from Solr for search page; values provide an aliased name if + # it should be different than solr index field. # use alias if one is set, otherwise use field name self.field_aliases = { self.aliases.get(key, key): key for key in self.return_fields @@ -120,12 +120,13 @@ def query_opts(self): # when searching within a cluster, collapse on group id collapse_on = "group_id_s" if self.within_cluster_id else "cluster_id_s" - # @NOTE: Role of order here in separating works from pages (works < pages) may need to be revisited eventually. + # NOTE: Role of order here in separating works from pages (works < pages) + # may need to be revisited eventually. collapse_filter = '{!collapse field=%s sort="order asc"}' % collapse_on - - # We can apply collapse here since we need it for both keyword query case and not - # Remember that cluster_id_s is now defined as `str(self.cluster) if self.cluster else index_id` in models.py. - # So collapsing by "cluster" id implicitly includes works with no cluster id set. 
+ + # We can apply collapse here since we need it for default search + # cluster id corresponds to index id for works not in a cluster, + # so collapsing by cluster id still includes works with no cluster id qs_copy = qs_copy.filter(collapse_filter) # if there is no keyword search present, only works should @@ -163,16 +164,10 @@ def query_opts(self): qs_copy = qs_copy.raw_query_parameters(work_query=work_query) content_query = "content:(%s)" % self.keyword_query - qs_copy = ( - qs_copy.search(combined_query) - # .filter(collapse_filter) # This no longer needed since applied above in `qs_copy = qs_copy.filter(collapse_filter)` - .raw_query_parameters( - content_query=content_query, - keyword_query=self.keyword_query, - # expand="true", - work_query=work_query, - # **{"expand.rows": 1}, - ) + qs_copy = qs_copy.search(combined_query).raw_query_parameters( + content_query=content_query, + keyword_query=self.keyword_query, + work_query=work_query, ) return qs_copy._base_query_opts() @@ -182,18 +177,17 @@ def _base_query_opts(self): return super().query_opts() - class PageSearchQuerySet(AliasedSolrQuerySet): # aliases for any fields we want to rename for search and display # includes non-renamed fields to push them into the return field_aliases = { - "id":"id", - "score":"score", - "order":"order", - "title":"title", - "label":"label", + "id": "id", + "score": "score", + "order": "order", + "title": "title", + "label": "label", "source_id": "source_id", "image_id": "image_id_s", "group_id": "group_id_s", "cluster_id": "cluster_id_s", - } \ No newline at end of file + } From 2065c1ea81e8082112381bdc5752431b68033ead Mon Sep 17 00:00:00 2001 From: rlskoeser Date: Fri, 5 Apr 2024 12:08:16 -0400 Subject: [PATCH 60/71] Link to original source must use first digital page, not original ref #555 --- ppa/archive/templates/archive/digitizedwork_detail.html | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/ppa/archive/templates/archive/digitizedwork_detail.html b/ppa/archive/templates/archive/digitizedwork_detail.html index 2c11d9f6..cac145aa 100644 --- a/ppa/archive/templates/archive/digitizedwork_detail.html +++ b/ppa/archive/templates/archive/digitizedwork_detail.html @@ -110,11 +110,11 @@

{{ object.title }}

{{ object.get_source_link_label }} - {% if object.pages_digital %} {# if page range is defined (excerpt/article), link to first page in range #} + {% if object.pages_digital %} {# if page range is defined (excerpt/article), link to first *digital* page in range #} {% if object.source == object.HATHI %} - {{ object.source_id }} + {{ object.source_id }} {% elif object.source == object.GALE %} - {{ object.source_id }} + {{ object.source_id }} {% endif %} {% else %} {# when there is no page range, use source url #} {{ object.source_id }} From 5d5ed2af1ec07439526179184bcc6e761aeefda1 Mon Sep 17 00:00:00 2001 From: rlskoeser Date: Mon, 8 Apr 2024 16:12:29 -0400 Subject: [PATCH 61/71] Adjust and test view regex for excerpt with single page ref #555 --- ppa/archive/tests/test_views.py | 6 ++++++ ppa/archive/views.py | 2 +- 2 files changed, 7 insertions(+), 1 deletion(-) diff --git a/ppa/archive/tests/test_views.py b/ppa/archive/tests/test_views.py index 126f9fbf..c4992150 100644 --- a/ppa/archive/tests/test_views.py +++ b/ppa/archive/tests/test_views.py @@ -457,6 +457,12 @@ def test_get_queryset(self, mock_index_items): # start page 20 should match 20 only and not 200 assert response.context["object"] == dial_excerpt2 + # single page should also work + dial_excerpt2.pages_orig = "20" + dial_excerpt2.save() + response = self.client.get(dial_excerpt2.get_absolute_url()) + assert response.context["object"] == dial_excerpt2 + # create excerpt where there is no existing work; # set old_workid based on first digital page excerpt = DigitizedWork.objects.create( diff --git a/ppa/archive/views.py b/ppa/archive/views.py index b4f845c2..29c64668 100644 --- a/ppa/archive/views.py +++ b/ppa/archive/views.py @@ -299,7 +299,7 @@ def get_queryset(self): start_page = self.kwargs.get("start_page") # if start page is specified, filter to get the correct excerpt if start_page: - qs = source_qs.filter(pages_orig__regex=f"^{start_page}([,-]|\b)") + qs = source_qs.filter(pages_orig__regex=f"^{start_page}([,-]|\b|$)") # if start page is NOT specified, ensure we do not retrieve an excerpt else: qs = source_qs.filter(pages_orig__exact="") From c9e4f8627ec60b44b575e4987cbbfbe8be63aa84 Mon Sep 17 00:00:00 2001 From: rlskoeser Date: Mon, 8 Apr 2024 16:13:14 -0400 Subject: [PATCH 62/71] Update DigitizedWork string method to use original pages ref #555 --- ppa/archive/models.py | 6 +++--- ppa/archive/tests/test_models.py | 5 +++-- 2 files changed, 6 insertions(+), 5 deletions(-) diff --git a/ppa/archive/models.py b/ppa/archive/models.py index 7b1af106..958f14ec 100644 --- a/ppa/archive/models.py +++ b/ppa/archive/models.py @@ -516,9 +516,9 @@ def get_absolute_url(self): def __str__(self): """Default string display. 
Uses :attr:`source_id` - and :attr:`pages_digital` if any""" - if self.pages_digital: - return "%s (%s)" % (self.source_id, self.pages_digital) + and :attr:`pages_orig` if any""" + if self.pages_orig: + return "%s (%s)" % (self.source_id, self.pages_orig) return self.source_id @property diff --git a/ppa/archive/tests/test_models.py b/ppa/archive/tests/test_models.py index eaa2b7f1..0c1d3d78 100644 --- a/ppa/archive/tests/test_models.py +++ b/ppa/archive/tests/test_models.py @@ -190,8 +190,9 @@ def test_str(self): digwork = DigitizedWork(source_id="njp.32101013082597") assert str(digwork) == digwork.source_id - # with pages - digwork.pages_digital = "20-25" + # with pages - should use *original*, not digital + digwork.pages_orig = "20-25" + digwork.pages_digital = "22-27" assert str(digwork) == "%s (20-25)" % digwork.source_id def test_display_title(self): From e21e49fc0f1918a41ac922824e5cc4fa08d43403 Mon Sep 17 00:00:00 2001 From: rlskoeser Date: Mon, 8 Apr 2024 16:36:22 -0400 Subject: [PATCH 63/71] Improve behavior for digwork admin source link #427 --- ppa/archive/admin.py | 15 +++++++++++++-- ppa/archive/tests/test_admin.py | 15 +++++++++++++++ 2 files changed, 28 insertions(+), 2 deletions(-) diff --git a/ppa/archive/admin.py b/ppa/archive/admin.py index e9266364..346cdb60 100644 --- a/ppa/archive/admin.py +++ b/ppa/archive/admin.py @@ -16,6 +16,7 @@ ProtectedWorkFieldFlags, ) from ppa.archive.views import ImportView +from ppa.archive.templatetags.ppa_tags import hathi_page_url, gale_page_url # import/export resource @@ -175,9 +176,19 @@ def list_collections(self, obj): list_collections.short_description = "Collections" def source_link(self, obj): - """Link to source record""" + """source id as an html link to source record, when source url is available""" + if not obj.source_url: + return obj.source_id + + source_url = obj.source_url + # hathi/gale excerpt links should include first page + if obj.pages_digital: + if obj.source == DigitizedWork.HATHI: + source_url = hathi_page_url(obj.source_url, obj.first_page_digital()) + if obj.source == DigitizedWork.GALE: + source_url = gale_page_url(obj.source_url, obj.first_page_digital()) return mark_safe( - '%s' % (obj.source_url, obj.source_id) + '%s' % (source_url, obj.source_id) ) source_link.short_description = "Source id" diff --git a/ppa/archive/tests/test_admin.py b/ppa/archive/tests/test_admin.py index d3d6c067..7948a1e6 100644 --- a/ppa/archive/tests/test_admin.py +++ b/ppa/archive/tests/test_admin.py @@ -51,6 +51,21 @@ def test_source_link(self): assert ( snippet == 'njp.32101013082597' % fake_url ) + # excerpt with digital page + digwork.pages_digital = "22-30" + # - HathiTrust + digwork.source = DigitizedWork.HATHI + snippet = digadmin.source_link(digwork) + assert digwork.source_id in snippet + assert "seq=22" in snippet + # Gale + digwork.source = DigitizedWork.GALE + snippet = digadmin.source_link(digwork) + assert "&pg=22" in snippet + + # no url - id only, no link + digwork.source_url = "" + assert digadmin.source_link(digwork) == digwork.source_id def test_readonly_fields(self): site = AdminSite() From 83f6a63e1f91fdbff1ae0034e4e5ff00cd9ee86b Mon Sep 17 00:00:00 2001 From: rlskoeser Date: Mon, 8 Apr 2024 17:10:13 -0400 Subject: [PATCH 64/71] Add custom validation to ensure source id + first page orig is unique ref #555 --- ppa/archive/models.py | 26 ++++++++++++++++++++++++++ ppa/archive/tests/test_models.py | 19 +++++++++++++++++++ ppa/archive/views.py | 2 +- 3 files changed, 46 insertions(+), 1 deletion(-) diff --git 
a/ppa/archive/models.py b/ppa/archive/models.py index 958f14ec..65875433 100644 --- a/ppa/archive/models.py +++ b/ppa/archive/models.py @@ -310,6 +310,12 @@ def validate_page_range(value): ) +class DigitizedWorkQuerySet(models.QuerySet): + def by_first_page_orig(self, start_page): + "find records based on first page in original page range" + return self.filter(pages_orig__regex=f"^{start_page}([,-]|\b|$)") + + class DigitizedWork(ModelIndexable, TrackChangesModel): """ Record to manage digitized works included in PPA and store their basic @@ -488,6 +494,9 @@ class DigitizedWork(ModelIndexable, TrackChangesModel): blank=True, ) + # use custom queryset + objects = DigitizedWorkQuerySet.as_manager() + class Meta: ordering = ("sort_title",) # require unique combination of source id + page range, @@ -654,6 +663,23 @@ def clean(self): "Changing source ID for HathiTrust records is not supported" ) + # if original page range is set, check that first page is unique + if self.pages_orig: + first_page = self.first_page_original() + # check for other excerpts in this work with the same first page + other_excerpts = DigitizedWork.objects.filter( + source_id=self.source_id + ).by_first_page_orig(first_page) + # if this record has already been saved, exclude it when checking + if self.pk: + other_excerpts.exclude(pk=self.pk) + if other_excerpts.exists(): + raise ValidationError( + { + "pages_orig": f"First page {first_page} is not unique for this source", + } + ) + def compare_protected_fields(self, db_obj): """Compare protected fields in a :class:`ppa.archive.models.DigitizedWork` instance and return those diff --git a/ppa/archive/tests/test_models.py b/ppa/archive/tests/test_models.py index 0c1d3d78..b6eef6b3 100644 --- a/ppa/archive/tests/test_models.py +++ b/ppa/archive/tests/test_models.py @@ -803,6 +803,25 @@ def test_clean(self): work.source = DigitizedWork.OTHER work.clean() + def test_clean_unique_first_page(self): + DigitizedWork.objects.create( + source_id="chi.79279237", pages_orig="233-244", pages_digital="200-210" + ) + # first original page matches even though range is distinct; unsaved + work2 = DigitizedWork(source_id="chi.79279237", pages_orig="233-240") + with pytest.raises( + ValidationError, match="First page 233 is not unique for this source" + ): + work2.clean() + + # test updating existing record; same error + work2 = DigitizedWork.objects.create(source_id="chi.79279237", pages_orig="232") + work2.pages_orig = "233-235" + with pytest.raises( + ValidationError, match="First page 233 is not unique for this source" + ): + work2.clean() + def test_clean_fields(self): work = DigitizedWork( source_id="chi.79279237", diff --git a/ppa/archive/views.py b/ppa/archive/views.py index 29c64668..a063070a 100644 --- a/ppa/archive/views.py +++ b/ppa/archive/views.py @@ -299,7 +299,7 @@ def get_queryset(self): start_page = self.kwargs.get("start_page") # if start page is specified, filter to get the correct excerpt if start_page: - qs = source_qs.filter(pages_orig__regex=f"^{start_page}([,-]|\b|$)") + qs = source_qs.by_first_page_orig(start_page) # if start page is NOT specified, ensure we do not retrieve an excerpt else: qs = source_qs.filter(pages_orig__exact="") From 14294af831d0d4f9af6db64831dbabc3475ed138 Mon Sep 17 00:00:00 2001 From: rlskoeser Date: Mon, 8 Apr 2024 17:18:01 -0400 Subject: [PATCH 65/71] Use mock to skip trying to index pages when testing clean logic --- ppa/archive/tests/test_models.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git 
a/ppa/archive/tests/test_models.py b/ppa/archive/tests/test_models.py index b6eef6b3..7f8f05b8 100644 --- a/ppa/archive/tests/test_models.py +++ b/ppa/archive/tests/test_models.py @@ -803,7 +803,8 @@ def test_clean(self): work.source = DigitizedWork.OTHER work.clean() - def test_clean_unique_first_page(self): + @patch("ppa.archive.models.DigitizedWork.index_items") + def test_clean_unique_first_page(self, mock_index_items): DigitizedWork.objects.create( source_id="chi.79279237", pages_orig="233-244", pages_digital="200-210" ) From f96c3d81ea8f6d14a905f149b84555ce457e13af Mon Sep 17 00:00:00 2001 From: rlskoeser Date: Tue, 9 Apr 2024 10:09:11 -0400 Subject: [PATCH 66/71] Correct hathi page link generation for excerpts in admin #427 --- ppa/archive/admin.py | 4 +++- ppa/archive/tests/test_admin.py | 2 ++ 2 files changed, 5 insertions(+), 1 deletion(-) diff --git a/ppa/archive/admin.py b/ppa/archive/admin.py index 346cdb60..c9a1dc18 100644 --- a/ppa/archive/admin.py +++ b/ppa/archive/admin.py @@ -184,8 +184,10 @@ def source_link(self, obj): # hathi/gale excerpt links should include first page if obj.pages_digital: if obj.source == DigitizedWork.HATHI: - source_url = hathi_page_url(obj.source_url, obj.first_page_digital()) + # hathi page url method requires source id + source_url = hathi_page_url(obj.source_id, obj.first_page_digital()) if obj.source == DigitizedWork.GALE: + # gale page url method requires source url source_url = gale_page_url(obj.source_url, obj.first_page_digital()) return mark_safe( '%s' % (source_url, obj.source_id) diff --git a/ppa/archive/tests/test_admin.py b/ppa/archive/tests/test_admin.py index 7948a1e6..447621e8 100644 --- a/ppa/archive/tests/test_admin.py +++ b/ppa/archive/tests/test_admin.py @@ -58,6 +58,8 @@ def test_source_link(self): snippet = digadmin.source_link(digwork) assert digwork.source_id in snippet assert "seq=22" in snippet + # hathi url is based on source id, not source url + assert digwork.source_url not in snippet # Gale digwork.source = DigitizedWork.GALE snippet = digadmin.source_link(digwork) From 0c94b90914da2f9fd0726965ef29cf1cb9489d06 Mon Sep 17 00:00:00 2001 From: rlskoeser Date: Tue, 9 Apr 2024 10:16:02 -0400 Subject: [PATCH 67/71] Update hathi page url to use normal url params syntax Older format with semicolons now redirects to standard format; correction identified by @mnaydan --- ppa/archive/templatetags/ppa_tags.py | 4 +++- ppa/archive/tests/test_templatetags.py | 2 +- 2 files changed, 4 insertions(+), 2 deletions(-) diff --git a/ppa/archive/templatetags/ppa_tags.py b/ppa/archive/templatetags/ppa_tags.py index 26ff8766..b9edc0df 100644 --- a/ppa/archive/templatetags/ppa_tags.py +++ b/ppa/archive/templatetags/ppa_tags.py @@ -71,7 +71,9 @@ def hathi_page_url(item_id, order): {% page_url item_id page.order %} """ - return "{}/pt?id={};view=1up;seq={}".format(HATHI_BASE_URL, item_id, order) + return mark_safe( + "{}/pt?id={}&view=1up&seq={}".format(HATHI_BASE_URL, item_id, order) + ) @register.simple_tag diff --git a/ppa/archive/tests/test_templatetags.py b/ppa/archive/tests/test_templatetags.py index eef33b1d..90073e0f 100644 --- a/ppa/archive/tests/test_templatetags.py +++ b/ppa/archive/tests/test_templatetags.py @@ -73,7 +73,7 @@ def test_hathi_page_url(): order = 50 hathi_url = hathi_page_url(item_id, order) assert hathi_url.startswith("%s/pt" % HATHI_BASE_URL) - assert hathi_url.endswith("?id=%s;view=1up;seq=%s" % (item_id, order)) + assert hathi_url.endswith("?id=%s&view=1up&seq=%s" % (item_id, order)) def test_gale_page_url(): From 
ff23f31e45abb43acd3cc8ebef1d152eca04d286 Mon Sep 17 00:00:00 2001 From: rlskoeser Date: Tue, 9 Apr 2024 10:19:47 -0400 Subject: [PATCH 68/71] Exclude current object when validating first page unique for source ref #555 --- ppa/archive/models.py | 2 +- ppa/archive/tests/test_models.py | 7 ++++++- 2 files changed, 7 insertions(+), 2 deletions(-) diff --git a/ppa/archive/models.py b/ppa/archive/models.py index 65875433..ba3ba8a6 100644 --- a/ppa/archive/models.py +++ b/ppa/archive/models.py @@ -672,7 +672,7 @@ def clean(self): ).by_first_page_orig(first_page) # if this record has already been saved, exclude it when checking if self.pk: - other_excerpts.exclude(pk=self.pk) + other_excerpts = other_excerpts.exclude(pk=self.pk) if other_excerpts.exists(): raise ValidationError( { diff --git a/ppa/archive/tests/test_models.py b/ppa/archive/tests/test_models.py index 7f8f05b8..3141ab3a 100644 --- a/ppa/archive/tests/test_models.py +++ b/ppa/archive/tests/test_models.py @@ -805,9 +805,14 @@ def test_clean(self): @patch("ppa.archive.models.DigitizedWork.index_items") def test_clean_unique_first_page(self, mock_index_items): - DigitizedWork.objects.create( + digwork = DigitizedWork.objects.create( source_id="chi.79279237", pages_orig="233-244", pages_digital="200-210" ) + # save with unrelated change; should not trigger validation error + digwork.pages_digital = "201-210" + digwork.save() + digwork.clean() + # first original page matches even though range is distinct; unsaved work2 = DigitizedWork(source_id="chi.79279237", pages_orig="233-240") with pytest.raises( From 3264dc7fcf8d006a47ea2cc2dfa7d62f7633e788 Mon Sep 17 00:00:00 2001 From: rlskoeser Date: Thu, 11 Apr 2024 16:22:55 -0400 Subject: [PATCH 69/71] Set version to 3.12 and document changes --- CHANGELOG.rst | 15 +++++++++++++++ ppa/__init__.py | 2 +- 2 files changed, 16 insertions(+), 1 deletion(-) diff --git a/CHANGELOG.rst b/CHANGELOG.rst index f7ccb237..e25bbcbb 100644 --- a/CHANGELOG.rst +++ b/CHANGELOG.rst @@ -3,6 +3,21 @@ CHANGELOG ========= +3.12 +---- +- As an admin, I want the Source ID link in list view to go to the first page of the excerpt for articles and excerpts, so that I can more easily access excerpt content. +- As a developer, I want a script to do a one-time bulk fix of HathiTrust excerpt page ranges from a spreadsheet so that we can pull the correct content from updated HathiTrust materials. +- As a developer, I want a script to update all HathiTrust content so that I can refresh locally cached data with OCR improvements and other changes. +- bugfix: excerpt work ID is now based on sourceID + original page range + rather than digital page range +- bugfix: fix indexing and page count for new excerpts when there are multiple excerpts from a single source +- bugfix: improved index_pages script error handling for missing page count + in database when running in expedited mode +- new manage command to report on possible HathiTrust excerpt page range mismatches based on page labels in METS-ALTO +- utility script to get volume last modification date from public HathiTrust website +- updated settings to use django-split-settings +- address deprecation warnings and suppress warnings for dependencies + 3.11.4 ------ diff --git a/ppa/__init__.py b/ppa/__init__.py index 2078fb24..04ff3c86 100644 --- a/ppa/__init__.py +++ b/ppa/__init__.py @@ -1,4 +1,4 @@ -__version_info__ = (3, 12, 0, "dev") +__version_info__ = (3, 12, 0, None) # Dot-connect all but the last. Last is dash-connected if not None.
From 81f05df7abed563e76ee206b88ac9c689afe4d8a Mon Sep 17 00:00:00 2001 From: rlskoeser Date: Thu, 11 Apr 2024 17:37:04 -0400 Subject: [PATCH 70/71] Update npm packages via npm audit fix --- package-lock.json | 361 ++++++++++++++++++++++++++++++++-------------- 1 file changed, 251 insertions(+), 110 deletions(-) diff --git a/package-lock.json b/package-lock.json index 93e7b60f..ef828af4 100644 --- a/package-lock.json +++ b/package-lock.json @@ -4918,21 +4918,21 @@ } }, "node_modules/body-parser": { - "version": "1.20.0", - "resolved": "https://registry.npmjs.org/body-parser/-/body-parser-1.20.0.tgz", - "integrity": "sha512-DfJ+q6EPcGKZD1QWUjSpqp+Q7bDQTsQIF4zfUAtZ6qk+H/3/QRhg9CEp39ss+/T2vw0+HaidC0ecJj/DRLIaKg==", + "version": "1.20.2", + "resolved": "https://registry.npmjs.org/body-parser/-/body-parser-1.20.2.tgz", + "integrity": "sha512-ml9pReCu3M61kGlqoTm2umSXTlRTuGTx0bfYj+uIUKKYycG5NtSbeetV3faSU6R7ajOPw0g/J1PvK4qNy7s5bA==", "dev": true, "dependencies": { "bytes": "3.1.2", - "content-type": "~1.0.4", + "content-type": "~1.0.5", "debug": "2.6.9", "depd": "2.0.0", "destroy": "1.2.0", "http-errors": "2.0.0", "iconv-lite": "0.4.24", "on-finished": "2.4.1", - "qs": "6.10.3", - "raw-body": "2.5.1", + "qs": "6.11.0", + "raw-body": "2.5.2", "type-is": "~1.6.18", "unpipe": "1.0.0" }, @@ -4963,9 +4963,9 @@ } }, "node_modules/body-parser/node_modules/qs": { - "version": "6.10.3", - "resolved": "https://registry.npmjs.org/qs/-/qs-6.10.3.tgz", - "integrity": "sha512-wr7M2E0OFRfIfJZjKGieI8lBKb7fRCH4Fv5KNPEs7gJ8jadvotdsS08PzOKR7opXhZ/Xkjtt3WF9g38drmyRqQ==", + "version": "6.11.0", + "resolved": "https://registry.npmjs.org/qs/-/qs-6.11.0.tgz", + "integrity": "sha512-MvjoMCJwEarSbUYk5O+nmoSzSutSsTwF85zcHPQ9OrlFoZOYIjaqBAJIqIXjptyD5vThxGq52Xu/MaJzRkIk4Q==", "dev": true, "dependencies": { "side-channel": "^1.0.4" @@ -5158,12 +5158,18 @@ } }, "node_modules/call-bind": { - "version": "1.0.2", - "resolved": "https://registry.npmjs.org/call-bind/-/call-bind-1.0.2.tgz", - "integrity": "sha512-7O+FbCihrB5WGbFYesctwmTKae6rOiIzmz1icreWJ+0aA7LJfuqhEso2T9ncpcFtzMQtzXf2QGGueWJGTYsqrA==", + "version": "1.0.7", + "resolved": "https://registry.npmjs.org/call-bind/-/call-bind-1.0.7.tgz", + "integrity": "sha512-GHTSNSYICQ7scH7sZ+M2rFopRoLh8t2bLSW6BbgrtLsahOIB5iyAVJf9GjWK3cYTDaMj4XdBpM1cA6pIS0Kv2w==", "dependencies": { - "function-bind": "^1.1.1", - "get-intrinsic": "^1.0.2" + "es-define-property": "^1.0.0", + "es-errors": "^1.3.0", + "function-bind": "^1.1.2", + "get-intrinsic": "^1.2.4", + "set-function-length": "^1.2.1" + }, + "engines": { + "node": ">= 0.4" }, "funding": { "url": "https://github.com/sponsors/ljharb" @@ -5890,9 +5896,9 @@ ] }, "node_modules/content-type": { - "version": "1.0.4", - "resolved": "https://registry.npmjs.org/content-type/-/content-type-1.0.4.tgz", - "integrity": "sha512-hIP3EEPs8tB9AT1L+NUqtwOAps4mk2Zob89MWXMHjHWg9milF/j4osnnQLXBCBFBk/tvIG/tUc9mOUJiPBhPXA==", + "version": "1.0.5", + "resolved": "https://registry.npmjs.org/content-type/-/content-type-1.0.5.tgz", + "integrity": "sha512-nTjqfcBFEipKdXCv4YDQWCfmcLZKm81ldF0pAopTvyrFGVbcR6P/VAAd5G7N+0tTr8QqiU0tFadD6FK4NtJwOA==", "dev": true, "engines": { "node": ">= 0.6" @@ -6756,6 +6762,22 @@ "node": ">=0.8" } }, + "node_modules/define-data-property": { + "version": "1.1.4", + "resolved": "https://registry.npmjs.org/define-data-property/-/define-data-property-1.1.4.tgz", + "integrity": "sha512-rBMvIzlpA8v6E+SJZoo++HAYqsLrkg7MSfIinMPFhmkorw7X+dOXVJQs+QT69zGkzMyfDnIMN2Wid1+NbL3T+A==", + "dependencies": { + "es-define-property": "^1.0.0", + 
"es-errors": "^1.3.0", + "gopd": "^1.0.1" + }, + "engines": { + "node": ">= 0.4" + }, + "funding": { + "url": "https://github.com/sponsors/ljharb" + } + }, "node_modules/define-lazy-prop": { "version": "2.0.0", "resolved": "https://registry.npmjs.org/define-lazy-prop/-/define-lazy-prop-2.0.0.tgz", @@ -7375,6 +7397,25 @@ "is-arrayish": "^0.2.1" } }, + "node_modules/es-define-property": { + "version": "1.0.0", + "resolved": "https://registry.npmjs.org/es-define-property/-/es-define-property-1.0.0.tgz", + "integrity": "sha512-jxayLKShrEqqzJ0eumQbVhTYQM27CfT1T35+gCgDFoL82JLsXqTJ76zv6A0YLOgEnLUMvLzsDsGIrl8NFpT2gQ==", + "dependencies": { + "get-intrinsic": "^1.2.4" + }, + "engines": { + "node": ">= 0.4" + } + }, + "node_modules/es-errors": { + "version": "1.3.0", + "resolved": "https://registry.npmjs.org/es-errors/-/es-errors-1.3.0.tgz", + "integrity": "sha512-Zf5H2Kxt2xjTvbJvP2ZWLEICxA6j+hAmMzIlypy4xcBg1vKVnx89Wy0GbS+kf5cwCVFFzdCFh2XSCFNULS6csw==", + "engines": { + "node": ">= 0.4" + } + }, "node_modules/es-module-lexer": { "version": "1.3.0", "resolved": "https://registry.npmjs.org/es-module-lexer/-/es-module-lexer-1.3.0.tgz", @@ -7712,17 +7753,17 @@ } }, "node_modules/express": { - "version": "4.18.1", - "resolved": "https://registry.npmjs.org/express/-/express-4.18.1.tgz", - "integrity": "sha512-zZBcOX9TfehHQhtupq57OF8lFZ3UZi08Y97dwFCkD8p9d/d2Y3M+ykKcwaMDEL+4qyUolgBDX6AblpR3fL212Q==", + "version": "4.19.2", + "resolved": "https://registry.npmjs.org/express/-/express-4.19.2.tgz", + "integrity": "sha512-5T6nhjsT+EOMzuck8JjBHARTHfMht0POzlA60WV2pMD3gyXw2LZnZ+ueGdNxG+0calOJcWKbpFcuzLZ91YWq9Q==", "dev": true, "dependencies": { "accepts": "~1.3.8", "array-flatten": "1.1.1", - "body-parser": "1.20.0", + "body-parser": "1.20.2", "content-disposition": "0.5.4", "content-type": "~1.0.4", - "cookie": "0.5.0", + "cookie": "0.6.0", "cookie-signature": "1.0.6", "debug": "2.6.9", "depd": "2.0.0", @@ -7738,7 +7779,7 @@ "parseurl": "~1.3.3", "path-to-regexp": "0.1.7", "proxy-addr": "~2.0.7", - "qs": "6.10.3", + "qs": "6.11.0", "range-parser": "~1.2.1", "safe-buffer": "5.2.1", "send": "0.18.0", @@ -7760,9 +7801,9 @@ "dev": true }, "node_modules/express/node_modules/cookie": { - "version": "0.5.0", - "resolved": "https://registry.npmjs.org/cookie/-/cookie-0.5.0.tgz", - "integrity": "sha512-YZ3GUyn/o8gfKJlnlX7g7xq4gyO6OSuhGPKaaGssGB2qgDUS0gPgtTvoyZLTt9Ab6dC4hfc9dV5arkvc/OCmrw==", + "version": "0.6.0", + "resolved": "https://registry.npmjs.org/cookie/-/cookie-0.6.0.tgz", + "integrity": "sha512-U71cyTamuh1CRNCfpGY6to28lxvNwPG4Guz/EVjgf3Jmzv0vlDp1atT9eS5dDjMYHucpHbWns6Lwf3BKz6svdw==", "dev": true, "engines": { "node": ">= 0.6" @@ -7796,9 +7837,9 @@ } }, "node_modules/express/node_modules/qs": { - "version": "6.10.3", - "resolved": "https://registry.npmjs.org/qs/-/qs-6.10.3.tgz", - "integrity": "sha512-wr7M2E0OFRfIfJZjKGieI8lBKb7fRCH4Fv5KNPEs7gJ8jadvotdsS08PzOKR7opXhZ/Xkjtt3WF9g38drmyRqQ==", + "version": "6.11.0", + "resolved": "https://registry.npmjs.org/qs/-/qs-6.11.0.tgz", + "integrity": "sha512-MvjoMCJwEarSbUYk5O+nmoSzSutSsTwF85zcHPQ9OrlFoZOYIjaqBAJIqIXjptyD5vThxGq52Xu/MaJzRkIk4Q==", "dev": true, "dependencies": { "side-channel": "^1.0.4" @@ -8657,13 +8698,18 @@ } }, "node_modules/get-intrinsic": { - "version": "1.1.2", - "resolved": "https://registry.npmjs.org/get-intrinsic/-/get-intrinsic-1.1.2.tgz", - "integrity": "sha512-Jfm3OyCxHh9DJyc28qGk+JmfkpO41A4XkneDSujN9MDXrm4oDKdHvndhZ2dN94+ERNfkYJWDclW6k2L/ZGHjXA==", + "version": "1.2.4", + "resolved": 
"https://registry.npmjs.org/get-intrinsic/-/get-intrinsic-1.2.4.tgz", + "integrity": "sha512-5uYhsJH8VJBTv7oslg4BznJYhDoRI6waYCxMmCdnTrcCrHA/fCFKoTFz2JKKE0HdDFUF7/oQuhzumXJK7paBRQ==", "dependencies": { - "function-bind": "^1.1.1", - "has": "^1.0.3", - "has-symbols": "^1.0.3" + "es-errors": "^1.3.0", + "function-bind": "^1.1.2", + "has-proto": "^1.0.1", + "has-symbols": "^1.0.3", + "hasown": "^2.0.0" + }, + "engines": { + "node": ">= 0.4" }, "funding": { "url": "https://github.com/sponsors/ljharb" @@ -8859,6 +8905,17 @@ "node": ">= 0.10" } }, + "node_modules/gopd": { + "version": "1.0.1", + "resolved": "https://registry.npmjs.org/gopd/-/gopd-1.0.1.tgz", + "integrity": "sha512-d65bNlIadxvpb/A2abVdlqKqV563juRnZ1Wtk6s1sIR8uNsXR70xqIzVqxVf1eTqDunwT2MkczEeaezCKTZhwA==", + "dependencies": { + "get-intrinsic": "^1.1.3" + }, + "funding": { + "url": "https://github.com/sponsors/ljharb" + } + }, "node_modules/graceful-fs": { "version": "4.2.10", "resolved": "https://registry.npmjs.org/graceful-fs/-/graceful-fs-4.2.10.tgz", @@ -9878,11 +9935,22 @@ } }, "node_modules/has-property-descriptors": { - "version": "1.0.0", - "resolved": "https://registry.npmjs.org/has-property-descriptors/-/has-property-descriptors-1.0.0.tgz", - "integrity": "sha512-62DVLZGoiEBDHQyqG4w9xCuZ7eJEwNmJRWw2VY84Oedb7WFcA27fiEVe8oUQx9hAUJ4ekurquucTGwsyO1XGdQ==", + "version": "1.0.2", + "resolved": "https://registry.npmjs.org/has-property-descriptors/-/has-property-descriptors-1.0.2.tgz", + "integrity": "sha512-55JNKuIW+vq4Ke1BjOTjM2YctQIvCT7GFzHwmfZPGo5wnrgkid0YQtnAleFSqumZm4az3n2BS+erby5ipJdgrg==", "dependencies": { - "get-intrinsic": "^1.1.1" + "es-define-property": "^1.0.0" + }, + "funding": { + "url": "https://github.com/sponsors/ljharb" + } + }, + "node_modules/has-proto": { + "version": "1.0.3", + "resolved": "https://registry.npmjs.org/has-proto/-/has-proto-1.0.3.tgz", + "integrity": "sha512-SJ1amZAJUiZS+PhsVLf5tGydlaVB8EdFpaSO4gmiUKUOxk8qzn5AIy4ZeJUmh22znIdk/uMAUT2pl3FxzVUH+Q==", + "engines": { + "node": ">= 0.4" }, "funding": { "url": "https://github.com/sponsors/ljharb" @@ -9939,7 +10007,6 @@ "version": "2.0.0", "resolved": "https://registry.npmjs.org/hasown/-/hasown-2.0.0.tgz", "integrity": "sha512-vUptKVTpIJhcczKBbgnS+RtcuYMB8+oNzPK2/Hp3hanz8JmpATdmmgLgSaadVREkDm+e2giHwY3ZRkyjSIDDFA==", - "dev": true, "dependencies": { "function-bind": "^1.1.2" }, @@ -15647,9 +15714,9 @@ } }, "node_modules/object-inspect": { - "version": "1.12.2", - "resolved": "https://registry.npmjs.org/object-inspect/-/object-inspect-1.12.2.tgz", - "integrity": "sha512-z+cPxW0QGUp0mcqcsgQyLVRDoXFQbXOwBaqyF7VIgI4TWNQsDHrBpUQslRmIfAoYWdYzs6UlKJtB2XJpTaNSpQ==", + "version": "1.13.1", + "resolved": "https://registry.npmjs.org/object-inspect/-/object-inspect-1.13.1.tgz", + "integrity": "sha512-5qoj1RUiKOMsCCNLV1CBiPYE10sziTsnmNxkAI/rZhiD63CF7IqdFGC/XzjWjpSgLf0LxXX3bDFIh0E18f6UhQ==", "dev": true, "funding": { "url": "https://github.com/sponsors/ljharb" @@ -17412,9 +17479,9 @@ } }, "node_modules/raw-body": { - "version": "2.5.1", - "resolved": "https://registry.npmjs.org/raw-body/-/raw-body-2.5.1.tgz", - "integrity": "sha512-qqJBtEyVgS0ZmPGdCFPWJ3FreoqvG4MVQln/kCgF7Olq95IbOp0/BWyMwbdtn4VTvkM8Y7khCQ2Xgk/tcrCXig==", + "version": "2.5.2", + "resolved": "https://registry.npmjs.org/raw-body/-/raw-body-2.5.2.tgz", + "integrity": "sha512-8zGqypfENjCIqGhgXToC8aB2r7YrBX+AQAfIPs/Mlk+BtPTztOvTS01NRW/3Eh60J+a48lt8qsCzirQ6loCVfA==", "dev": true, "dependencies": { "bytes": "3.1.2", @@ -18454,6 +18521,22 @@ "resolved": 
"https://registry.npmjs.org/set-blocking/-/set-blocking-2.0.0.tgz", "integrity": "sha512-KiKBS8AnWGEyLzofFfmvKwpdPzqiy16LvQfK3yv/fVH7Bj13/wl3JSR1J+rfgRE9q7xUJK4qvgS8raSOeLUehw==" }, + "node_modules/set-function-length": { + "version": "1.2.2", + "resolved": "https://registry.npmjs.org/set-function-length/-/set-function-length-1.2.2.tgz", + "integrity": "sha512-pgRc4hJ4/sNjWCSS9AmnS40x3bNMDTknHgL5UaMBTMyJnU90EgWh1Rz+MC9eFu4BuN/UwZjKQuY/1v3rM7HMfg==", + "dependencies": { + "define-data-property": "^1.1.4", + "es-errors": "^1.3.0", + "function-bind": "^1.1.2", + "get-intrinsic": "^1.2.4", + "gopd": "^1.0.1", + "has-property-descriptors": "^1.0.2" + }, + "engines": { + "node": ">= 0.4" + } + }, "node_modules/set-value": { "version": "2.0.1", "resolved": "https://registry.npmjs.org/set-value/-/set-value-2.0.1.tgz", @@ -18536,14 +18619,18 @@ } }, "node_modules/side-channel": { - "version": "1.0.4", - "resolved": "https://registry.npmjs.org/side-channel/-/side-channel-1.0.4.tgz", - "integrity": "sha512-q5XPytqFEIKHkGdiMIrY10mvLRvnQh42/+GoBlFW3b2LXLE2xxJpZFdm94we0BaoV3RwJyGqg5wS7epxTv0Zvw==", + "version": "1.0.6", + "resolved": "https://registry.npmjs.org/side-channel/-/side-channel-1.0.6.tgz", + "integrity": "sha512-fDW/EZ6Q9RiO8eFG8Hj+7u/oW+XrPTIChwCOM2+th2A6OblDtYYIpve9m+KvI9Z4C9qSEXlaGR6bTEYHReuglA==", "dev": true, "dependencies": { - "call-bind": "^1.0.0", - "get-intrinsic": "^1.0.2", - "object-inspect": "^1.9.0" + "call-bind": "^1.0.7", + "es-errors": "^1.3.0", + "get-intrinsic": "^1.2.4", + "object-inspect": "^1.13.1" + }, + "engines": { + "node": ">= 0.4" }, "funding": { "url": "https://github.com/sponsors/ljharb" @@ -20728,9 +20815,9 @@ } }, "node_modules/webpack-dev-middleware": { - "version": "5.3.3", - "resolved": "https://registry.npmjs.org/webpack-dev-middleware/-/webpack-dev-middleware-5.3.3.tgz", - "integrity": "sha512-hj5CYrY0bZLB+eTO+x/j67Pkrquiy7kWepMHmUMoPsmcUaeEnQJqFzHJOyxgWlq746/wUuA64p9ta34Kyb01pA==", + "version": "5.3.4", + "resolved": "https://registry.npmjs.org/webpack-dev-middleware/-/webpack-dev-middleware-5.3.4.tgz", + "integrity": "sha512-BVdTqhhs+0IfoeAf7EoH5WE+exCmqGerHfDM0IL096Px60Tq2Mn9MAbnaGUe6HiMa41KMCYF19gyzZmBcq/o4Q==", "dev": true, "dependencies": { "colorette": "^2.0.10", @@ -25079,21 +25166,21 @@ } }, "body-parser": { - "version": "1.20.0", - "resolved": "https://registry.npmjs.org/body-parser/-/body-parser-1.20.0.tgz", - "integrity": "sha512-DfJ+q6EPcGKZD1QWUjSpqp+Q7bDQTsQIF4zfUAtZ6qk+H/3/QRhg9CEp39ss+/T2vw0+HaidC0ecJj/DRLIaKg==", + "version": "1.20.2", + "resolved": "https://registry.npmjs.org/body-parser/-/body-parser-1.20.2.tgz", + "integrity": "sha512-ml9pReCu3M61kGlqoTm2umSXTlRTuGTx0bfYj+uIUKKYycG5NtSbeetV3faSU6R7ajOPw0g/J1PvK4qNy7s5bA==", "dev": true, "requires": { "bytes": "3.1.2", - "content-type": "~1.0.4", + "content-type": "~1.0.5", "debug": "2.6.9", "depd": "2.0.0", "destroy": "1.2.0", "http-errors": "2.0.0", "iconv-lite": "0.4.24", "on-finished": "2.4.1", - "qs": "6.10.3", - "raw-body": "2.5.1", + "qs": "6.11.0", + "raw-body": "2.5.2", "type-is": "~1.6.18", "unpipe": "1.0.0" }, @@ -25114,9 +25201,9 @@ } }, "qs": { - "version": "6.10.3", - "resolved": "https://registry.npmjs.org/qs/-/qs-6.10.3.tgz", - "integrity": "sha512-wr7M2E0OFRfIfJZjKGieI8lBKb7fRCH4Fv5KNPEs7gJ8jadvotdsS08PzOKR7opXhZ/Xkjtt3WF9g38drmyRqQ==", + "version": "6.11.0", + "resolved": "https://registry.npmjs.org/qs/-/qs-6.11.0.tgz", + "integrity": "sha512-MvjoMCJwEarSbUYk5O+nmoSzSutSsTwF85zcHPQ9OrlFoZOYIjaqBAJIqIXjptyD5vThxGq52Xu/MaJzRkIk4Q==", "dev": true, "requires": { 
"side-channel": "^1.0.4" @@ -25252,12 +25339,15 @@ } }, "call-bind": { - "version": "1.0.2", - "resolved": "https://registry.npmjs.org/call-bind/-/call-bind-1.0.2.tgz", - "integrity": "sha512-7O+FbCihrB5WGbFYesctwmTKae6rOiIzmz1icreWJ+0aA7LJfuqhEso2T9ncpcFtzMQtzXf2QGGueWJGTYsqrA==", + "version": "1.0.7", + "resolved": "https://registry.npmjs.org/call-bind/-/call-bind-1.0.7.tgz", + "integrity": "sha512-GHTSNSYICQ7scH7sZ+M2rFopRoLh8t2bLSW6BbgrtLsahOIB5iyAVJf9GjWK3cYTDaMj4XdBpM1cA6pIS0Kv2w==", "requires": { - "function-bind": "^1.1.1", - "get-intrinsic": "^1.0.2" + "es-define-property": "^1.0.0", + "es-errors": "^1.3.0", + "function-bind": "^1.1.2", + "get-intrinsic": "^1.2.4", + "set-function-length": "^1.2.1" } }, "callsites": { @@ -25806,9 +25896,9 @@ } }, "content-type": { - "version": "1.0.4", - "resolved": "https://registry.npmjs.org/content-type/-/content-type-1.0.4.tgz", - "integrity": "sha512-hIP3EEPs8tB9AT1L+NUqtwOAps4mk2Zob89MWXMHjHWg9milF/j4osnnQLXBCBFBk/tvIG/tUc9mOUJiPBhPXA==", + "version": "1.0.5", + "resolved": "https://registry.npmjs.org/content-type/-/content-type-1.0.5.tgz", + "integrity": "sha512-nTjqfcBFEipKdXCv4YDQWCfmcLZKm81ldF0pAopTvyrFGVbcR6P/VAAd5G7N+0tTr8QqiU0tFadD6FK4NtJwOA==", "dev": true }, "convert-source-map": { @@ -26425,6 +26515,16 @@ } } }, + "define-data-property": { + "version": "1.1.4", + "resolved": "https://registry.npmjs.org/define-data-property/-/define-data-property-1.1.4.tgz", + "integrity": "sha512-rBMvIzlpA8v6E+SJZoo++HAYqsLrkg7MSfIinMPFhmkorw7X+dOXVJQs+QT69zGkzMyfDnIMN2Wid1+NbL3T+A==", + "requires": { + "es-define-property": "^1.0.0", + "es-errors": "^1.3.0", + "gopd": "^1.0.1" + } + }, "define-lazy-prop": { "version": "2.0.0", "resolved": "https://registry.npmjs.org/define-lazy-prop/-/define-lazy-prop-2.0.0.tgz", @@ -26893,6 +26993,19 @@ "is-arrayish": "^0.2.1" } }, + "es-define-property": { + "version": "1.0.0", + "resolved": "https://registry.npmjs.org/es-define-property/-/es-define-property-1.0.0.tgz", + "integrity": "sha512-jxayLKShrEqqzJ0eumQbVhTYQM27CfT1T35+gCgDFoL82JLsXqTJ76zv6A0YLOgEnLUMvLzsDsGIrl8NFpT2gQ==", + "requires": { + "get-intrinsic": "^1.2.4" + } + }, + "es-errors": { + "version": "1.3.0", + "resolved": "https://registry.npmjs.org/es-errors/-/es-errors-1.3.0.tgz", + "integrity": "sha512-Zf5H2Kxt2xjTvbJvP2ZWLEICxA6j+hAmMzIlypy4xcBg1vKVnx89Wy0GbS+kf5cwCVFFzdCFh2XSCFNULS6csw==" + }, "es-module-lexer": { "version": "1.3.0", "resolved": "https://registry.npmjs.org/es-module-lexer/-/es-module-lexer-1.3.0.tgz", @@ -27160,17 +27273,17 @@ } }, "express": { - "version": "4.18.1", - "resolved": "https://registry.npmjs.org/express/-/express-4.18.1.tgz", - "integrity": "sha512-zZBcOX9TfehHQhtupq57OF8lFZ3UZi08Y97dwFCkD8p9d/d2Y3M+ykKcwaMDEL+4qyUolgBDX6AblpR3fL212Q==", + "version": "4.19.2", + "resolved": "https://registry.npmjs.org/express/-/express-4.19.2.tgz", + "integrity": "sha512-5T6nhjsT+EOMzuck8JjBHARTHfMht0POzlA60WV2pMD3gyXw2LZnZ+ueGdNxG+0calOJcWKbpFcuzLZ91YWq9Q==", "dev": true, "requires": { "accepts": "~1.3.8", "array-flatten": "1.1.1", - "body-parser": "1.20.0", + "body-parser": "1.20.2", "content-disposition": "0.5.4", "content-type": "~1.0.4", - "cookie": "0.5.0", + "cookie": "0.6.0", "cookie-signature": "1.0.6", "debug": "2.6.9", "depd": "2.0.0", @@ -27186,7 +27299,7 @@ "parseurl": "~1.3.3", "path-to-regexp": "0.1.7", "proxy-addr": "~2.0.7", - "qs": "6.10.3", + "qs": "6.11.0", "range-parser": "~1.2.1", "safe-buffer": "5.2.1", "send": "0.18.0", @@ -27205,9 +27318,9 @@ "dev": true }, "cookie": { - "version": "0.5.0", - "resolved": 
"https://registry.npmjs.org/cookie/-/cookie-0.5.0.tgz", - "integrity": "sha512-YZ3GUyn/o8gfKJlnlX7g7xq4gyO6OSuhGPKaaGssGB2qgDUS0gPgtTvoyZLTt9Ab6dC4hfc9dV5arkvc/OCmrw==", + "version": "0.6.0", + "resolved": "https://registry.npmjs.org/cookie/-/cookie-0.6.0.tgz", + "integrity": "sha512-U71cyTamuh1CRNCfpGY6to28lxvNwPG4Guz/EVjgf3Jmzv0vlDp1atT9eS5dDjMYHucpHbWns6Lwf3BKz6svdw==", "dev": true }, "depd": { @@ -27232,9 +27345,9 @@ } }, "qs": { - "version": "6.10.3", - "resolved": "https://registry.npmjs.org/qs/-/qs-6.10.3.tgz", - "integrity": "sha512-wr7M2E0OFRfIfJZjKGieI8lBKb7fRCH4Fv5KNPEs7gJ8jadvotdsS08PzOKR7opXhZ/Xkjtt3WF9g38drmyRqQ==", + "version": "6.11.0", + "resolved": "https://registry.npmjs.org/qs/-/qs-6.11.0.tgz", + "integrity": "sha512-MvjoMCJwEarSbUYk5O+nmoSzSutSsTwF85zcHPQ9OrlFoZOYIjaqBAJIqIXjptyD5vThxGq52Xu/MaJzRkIk4Q==", "dev": true, "requires": { "side-channel": "^1.0.4" @@ -27890,13 +28003,15 @@ } }, "get-intrinsic": { - "version": "1.1.2", - "resolved": "https://registry.npmjs.org/get-intrinsic/-/get-intrinsic-1.1.2.tgz", - "integrity": "sha512-Jfm3OyCxHh9DJyc28qGk+JmfkpO41A4XkneDSujN9MDXrm4oDKdHvndhZ2dN94+ERNfkYJWDclW6k2L/ZGHjXA==", + "version": "1.2.4", + "resolved": "https://registry.npmjs.org/get-intrinsic/-/get-intrinsic-1.2.4.tgz", + "integrity": "sha512-5uYhsJH8VJBTv7oslg4BznJYhDoRI6waYCxMmCdnTrcCrHA/fCFKoTFz2JKKE0HdDFUF7/oQuhzumXJK7paBRQ==", "requires": { - "function-bind": "^1.1.1", - "has": "^1.0.3", - "has-symbols": "^1.0.3" + "es-errors": "^1.3.0", + "function-bind": "^1.1.2", + "has-proto": "^1.0.1", + "has-symbols": "^1.0.3", + "hasown": "^2.0.0" } }, "get-package-type": { @@ -28048,6 +28163,14 @@ "sparkles": "^1.0.0" } }, + "gopd": { + "version": "1.0.1", + "resolved": "https://registry.npmjs.org/gopd/-/gopd-1.0.1.tgz", + "integrity": "sha512-d65bNlIadxvpb/A2abVdlqKqV563juRnZ1Wtk6s1sIR8uNsXR70xqIzVqxVf1eTqDunwT2MkczEeaezCKTZhwA==", + "requires": { + "get-intrinsic": "^1.1.3" + } + }, "graceful-fs": { "version": "4.2.10", "resolved": "https://registry.npmjs.org/graceful-fs/-/graceful-fs-4.2.10.tgz", @@ -28890,13 +29013,18 @@ } }, "has-property-descriptors": { - "version": "1.0.0", - "resolved": "https://registry.npmjs.org/has-property-descriptors/-/has-property-descriptors-1.0.0.tgz", - "integrity": "sha512-62DVLZGoiEBDHQyqG4w9xCuZ7eJEwNmJRWw2VY84Oedb7WFcA27fiEVe8oUQx9hAUJ4ekurquucTGwsyO1XGdQ==", + "version": "1.0.2", + "resolved": "https://registry.npmjs.org/has-property-descriptors/-/has-property-descriptors-1.0.2.tgz", + "integrity": "sha512-55JNKuIW+vq4Ke1BjOTjM2YctQIvCT7GFzHwmfZPGo5wnrgkid0YQtnAleFSqumZm4az3n2BS+erby5ipJdgrg==", "requires": { - "get-intrinsic": "^1.1.1" + "es-define-property": "^1.0.0" } }, + "has-proto": { + "version": "1.0.3", + "resolved": "https://registry.npmjs.org/has-proto/-/has-proto-1.0.3.tgz", + "integrity": "sha512-SJ1amZAJUiZS+PhsVLf5tGydlaVB8EdFpaSO4gmiUKUOxk8qzn5AIy4ZeJUmh22znIdk/uMAUT2pl3FxzVUH+Q==" + }, "has-symbols": { "version": "1.0.3", "resolved": "https://registry.npmjs.org/has-symbols/-/has-symbols-1.0.3.tgz", @@ -28935,7 +29063,6 @@ "version": "2.0.0", "resolved": "https://registry.npmjs.org/hasown/-/hasown-2.0.0.tgz", "integrity": "sha512-vUptKVTpIJhcczKBbgnS+RtcuYMB8+oNzPK2/Hp3hanz8JmpATdmmgLgSaadVREkDm+e2giHwY3ZRkyjSIDDFA==", - "dev": true, "requires": { "function-bind": "^1.1.2" } @@ -33200,9 +33327,9 @@ } }, "object-inspect": { - "version": "1.12.2", - "resolved": "https://registry.npmjs.org/object-inspect/-/object-inspect-1.12.2.tgz", - "integrity": 
"sha512-z+cPxW0QGUp0mcqcsgQyLVRDoXFQbXOwBaqyF7VIgI4TWNQsDHrBpUQslRmIfAoYWdYzs6UlKJtB2XJpTaNSpQ==", + "version": "1.13.1", + "resolved": "https://registry.npmjs.org/object-inspect/-/object-inspect-1.13.1.tgz", + "integrity": "sha512-5qoj1RUiKOMsCCNLV1CBiPYE10sziTsnmNxkAI/rZhiD63CF7IqdFGC/XzjWjpSgLf0LxXX3bDFIh0E18f6UhQ==", "dev": true }, "object-keys": { @@ -34438,9 +34565,9 @@ "dev": true }, "raw-body": { - "version": "2.5.1", - "resolved": "https://registry.npmjs.org/raw-body/-/raw-body-2.5.1.tgz", - "integrity": "sha512-qqJBtEyVgS0ZmPGdCFPWJ3FreoqvG4MVQln/kCgF7Olq95IbOp0/BWyMwbdtn4VTvkM8Y7khCQ2Xgk/tcrCXig==", + "version": "2.5.2", + "resolved": "https://registry.npmjs.org/raw-body/-/raw-body-2.5.2.tgz", + "integrity": "sha512-8zGqypfENjCIqGhgXToC8aB2r7YrBX+AQAfIPs/Mlk+BtPTztOvTS01NRW/3Eh60J+a48lt8qsCzirQ6loCVfA==", "dev": true, "requires": { "bytes": "3.1.2", @@ -35223,6 +35350,19 @@ "resolved": "https://registry.npmjs.org/set-blocking/-/set-blocking-2.0.0.tgz", "integrity": "sha512-KiKBS8AnWGEyLzofFfmvKwpdPzqiy16LvQfK3yv/fVH7Bj13/wl3JSR1J+rfgRE9q7xUJK4qvgS8raSOeLUehw==" }, + "set-function-length": { + "version": "1.2.2", + "resolved": "https://registry.npmjs.org/set-function-length/-/set-function-length-1.2.2.tgz", + "integrity": "sha512-pgRc4hJ4/sNjWCSS9AmnS40x3bNMDTknHgL5UaMBTMyJnU90EgWh1Rz+MC9eFu4BuN/UwZjKQuY/1v3rM7HMfg==", + "requires": { + "define-data-property": "^1.1.4", + "es-errors": "^1.3.0", + "function-bind": "^1.1.2", + "get-intrinsic": "^1.2.4", + "gopd": "^1.0.1", + "has-property-descriptors": "^1.0.2" + } + }, "set-value": { "version": "2.0.1", "resolved": "https://registry.npmjs.org/set-value/-/set-value-2.0.1.tgz", @@ -35286,14 +35426,15 @@ "dev": true }, "side-channel": { - "version": "1.0.4", - "resolved": "https://registry.npmjs.org/side-channel/-/side-channel-1.0.4.tgz", - "integrity": "sha512-q5XPytqFEIKHkGdiMIrY10mvLRvnQh42/+GoBlFW3b2LXLE2xxJpZFdm94we0BaoV3RwJyGqg5wS7epxTv0Zvw==", + "version": "1.0.6", + "resolved": "https://registry.npmjs.org/side-channel/-/side-channel-1.0.6.tgz", + "integrity": "sha512-fDW/EZ6Q9RiO8eFG8Hj+7u/oW+XrPTIChwCOM2+th2A6OblDtYYIpve9m+KvI9Z4C9qSEXlaGR6bTEYHReuglA==", "dev": true, "requires": { - "call-bind": "^1.0.0", - "get-intrinsic": "^1.0.2", - "object-inspect": "^1.9.0" + "call-bind": "^1.0.7", + "es-errors": "^1.3.0", + "get-intrinsic": "^1.2.4", + "object-inspect": "^1.13.1" } }, "sigmund": { @@ -36939,9 +37080,9 @@ } }, "webpack-dev-middleware": { - "version": "5.3.3", - "resolved": "https://registry.npmjs.org/webpack-dev-middleware/-/webpack-dev-middleware-5.3.3.tgz", - "integrity": "sha512-hj5CYrY0bZLB+eTO+x/j67Pkrquiy7kWepMHmUMoPsmcUaeEnQJqFzHJOyxgWlq746/wUuA64p9ta34Kyb01pA==", + "version": "5.3.4", + "resolved": "https://registry.npmjs.org/webpack-dev-middleware/-/webpack-dev-middleware-5.3.4.tgz", + "integrity": "sha512-BVdTqhhs+0IfoeAf7EoH5WE+exCmqGerHfDM0IL096Px60Tq2Mn9MAbnaGUe6HiMa41KMCYF19gyzZmBcq/o4Q==", "dev": true, "requires": { "colorette": "^2.0.10", From 3953a6d4e469384c8a2452aa0145baffbcbe0e00 Mon Sep 17 00:00:00 2001 From: rlskoeser Date: Thu, 11 Apr 2024 17:37:15 -0400 Subject: [PATCH 71/71] Require parasolr 0.9.2 --- requirements.txt | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/requirements.txt b/requirements.txt index 0984208a..03f543a4 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1,8 +1,6 @@ django>=5.0,<5.1 pucas>=0.8 -# dev parasolr until next release -git+https://github.com/Princeton-CDH/parasolr@develop#egg=parasolr -#parasolr>=0.9 +parasolr>=0.9.2 pairtree py-flags # 
pymarc 5+ has incompatible changes