Princeton-CDH · laurejt · Sep 20, 2024 · Sep 18, 2024 · Sep 18, 2024 · Sep 18, 2024
diff --git a/DEPLOYNOTES.rst b/DEPLOYNOTES.rst
@@ -7,6 +7,7 @@ Deploy and Upgrade notes
 ----
 
 * EEBO-TPC import requires configuring **EEBO_DATA** path in local settings.
+* To use local OCR for Gale page content, configure **GALE_LOCAL_OCR** path in local settings.
 
 3.12.1
 ------

diff --git a/ppa/archive/gale.py b/ppa/archive/gale.py
@@ -12,6 +12,22 @@
 logger = logging.getLogger(__name__)
 
 
+def get_local_ocr(item_id, page_num):
+    """
+    Get local OCR page text for specified page of a single Gale volume
+    """
+    ocr_dir = getattr(settings, "GALE_LOCAL_OCR", None)
+    if not ocr_dir:
+        raise ImproperlyConfigured(
+            "GALE_LOCAL_OCR configuration is required for indexing Gale page content"
+        )
+
+    stub_dir = item_id[::3][1:]  # Following conventions set in ppa-nlp
+    ocr_txt_fp = f"{ocr_dir}/{stub_dir}/{item_id}/{item_id}_{page_num}0.txt"
+    with open(ocr_txt_fp) as reader:
@@ -38,3 +38,5 @@
    stub_dir = item_id[::3][1:]  # Following conventions set in ppa-nlp
-    ocr_txt_fp = os.path.join(ocr_dir, stub_dir, item_id, f"{item_id}_{page_num}0.txt")
+    ocr_txt_fp = os.path.normpath(os.path.join(ocr_dir, stub_dir, item_id, f"{item_id}_{page_num}0.txt"))
+    if not ocr_txt_fp.startswith(os.path.normpath(ocr_dir)):
+        raise Exception("Access to the specified file is not allowed")
    with open(ocr_txt_fp) as reader:
@@ -38,3 +38,5 @@
    stub_dir = item_id[::3][1:]  # Following conventions set in ppa-nlp
-    ocr_txt_fp = os.path.join(ocr_dir, stub_dir, item_id, f"{item_id}_{page_num}0.txt")
+    ocr_txt_fp = os.path.normpath(os.path.join(ocr_dir, stub_dir, item_id, f"{item_id}_{page_num}0.txt"))
+    if not ocr_txt_fp.startswith(os.path.normpath(ocr_dir)):
+        raise Exception("Access to the specified file is not allowed")
    with open(ocr_txt_fp) as reader:
+        return reader.read()
+
+
 class GaleAPIError(Exception):
     """Base exception class for Gale API errors"""
 
@@ -192,14 +208,19 @@
         # iterate through the pages in the response
         for page in gale_record["pageResponse"]["pages"]:
             page_number = page["pageNumber"]
-            # page label (original page number) should be set in folioNumber,
-            # but is not set for all volumes; fallback to page number
-            # converted to integer to drop leading zeroes
-            page_label = page.get("folioNumber", int(page_number))
+            tags = []
+            try: 
+                ocr_text = get_local_ocr(item_id, page_number)
+                tags = ["local_ocr"]
+            except FileNotFoundError as e:
+                ocr_text = page.get("ocrText")  # some pages have no text
+                logger.warning(f'Local OCR not found for {item_id} {page_number}')
+
             info = {
                 "page_id": page_number,
-                "content": page.get("ocrText"),  # some pages have no text
-                "label": page_label,
+                "content": ocr_text,
+                "label": page.get("folioNumber"),
+                "tags": tags,
                 # image id needed for thumbnail url; use solr dynamic field
                 "image_id_s": page["image"]["id"],
                 # index image url since we will need it when Gale API changes

diff --git a/ppa/archive/models.py b/ppa/archive/models.py
@@ -1277,7 +1277,7 @@ def page_index_data(cls, digwork, gale_record=None):
                     "cluster_id_s": digwork.index_cluster_id,  # for grouping with cluster
                     "order": i,
                     # make sure label is set;
-                    # fallback to sequence number if no label, but mark with brackets
+                    # fallback to sequence number if no (or null) label, but mark with brackets
                     "label": page_info.get("label") or f"[{i}]",
                     "item_type": "page",
                 }

diff --git a/ppa/archive/tests/test_gale.py b/ppa/archive/tests/test_gale.py
@@ -12,6 +12,28 @@
 from ppa.archive.tests.test_models import FIXTURES_PATH
 
 
+@override_settings()
+def test_get_local_ocr(tmp_path):
+    item_id = "CB0123456789"
+    page_num = "0001"
+    content = "Testing...\n1\n2\n3"
+    # Mock ocr files for testing
+    ocr_dir = tmp_path.joinpath("147", item_id)
+    ocr_dir.mkdir(parents=True)
+    ocr_file = ocr_dir.joinpath(f"{item_id}_{page_num}0.txt")
+    ocr_file.write_text(content)
+
+    with override_settings(GALE_LOCAL_OCR=f"{tmp_path}"):
+        assert content == gale.get_local_ocr(item_id, page_num)
+
+
+@override_settings()
+def test_get_local_ocr_config_error():
+    del settings.GALE_LOCAL_OCR
+    with pytest.raises(ImproperlyConfigured):
+        gale.get_local_ocr("item_id", "page_num")
+
+
 @override_settings(GALE_API_USERNAME="galeuser123")
 @patch("ppa.archive.gale.requests")
 class TestGaleAPI(TestCase):
@@ -219,6 +241,57 @@ def test_get_item(self, mockrequests):
         with pytest.raises(gale.GaleAPIError):
             gale_api.get_item("CW123456")
 
+    @patch("ppa.archive.gale.get_local_ocr")
+    @patch("ppa.archive.gale.GaleAPI.get_item")
+    def test_get_item_pages(self, mock_get_item, mock_get_local_ocr, mockrequests):
+        item_id = "CW0123456789"
+        # Set up API
+        gale_api = gale.GaleAPI()
+        test_pages = [
+            {
+                "pageNumber": "0001",
+                "folioNumber": "i",
+                "image": {"id": "09876001234567", "url": "http://example.com/img/1"}
+                # some pages have no ocr text
+            },
+            {
+                "pageNumber": "0002",
+                "image": {"id": "08765002345678", "url": "http://example.com/img/2"},
+                "ocrText": "more test content",
+            },
+            {
+                "pageNumber": "0003",
+                "image": {"id": "0765400456789", "url": "http://example.com/img/3"},
+                "ocrText": "ignored text",
+            },
+        ]
+        api_response = {
+            "doc": {},  # unused for this test
+            "pageResponse": {"pages": test_pages},
+        }
+        mock_get_item.return_value = api_response
+        # Set up get_local_ocr so that only the 3rd page's text is found
+        mock_get_local_ocr.side_effect = [FileNotFoundError, FileNotFoundError, "local ocr text"]
+        page_data = list(gale_api.get_item_pages(item_id))
+        mock_get_item.called_once()
+        assert mock_get_local_ocr.call_count == 3
+        assert len(page_data) == 3
+        assert [ p["page_id"] for p in page_data ] == ["0001", "0002", "0003"]
+        assert [ p["content"] for p in page_data ] == [None, "more test content", "local ocr text"]
+        assert [ p["label"] for p in page_data ] == ["i", None, None]
+        assert [ p["tags"] for p in page_data ] == [ [], [], ["local_ocr"] ]
+        assert [ p["image_id_s"] for p in page_data ] == ["09876001234567", "08765002345678", "0765400456789"]
+        assert [ p["image_url_s"] for p in page_data ] == [f"http://example.com/img/{i+1}" for i in range(3)]
+
+        # skip api call if record is provided
+        mock_get_item.reset_mock()
+        mock_get_local_ocr.reset_mock()
+        mock_get_local_ocr.side_effect = [FileNotFoundError, FileNotFoundError, "local ocr text"]
+        page_data = list(gale_api.get_item_pages(item_id, api_response))
+        mock_get_item.assert_not_called()
+        assert mock_get_local_ocr.call_count == 3
+        assert len(page_data) == 3
+
 
 @override_settings(MARC_DATA="/path/to/data/marc")
 @patch("ppa.archive.gale.PairtreeStorageFactory")

diff --git a/ppa/archive/tests/test_import_util.py b/ppa/archive/tests/test_import_util.py
@@ -345,6 +345,7 @@ def test_import_digitizedwork_marc_error(self, mock_gale_api, mock_get_marc_reco
         assert importer.results[test_id] == not_found_error
 
     # username is required to init GaleAPI class, but API is not actually used
+    @override_settings(GALE_LOCAL_OCR="unused")
     @override_settings(GALE_API_USERNAME="unused")
     @patch("ppa.archive.import_util.get_marc_record")
     @patch("ppa.archive.import_util.GaleAPI")

diff --git a/ppa/archive/tests/test_models.py b/ppa/archive/tests/test_models.py
@@ -1151,7 +1151,7 @@ def test_page_index_data_eebotcp(self):
         page_data = list(page_data)
         assert page_data[0]["label"] == "[1]"
         assert "Licensed,\nROBERT MIDGLEY." in page_data[0]["content"]
-        # eebo-tcp page data logic is tested more thoroughly elsewheer
+        # eebo-tcp page data logic is tested more thoroughly elsewhere
 
     def test_page_index_data_suppressed(self):
         # if item is suppressed - no page data
@@ -1165,56 +1165,57 @@ def test_page_index_data_nonhathi(self):
         assert not list(Page.page_index_data(nonhathi_work))
 
     # username is required to init GaleAPI class, but API is not actually used
+    @override_settings(GALE_LOCAL_OCR="unused")
     @override_settings(GALE_API_USERNAME="unused")
-    @patch.object(gale.GaleAPI, "get_item")
-    def test_gale_page_index_data(self, mock_gale_get_item):
+    @patch.object(gale.GaleAPI, "get_item_pages")
+    def test_gale_page_index_data(self, mock_gale_get_item_pages):
         gale_work = DigitizedWork(source=DigitizedWork.GALE, source_id="CW123456")
-        test_pages = [
+        test_page_data = [
             {
-                "pageNumber": "0001",
-                "folioNumber": "i",
-                "image": {"id": "09876001234567", "url": "http://example.com/img/1"}
-                # some pages have no ocr text
+                "page_id": "0001",
+                "content": None,
+                "label": "i",
+                "tags": [],
+                "image_id_s": "09876001234567",
+                "image_url_s": "http://example.com/img/1",
             },
             {
-                "pageNumber": "0002",
-                "image": {"id": "08765002345678", "url": "http://example.com/img/2"},
-                "ocrText": "more test content",
+                "page_id": "0002",
+                "content": "original ocr content",
+                "label": None,
+                "tags": [],
+                "image_id_s": "08765002345678",
+                "image_url_s": "http://example.com/img/2",
             },
+            {
+                "page_id": "0003",
+                "content": "local ocr content",
+                "label": None,
+                "tags": ["local_ocr"],
+                "image_id_s": "0765400456789",
+                "image_url_s": "http://example.com/img/3",
+            }
         ]
-        api_response = {
-            "doc": {},  # unused for this test
-            "pageResponse": {"pages": test_pages},
-        }
-        mock_gale_get_item.return_value = api_response
+        mock_gale_get_item_pages.return_value = test_page_data
         page_data = list(Page.page_index_data(gale_work))
-        assert len(page_data) == 2
+        assert len(page_data) == 3
         for i, index_data in enumerate(page_data):
-            assert (
-                index_data["id"]
-                == f"{gale_work.source_id}.{test_pages[i]['pageNumber']}"
-            )
+            assert index_data["id"] == f"{gale_work.source_id}.000{i+1}"
             assert index_data["source_id"] == gale_work.source_id
-            assert index_data["content"] == test_pages[i].get("ocrText")
+            assert index_data["group_id_s"] == gale_work.index_id()
+            assert index_data["cluster_id_s"] == gale_work.index_cluster_id
             assert index_data["order"] == i + 1
-            # should use folio number when set
-            if "folioNumber" in test_pages[i]:
-                assert index_data["label"] == test_pages[i]["folioNumber"]
-            else:
-                assert index_data["label"] == int(test_pages[i]["pageNumber"])
+            assert index_data["label"] == test_page_data[i]["label"] or f"[{i+1}]"
             assert index_data["item_type"] == "page"
-            assert index_data["image_id_s"] == test_pages[i]["image"]["id"]
-            assert index_data["image_url_s"] == test_pages[i]["image"]["url"]
-
-        # skip api call if item data is passed in
-        mock_gale_get_item.reset_mock()
-        page_data = list(Page.page_index_data(gale_work, api_response))
-        assert mock_gale_get_item.get_item.call_count == 0
-        assert len(page_data) == 2
+            assert "page_id" not in index_data  # this field is not preserved
+            assert index_data["content"] == test_page_data[i]["content"]
+            assert index_data["tags"] == test_page_data[i]["tags"]
+            assert index_data["image_id_s"] == test_page_data[i]["image_id_s"]
+            assert index_data["image_url_s"] == test_page_data[i]["image_url_s"]
 
         # limit if page range specified
         gale_excerpt = DigitizedWork(
-            source=DigitizedWork.GALE, source_id="CW123456", pages_digital="2-3"
+            source=DigitizedWork.GALE, source_id="CW123456", pages_digital="2-4"
         )
-        page_data = list(Page.page_index_data(gale_excerpt, api_response))
-        assert len(page_data) == 1
+        page_data = list(Page.page_index_data(gale_excerpt))
+        assert len(page_data) == 2
diff --git a/ppa/settings/local_settings.py.sample b/ppa/settings/local_settings.py.sample
@@ -44,6 +44,9 @@ GALE_API_USERNAME = ''
 # local path for cached marc record; needed for Gale/ECCO import
 MARC_DATA = ''
 
+# local path for Gale OCR data; required for indexing Gale page content
+GALE_LOCAL_OCR = ''
+
 # local path for importing and indexing selected EEBO-TCP content
 # should contain xml and marc files named by TCP id
 EEBO_DATA = ""