feat: support pdf link extraction in hi_res strategy (#3753)

This PR adds support for link extraction in the PDF `hi_res` strategy. The `partition_pdf()` function now supports link extraction when using the `hi_res` strategy, allowing users to extract hyperlinks from PDF documents.

### Summary

- Added functionality to support link extraction in the `hi_res` flow.
- Enhanced the word extraction functionality used for link extraction in both the `fast` and `hi_res` flows, resulting in more accurate `start_index` and `text` values in the `links` metadata.
- Updated the ingest fixture update workflow so that it no longer skips the Astra DB source test.

### Testing

```
elements = partition_pdf(
    filename="example-docs/pdf/embedded-link.pdf", strategy="hi_res"
)
assert len(elements[0].metadata.links) == 3
```

---------

Co-authored-by: ryannikolaidis <[email protected]>
Co-authored-by: christinestraub <[email protected]>
Co-authored-by: cragwolfe <[email protected]>
Unstructured-IO · Oct 31, 2024 · df156eb · df156eb
1 parent 1953b86
commit df156eb
Show file tree

Hide file tree

Showing 26 changed files with 1,718 additions and 1,039 deletions.
diff --git a/.github/workflows/ingest-test-fixtures-update-pr.yml b/.github/workflows/ingest-test-fixtures-update-pr.yml
@@ -94,6 +94,8 @@ jobs:
           AZURE_SEARCH_API_KEY: ${{ secrets.AZURE_SEARCH_API_KEY }}
           OPENAI_API_KEY: ${{ secrets.OPENAI_API_KEY }}
           OCTOAI_API_KEY: ${{ secrets.OCTOAI_API_KEY }}
+          ASTRA_DB_APPLICATION_TOKEN: ${{secrets.ASTRA_DB_TOKEN}}
+          ASTRA_DB_API_ENDPOINT: ${{secrets.ASTRA_DB_ENDPOINT}}
           OCR_AGENT: "unstructured.partition.utils.ocr_models.tesseract_ocr.OCRAgentTesseract"
           OVERWRITE_FIXTURES: "true"
           CI: "true"

diff --git a/CHANGELOG.md b/CHANGELOG.md
@@ -1,4 +1,4 @@
-## 0.16.4-dev2
+## 0.16.4
 
 ### Enhancements
 
@@ -9,6 +9,8 @@
 
 ### Features
 
+* **Add support for link extraction in the PDF `hi_res` strategy.** The `partition_pdf()` function now supports link extraction when using the `hi_res` strategy, allowing users to extract hyperlinks from PDF documents.
+
 ### Fixes
 
 

diff --git a/requirements/ingest/ingest.txt b/requirements/ingest/ingest.txt
@@ -1,4 +1,4 @@
-unstructured-ingest[airtable, astradb, azure, azure-cognitive-search, bedrock, biomed, box, chroma, clarifai, confluence, couchbase, databricks-volumes, delta-table, discord, dropbox, elasticsearch, embed-huggingface, embed-octoai, embed-vertexai, embed-voyageai, gcs, github, gitlab, google-drive, hubspot, jira, kafka, kdbai, milvus, mongodb, notion, onedrive, openai, opensearch, outlook, pinecone, postgres, qdrant, reddit, remote, s3, salesforce, sftp, sharepoint, singlestore, slack, vectara, weaviate, wikipedia]==0.2.0
+unstructured-ingest[airtable, astradb, azure, azure-cognitive-search, bedrock, biomed, box, chroma, clarifai, confluence, couchbase, databricks-volumes, delta-table, discord, dropbox, elasticsearch, embed-huggingface, embed-octoai, embed-vertexai, embed-voyageai, gcs, github, gitlab, google-drive, hubspot, jira, kafka, kdbai, milvus, mongodb, notion, onedrive, openai, opensearch, outlook, pinecone, postgres, qdrant, reddit, remote, s3, salesforce, sftp, sharepoint, singlestore, slack, vectara, weaviate, wikipedia]==0.2.1
 s3fs>=2024.9.0
 urllib3>=1.26.20
 backoff>=2.2.1

diff --git a/test_unstructured/partition/common/test_common.py b/test_unstructured/partition/common/test_common.py
@@ -1,14 +1,11 @@
 import pathlib
-from dataclasses import dataclass
 from multiprocessing import Pool
-from unittest import mock
 
 import numpy as np
 import pytest
 from PIL import Image
 from unstructured_inference.inference import layout
 from unstructured_inference.inference.elements import TextRegion
-from unstructured_inference.inference.layout import DocumentLayout, PageLayout
 from unstructured_inference.inference.layoutelement import LayoutElement
 
 from test_unstructured.unit_utils import example_doc_path
@@ -29,7 +26,6 @@
     Image as ImageElement,
 )
 from unstructured.partition.common import common
-from unstructured.partition.utils.constants import SORT_MODE_BASIC, SORT_MODE_DONT, SORT_MODE_XY_CUT
 
 
 class MockPageLayout(layout.PageLayout):
@@ -399,84 +395,12 @@ def test_contains_emoji(text, expected):
     assert common.contains_emoji(text) is expected
 
 
-def test_document_to_element_list_omits_coord_system_when_coord_points_absent():
-    layout_elem_absent_coordinates = MockDocumentLayout()
-    for page in layout_elem_absent_coordinates.pages:
-        for el in page.elements:
-            el.bbox = None
-    elements = common.document_to_element_list(layout_elem_absent_coordinates)
-    assert elements[0].metadata.coordinates is None
-
-
 def test_get_page_image_metadata_and_coordinate_system():
     doc = MockDocumentLayout()
-    metadata = common._get_page_image_metadata(doc.pages[0])
+    metadata = common.get_page_image_metadata(doc.pages[0])
     assert isinstance(metadata, dict)
 
 
-@dataclass
-class MockImage:
-    width = 640
-    height = 480
-    format = "JPG"
-
-
-def test_document_to_element_list_handles_parent():
-    block1 = LayoutElement.from_coords(1, 2, 3, 4, text="block 1", type="NarrativeText")
-    block2 = LayoutElement.from_coords(
-        1,
-        2,
-        3,
-        4,
-        text="block 2",
-        parent=block1,
-        type="NarrativeText",
-    )
-    page = PageLayout(
-        number=1,
-        image=MockImage(),
-    )
-    page.elements = [block1, block2]
-    doc = DocumentLayout.from_pages([page])
-    el1, el2 = common.document_to_element_list(doc)
-    assert el2.metadata.parent_id == el1.id
-
-
-@pytest.mark.parametrize(
-    ("sort_mode", "call_count"),
-    [(SORT_MODE_DONT, 0), (SORT_MODE_BASIC, 1), (SORT_MODE_XY_CUT, 1)],
-)
-def test_document_to_element_list_doesnt_sort_on_sort_method(sort_mode, call_count):
-    block1 = LayoutElement.from_coords(1, 2, 3, 4, text="block 1", type="NarrativeText")
-    block2 = LayoutElement.from_coords(
-        1,
-        2,
-        3,
-        4,
-        text="block 2",
-        parent=block1,
-        type="NarrativeText",
-    )
-    page = PageLayout(
-        number=1,
-        image=MockImage(),
-    )
-    page.elements = [block1, block2]
-    doc = DocumentLayout.from_pages([page])
-    with mock.patch.object(common, "sort_page_elements") as mock_sort_page_elements:
-        common.document_to_element_list(doc, sortable=True, sort_mode=sort_mode)
-    assert mock_sort_page_elements.call_count == call_count
-
-
-def test_document_to_element_list_sets_category_depth_titles():
-    layout_with_hierarchies = MockDocumentLayout()
-    elements = common.document_to_element_list(layout_with_hierarchies)
-    assert elements[0].metadata.category_depth == 1
-    assert elements[1].metadata.category_depth == 2
-    assert elements[2].metadata.category_depth is None
-    assert elements[3].metadata.category_depth == 0
-
-
 def test_ocr_data_to_elements(
     filename=example_doc_path("img/layout-parser-paper-fast.jpg"),
 ):

diff --git a/test_unstructured/partition/pdf_image/test_pdf.py b/test_unstructured/partition/pdf_image/test_pdf.py
@@ -5,6 +5,7 @@
 import math
 import os
 import tempfile
+from dataclasses import dataclass
 from pathlib import Path
 from tempfile import SpooledTemporaryFile
 from unittest import mock
@@ -14,6 +15,8 @@
 from PIL import Image
 from pytest_mock import MockFixture
 from unstructured_inference.inference import layout
+from unstructured_inference.inference.layout import DocumentLayout, PageLayout
+from unstructured_inference.inference.layoutelement import LayoutElement
 
 from test_unstructured.unit_utils import assert_round_trips_through_JSON, example_doc_path
 from unstructured.chunking.title import chunk_by_title
@@ -32,9 +35,12 @@
 )
 from unstructured.errors import PageCountExceededError
 from unstructured.partition import pdf, strategies
-from unstructured.partition.pdf import get_uris_from_annots
 from unstructured.partition.pdf_image import ocr, pdfminer_processing
+from unstructured.partition.pdf_image.pdfminer_processing import get_uris_from_annots
 from unstructured.partition.utils.constants import (
+    SORT_MODE_BASIC,
+    SORT_MODE_DONT,
+    SORT_MODE_XY_CUT,
     UNSTRUCTURED_INCLUDE_DEBUG_METADATA,
     PartitionStrategy,
 )
@@ -95,6 +101,37 @@ def __init__(self, number: int, image: Image):
         ]
 
 
+class MockSinglePageLayout(layout.PageLayout):
+    def __init__(self, number: int, image: Image.Image):
+        self.number = number
+        self.image = image
+
+    @property
+    def elements(self):
+        return [
+            LayoutElement(
+                type="Headline",
+                text="Charlie Brown and the Great Pumpkin",
+                bbox=None,
+            ),
+            LayoutElement(
+                type="Subheadline",
+                text="The Beginning",
+                bbox=None,
+            ),
+            LayoutElement(
+                type="Text",
+                text="This time Charlie Brown had it really tricky...",
+                bbox=None,
+            ),
+            LayoutElement(
+                type="Title",
+                text="Another book title in the same page",
+                bbox=None,
+            ),
+        ]
+
+
 class MockDocumentLayout(layout.DocumentLayout):
     @property
     def pages(self):
@@ -104,6 +141,14 @@ def pages(self):
         ]
 
 
+class MockSinglePageDocumentLayout(layout.DocumentLayout):
+    @property
+    def pages(self):
+        return [
+            MockSinglePageLayout(number=1, image=Image.new("1", (1, 1))),
+        ]
+
+
 @pytest.mark.parametrize(
     ("filename", "file"),
     [
@@ -787,11 +832,14 @@ def test_combine_numbered_list(filename):
 
 
 @pytest.mark.parametrize(
-    "filename",
-    [example_doc_path("pdf/layout-parser-paper-fast.pdf")],
+    ("filename", "strategy"),
+    [
+        (example_doc_path("pdf/layout-parser-paper-fast.pdf"), "fast"),
+        (example_doc_path("pdf/layout-parser-paper-fast.pdf"), "hi_res"),
+    ],
 )
-def test_partition_pdf_hyperlinks(filename):
-    elements = pdf.partition_pdf(filename=filename, strategy=PartitionStrategy.AUTO)
+def test_partition_pdf_hyperlinks(filename, strategy):
+    elements = pdf.partition_pdf(filename=filename, strategy=strategy)
     links = [
         {
             "text": "8",
@@ -813,11 +861,14 @@ def test_partition_pdf_hyperlinks(filename):
 
 
 @pytest.mark.parametrize(
-    "filename",
-    [example_doc_path("pdf/embedded-link.pdf")],
+    ("filename", "strategy"),
+    [
+        (example_doc_path("pdf/embedded-link.pdf"), "fast"),
+        (example_doc_path("pdf/embedded-link.pdf"), "hi_res"),
+    ],
 )
-def test_partition_pdf_hyperlinks_multiple_lines(filename):
-    elements = pdf.partition_pdf(filename=filename, strategy=PartitionStrategy.AUTO)
+def test_partition_pdf_hyperlinks_multiple_lines(filename, strategy):
+    elements = pdf.partition_pdf(filename=filename, strategy=strategy)
     assert elements[-1].metadata.links[-1]["text"] == "capturing"
     assert len(elements[-1].metadata.links) == 2
 
@@ -1392,3 +1443,75 @@ def test_pdf_hi_res_max_pages_argument(filename, pdf_hi_res_max_pages, expected_
                 pdf_hi_res_max_pages=pdf_hi_res_max_pages,
                 is_image=is_image,
             )
+
+
+def test_document_to_element_list_omits_coord_system_when_coord_points_absent():
+    layout_elem_absent_coordinates = MockSinglePageDocumentLayout()
+    for page in layout_elem_absent_coordinates.pages:
+        for el in page.elements:
+            el.bbox = None
+    elements = pdf.document_to_element_list(layout_elem_absent_coordinates)
+    assert elements[0].metadata.coordinates is None
+
+
+@dataclass
+class MockImage:
+    width = 640
+    height = 480
+    format = "JPG"
+
+
+def test_document_to_element_list_handles_parent():
+    block1 = LayoutElement.from_coords(1, 2, 3, 4, text="block 1", type="NarrativeText")
+    block2 = LayoutElement.from_coords(
+        1,
+        2,
+        3,
+        4,
+        text="block 2",
+        parent=block1,
+        type="NarrativeText",
+    )
+    page = PageLayout(
+        number=1,
+        image=MockImage(),
+    )
+    page.elements = [block1, block2]
+    doc = DocumentLayout.from_pages([page])
+    el1, el2 = pdf.document_to_element_list(doc)
+    assert el2.metadata.parent_id == el1.id
+
+
+@pytest.mark.parametrize(
+    ("sort_mode", "call_count"),
+    [(SORT_MODE_DONT, 0), (SORT_MODE_BASIC, 1), (SORT_MODE_XY_CUT, 1)],
+)
+def test_document_to_element_list_doesnt_sort_on_sort_method(sort_mode, call_count):
+    block1 = LayoutElement.from_coords(1, 2, 3, 4, text="block 1", type="NarrativeText")
+    block2 = LayoutElement.from_coords(
+        1,
+        2,
+        3,
+        4,
+        text="block 2",
+        parent=block1,
+        type="NarrativeText",
+    )
+    page = PageLayout(
+        number=1,
+        image=MockImage(),
+    )
+    page.elements = [block1, block2]
+    doc = DocumentLayout.from_pages([page])
+    with mock.patch.object(pdf, "sort_page_elements") as mock_sort_page_elements:
+        pdf.document_to_element_list(doc, sortable=True, sort_mode=sort_mode)
+    assert mock_sort_page_elements.call_count == call_count
+
+
+def test_document_to_element_list_sets_category_depth_titles():
+    layout_with_hierarchies = MockSinglePageDocumentLayout()
+    elements = pdf.document_to_element_list(layout_with_hierarchies)
+    assert elements[0].metadata.category_depth == 1
+    assert elements[1].metadata.category_depth == 2
+    assert elements[2].metadata.category_depth is None
+    assert elements[3].metadata.category_depth == 0
diff --git a/...d_ingest/expected-structured-output/astradb/25b75f1d-a2ea-4c97-b75f-1da2eadc97f7.csv.json b/...d_ingest/expected-structured-output/astradb/25b75f1d-a2ea-4c97-b75f-1da2eadc97f7.csv.json
@@ -0,0 +1,20 @@
+[
+  {
+    "type": "Table",
+    "element_id": "29fba2aa35cbdea208791e942ac3c40c",
+    "text": "_id title reviewid creationdate criticname originalscore reviewstate reviewtext 25b75f1d-a2ea-4c97-b75f-1da2eadc97f7 City Hunter: Shinjuku Private Eyes 2558908 2019-02-14 Matt Schley 2.5/5 rotten The film's out-of-touch attempts at humor may find them hunting for the reason the franchise was so popular in the first place.",
+    "metadata": {
+      "text_as_html": "<table><tr><td>_id</td><td>title</td><td>reviewid</td><td>creationdate</td><td>criticname</td><td>originalscore</td><td>reviewstate</td><td>reviewtext</td></tr><tr><td>25b75f1d-a2ea-4c97-b75f-1da2eadc97f7</td><td>City Hunter: Shinjuku Private Eyes</td><td>2558908</td><td>2019-02-14</td><td>Matt Schley</td><td>2.5/5</td><td>rotten</td><td>The film's out-of-touch attempts at humor may find them hunting for the reason the franchise was so popular in the first place.</td></tr></table>",
+      "languages": [
+        "eng"
+      ],
+      "filetype": "text/csv",
+      "data_source": {
+        "record_locator": {
+          "document_id": "25b75f1d-a2ea-4c97-b75f-1da2eadc97f7"
+        },
+        "filesize_bytes": 326
+      }
+    }
+  }
+]
diff --git a/...d_ingest/expected-structured-output/astradb/60297eea-73d7-4fca-a97e-ea73d7cfca62.csv.json b/...d_ingest/expected-structured-output/astradb/60297eea-73d7-4fca-a97e-ea73d7cfca62.csv.json
@@ -0,0 +1,20 @@
+[
+  {
+    "type": "Table",
+    "element_id": "b3b034c9f8fb0ab442599982063f0590",
+    "text": "_id title reviewid creationdate criticname originalscore reviewstate reviewtext 60297eea-73d7-4fca-a97e-ea73d7cfca62 City Hunter: Shinjuku Private Eyes 2590987 2019-05-28 Reuben Baron fresh The choreography is so precise and lifelike at points one might wonder whether the movie was rotoscoped, but no live-action reference footage was used. The quality is due to the skill of the animators and Kodama's love for professional wrestling.",
+    "metadata": {
+      "text_as_html": "<table><tr><td>_id</td><td>title</td><td>reviewid</td><td>creationdate</td><td>criticname</td><td>originalscore</td><td>reviewstate</td><td>reviewtext</td></tr><tr><td>60297eea-73d7-4fca-a97e-ea73d7cfca62</td><td>City Hunter: Shinjuku Private Eyes</td><td>2590987</td><td>2019-05-28</td><td>Reuben Baron</td><td/><td>fresh</td><td>The choreography is so precise and lifelike at points one might wonder whether the movie was rotoscoped, but no live-action reference footage was used. The quality is due to the skill of the animators and Kodama's love for professional wrestling.</td></tr></table>",
+      "languages": [
+        "eng"
+      ],
+      "filetype": "text/csv",
+      "data_source": {
+        "record_locator": {
+          "document_id": "60297eea-73d7-4fca-a97e-ea73d7cfca62"
+        },
+        "filesize_bytes": 442
+      }
+    }
+  }
+]