Skip to content

Commit

Permalink
feat: support pdf link extraction in hi_res strategy (#3753)
Browse files Browse the repository at this point in the history
This PR aims to add support for link extraction in pdf `hi_res`
strategy. The `partition_pdf()` function now supports link extraction
when using the `hi_res` strategy, allowing users to extract hyperlinks
from PDF documents.

### Summary
- Added functionality to support link extraction in the `hi_res` flow
- Enhanced the word extraction functionality used for link extraction in
both the `fast` and `hi_res` flows, resulting in more accurate `start_index`
and `text` values in the `links` metadata.
- Updated the ingest fixture update workflow so that it no longer skips the
Astra DB source test

### Testing
```
elements = partition_pdf(
    filename="example-docs/pdf/embedded-link.pdf",
    strategy="hi_res"
)
assert len(elements[0].metadata.links) == 3
```

---------

Co-authored-by: ryannikolaidis <[email protected]>
Co-authored-by: christinestraub <[email protected]>
Co-authored-by: cragwolfe <[email protected]>
  • Loading branch information
4 people authored Oct 31, 2024
1 parent 1953b86 commit df156eb
Show file tree
Hide file tree
Showing 26 changed files with 1,718 additions and 1,039 deletions.
2 changes: 2 additions & 0 deletions .github/workflows/ingest-test-fixtures-update-pr.yml
Original file line number Diff line number Diff line change
Expand Up @@ -94,6 +94,8 @@ jobs:
AZURE_SEARCH_API_KEY: ${{ secrets.AZURE_SEARCH_API_KEY }}
OPENAI_API_KEY: ${{ secrets.OPENAI_API_KEY }}
OCTOAI_API_KEY: ${{ secrets.OCTOAI_API_KEY }}
ASTRA_DB_APPLICATION_TOKEN: ${{secrets.ASTRA_DB_TOKEN}}
ASTRA_DB_API_ENDPOINT: ${{secrets.ASTRA_DB_ENDPOINT}}
OCR_AGENT: "unstructured.partition.utils.ocr_models.tesseract_ocr.OCRAgentTesseract"
OVERWRITE_FIXTURES: "true"
CI: "true"
Expand Down
4 changes: 3 additions & 1 deletion CHANGELOG.md
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
## 0.16.4-dev2
## 0.16.4

### Enhancements

Expand All @@ -9,6 +9,8 @@

### Features

* **Add support for link extraction in pdf hi_res strategy.** The `partition_pdf()` function now supports link extraction when using the `hi_res` strategy, allowing users to extract hyperlinks from PDF documents more effectively.

### Fixes


Expand Down
2 changes: 1 addition & 1 deletion requirements/ingest/ingest.txt
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
unstructured-ingest[airtable, astradb, azure, azure-cognitive-search, bedrock, biomed, box, chroma, clarifai, confluence, couchbase, databricks-volumes, delta-table, discord, dropbox, elasticsearch, embed-huggingface, embed-octoai, embed-vertexai, embed-voyageai, gcs, github, gitlab, google-drive, hubspot, jira, kafka, kdbai, milvus, mongodb, notion, onedrive, openai, opensearch, outlook, pinecone, postgres, qdrant, reddit, remote, s3, salesforce, sftp, sharepoint, singlestore, slack, vectara, weaviate, wikipedia]==0.2.0
unstructured-ingest[airtable, astradb, azure, azure-cognitive-search, bedrock, biomed, box, chroma, clarifai, confluence, couchbase, databricks-volumes, delta-table, discord, dropbox, elasticsearch, embed-huggingface, embed-octoai, embed-vertexai, embed-voyageai, gcs, github, gitlab, google-drive, hubspot, jira, kafka, kdbai, milvus, mongodb, notion, onedrive, openai, opensearch, outlook, pinecone, postgres, qdrant, reddit, remote, s3, salesforce, sftp, sharepoint, singlestore, slack, vectara, weaviate, wikipedia]==0.2.1
s3fs>=2024.9.0
urllib3>=1.26.20
backoff>=2.2.1
Expand Down
78 changes: 1 addition & 77 deletions test_unstructured/partition/common/test_common.py
Original file line number Diff line number Diff line change
@@ -1,14 +1,11 @@
import pathlib
from dataclasses import dataclass
from multiprocessing import Pool
from unittest import mock

import numpy as np
import pytest
from PIL import Image
from unstructured_inference.inference import layout
from unstructured_inference.inference.elements import TextRegion
from unstructured_inference.inference.layout import DocumentLayout, PageLayout
from unstructured_inference.inference.layoutelement import LayoutElement

from test_unstructured.unit_utils import example_doc_path
Expand All @@ -29,7 +26,6 @@
Image as ImageElement,
)
from unstructured.partition.common import common
from unstructured.partition.utils.constants import SORT_MODE_BASIC, SORT_MODE_DONT, SORT_MODE_XY_CUT


class MockPageLayout(layout.PageLayout):
Expand Down Expand Up @@ -399,84 +395,12 @@ def test_contains_emoji(text, expected):
assert common.contains_emoji(text) is expected


def test_document_to_element_list_omits_coord_system_when_coord_points_absent():
layout_elem_absent_coordinates = MockDocumentLayout()
for page in layout_elem_absent_coordinates.pages:
for el in page.elements:
el.bbox = None
elements = common.document_to_element_list(layout_elem_absent_coordinates)
assert elements[0].metadata.coordinates is None


def test_get_page_image_metadata_and_coordinate_system():
doc = MockDocumentLayout()
metadata = common._get_page_image_metadata(doc.pages[0])
metadata = common.get_page_image_metadata(doc.pages[0])
assert isinstance(metadata, dict)


@dataclass
class MockImage:
width = 640
height = 480
format = "JPG"


def test_document_to_element_list_handles_parent():
block1 = LayoutElement.from_coords(1, 2, 3, 4, text="block 1", type="NarrativeText")
block2 = LayoutElement.from_coords(
1,
2,
3,
4,
text="block 2",
parent=block1,
type="NarrativeText",
)
page = PageLayout(
number=1,
image=MockImage(),
)
page.elements = [block1, block2]
doc = DocumentLayout.from_pages([page])
el1, el2 = common.document_to_element_list(doc)
assert el2.metadata.parent_id == el1.id


@pytest.mark.parametrize(
("sort_mode", "call_count"),
[(SORT_MODE_DONT, 0), (SORT_MODE_BASIC, 1), (SORT_MODE_XY_CUT, 1)],
)
def test_document_to_element_list_doesnt_sort_on_sort_method(sort_mode, call_count):
block1 = LayoutElement.from_coords(1, 2, 3, 4, text="block 1", type="NarrativeText")
block2 = LayoutElement.from_coords(
1,
2,
3,
4,
text="block 2",
parent=block1,
type="NarrativeText",
)
page = PageLayout(
number=1,
image=MockImage(),
)
page.elements = [block1, block2]
doc = DocumentLayout.from_pages([page])
with mock.patch.object(common, "sort_page_elements") as mock_sort_page_elements:
common.document_to_element_list(doc, sortable=True, sort_mode=sort_mode)
assert mock_sort_page_elements.call_count == call_count


def test_document_to_element_list_sets_category_depth_titles():
layout_with_hierarchies = MockDocumentLayout()
elements = common.document_to_element_list(layout_with_hierarchies)
assert elements[0].metadata.category_depth == 1
assert elements[1].metadata.category_depth == 2
assert elements[2].metadata.category_depth is None
assert elements[3].metadata.category_depth == 0


def test_ocr_data_to_elements(
filename=example_doc_path("img/layout-parser-paper-fast.jpg"),
):
Expand Down
141 changes: 132 additions & 9 deletions test_unstructured/partition/pdf_image/test_pdf.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,7 @@
import math
import os
import tempfile
from dataclasses import dataclass
from pathlib import Path
from tempfile import SpooledTemporaryFile
from unittest import mock
Expand All @@ -14,6 +15,8 @@
from PIL import Image
from pytest_mock import MockFixture
from unstructured_inference.inference import layout
from unstructured_inference.inference.layout import DocumentLayout, PageLayout
from unstructured_inference.inference.layoutelement import LayoutElement

from test_unstructured.unit_utils import assert_round_trips_through_JSON, example_doc_path
from unstructured.chunking.title import chunk_by_title
Expand All @@ -32,9 +35,12 @@
)
from unstructured.errors import PageCountExceededError
from unstructured.partition import pdf, strategies
from unstructured.partition.pdf import get_uris_from_annots
from unstructured.partition.pdf_image import ocr, pdfminer_processing
from unstructured.partition.pdf_image.pdfminer_processing import get_uris_from_annots
from unstructured.partition.utils.constants import (
SORT_MODE_BASIC,
SORT_MODE_DONT,
SORT_MODE_XY_CUT,
UNSTRUCTURED_INCLUDE_DEBUG_METADATA,
PartitionStrategy,
)
Expand Down Expand Up @@ -95,6 +101,37 @@ def __init__(self, number: int, image: Image):
]


class MockSinglePageLayout(layout.PageLayout):
    """Page-layout stub that serves four canned layout elements without bounding boxes."""

    def __init__(self, number: int, image: Image.Image):
        # Only the page number and image are stored; the parent initializer is
        # not invoked here.
        self.number, self.image = number, image

    @property
    def elements(self):
        """Return a newly built list of the four canned elements on every access."""
        # (type, text) pairs, in the order the elements are returned.
        canned = (
            ("Headline", "Charlie Brown and the Great Pumpkin"),
            ("Subheadline", "The Beginning"),
            ("Text", "This time Charlie Brown had it really tricky..."),
            ("Title", "Another book title in the same page"),
        )
        return [
            LayoutElement(type=kind, text=content, bbox=None) for kind, content in canned
        ]


class MockDocumentLayout(layout.DocumentLayout):
@property
def pages(self):
Expand All @@ -104,6 +141,14 @@ def pages(self):
]


class MockSinglePageDocumentLayout(layout.DocumentLayout):
    """Document-layout stub that exposes exactly one mocked page."""

    @property
    def pages(self):
        """Return a newly built one-item list holding page number 1."""
        # Placeholder 1x1 image created with PIL mode "1".
        placeholder_image = Image.new("1", (1, 1))
        only_page = MockSinglePageLayout(number=1, image=placeholder_image)
        return [only_page]


@pytest.mark.parametrize(
("filename", "file"),
[
Expand Down Expand Up @@ -787,11 +832,14 @@ def test_combine_numbered_list(filename):


@pytest.mark.parametrize(
"filename",
[example_doc_path("pdf/layout-parser-paper-fast.pdf")],
("filename", "strategy"),
[
(example_doc_path("pdf/layout-parser-paper-fast.pdf"), "fast"),
(example_doc_path("pdf/layout-parser-paper-fast.pdf"), "hi_res"),
],
)
def test_partition_pdf_hyperlinks(filename):
elements = pdf.partition_pdf(filename=filename, strategy=PartitionStrategy.AUTO)
def test_partition_pdf_hyperlinks(filename, strategy):
elements = pdf.partition_pdf(filename=filename, strategy=strategy)
links = [
{
"text": "8",
Expand All @@ -813,11 +861,14 @@ def test_partition_pdf_hyperlinks(filename):


@pytest.mark.parametrize(
"filename",
[example_doc_path("pdf/embedded-link.pdf")],
("filename", "strategy"),
[
(example_doc_path("pdf/embedded-link.pdf"), "fast"),
(example_doc_path("pdf/embedded-link.pdf"), "hi_res"),
],
)
def test_partition_pdf_hyperlinks_multiple_lines(filename):
elements = pdf.partition_pdf(filename=filename, strategy=PartitionStrategy.AUTO)
def test_partition_pdf_hyperlinks_multiple_lines(filename, strategy):
elements = pdf.partition_pdf(filename=filename, strategy=strategy)
assert elements[-1].metadata.links[-1]["text"] == "capturing"
assert len(elements[-1].metadata.links) == 2

Expand Down Expand Up @@ -1392,3 +1443,75 @@ def test_pdf_hi_res_max_pages_argument(filename, pdf_hi_res_max_pages, expected_
pdf_hi_res_max_pages=pdf_hi_res_max_pages,
is_image=is_image,
)


def test_document_to_element_list_omits_coord_system_when_coord_points_absent():
    """Converted elements have no coordinates metadata when the layout has no bboxes."""
    doc_without_coords = MockSinglePageDocumentLayout()
    # NOTE(review): the mock layouts rebuild their pages/elements on each
    # property access, so this assignment may not persist on the objects the
    # converter later sees; the canned elements already use bbox=None -- confirm.
    layout_elements = (
        element for page in doc_without_coords.pages for element in page.elements
    )
    for layout_element in layout_elements:
        layout_element.bbox = None
    converted = pdf.document_to_element_list(doc_without_coords)
    assert converted[0].metadata.coordinates is None


@dataclass
class MockImage:
    """Minimal stand-in for an image object exposing width, height and format.

    The attributes are annotated so that ``@dataclass`` registers them as real
    fields; without annotations the decorator generates no fields and the
    values are plain class attributes. Defaults are unchanged, so
    ``MockImage()`` behaves as before, and individual values can now also be
    overridden by keyword.
    """

    width: int = 640  # default width used by the tests
    height: int = 480  # default height used by the tests
    format: str = "JPG"  # default format label used by the tests


def test_document_to_element_list_handles_parent():
    """A converted child element's ``parent_id`` equals its converted parent's ``id``."""
    parent_block = LayoutElement.from_coords(1, 2, 3, 4, text="block 1", type="NarrativeText")
    child_block = LayoutElement.from_coords(
        1, 2, 3, 4, text="block 2", parent=parent_block, type="NarrativeText"
    )
    single_page = PageLayout(number=1, image=MockImage())
    single_page.elements = [parent_block, child_block]
    document = DocumentLayout.from_pages([single_page])
    parent_element, child_element = pdf.document_to_element_list(document)
    assert child_element.metadata.parent_id == parent_element.id


@pytest.mark.parametrize(
    ("sort_mode", "call_count"),
    [
        (SORT_MODE_DONT, 0),
        (SORT_MODE_BASIC, 1),
        (SORT_MODE_XY_CUT, 1),
    ],
)
def test_document_to_element_list_doesnt_sort_on_sort_method(sort_mode, call_count):
    """``sort_page_elements`` is called ``call_count`` times for the given ``sort_mode``."""
    parent_block = LayoutElement.from_coords(1, 2, 3, 4, text="block 1", type="NarrativeText")
    child_block = LayoutElement.from_coords(
        1, 2, 3, 4, text="block 2", parent=parent_block, type="NarrativeText"
    )
    single_page = PageLayout(number=1, image=MockImage())
    single_page.elements = [parent_block, child_block]
    document = DocumentLayout.from_pages([single_page])
    # Replace the sorter on the pdf module so its invocations can be counted.
    with mock.patch.object(pdf, "sort_page_elements") as sort_spy:
        pdf.document_to_element_list(document, sortable=True, sort_mode=sort_mode)
    assert sort_spy.call_count == call_count


def test_document_to_element_list_sets_category_depth_titles():
    """The first four converted elements get category depths 1, 2, None and 0."""
    hierarchy_doc = MockSinglePageDocumentLayout()
    converted = pdf.document_to_element_list(hierarchy_doc)
    first, second, third, fourth = (
        converted[position].metadata.category_depth for position in range(4)
    )
    assert first == 1
    assert second == 2
    assert third is None
    assert fourth == 0
Original file line number Diff line number Diff line change
@@ -0,0 +1,20 @@
[
{
"type": "Table",
"element_id": "29fba2aa35cbdea208791e942ac3c40c",
"text": "_id title reviewid creationdate criticname originalscore reviewstate reviewtext 25b75f1d-a2ea-4c97-b75f-1da2eadc97f7 City Hunter: Shinjuku Private Eyes 2558908 2019-02-14 Matt Schley 2.5/5 rotten The film's out-of-touch attempts at humor may find them hunting for the reason the franchise was so popular in the first place.",
"metadata": {
"text_as_html": "<table><tr><td>_id</td><td>title</td><td>reviewid</td><td>creationdate</td><td>criticname</td><td>originalscore</td><td>reviewstate</td><td>reviewtext</td></tr><tr><td>25b75f1d-a2ea-4c97-b75f-1da2eadc97f7</td><td>City Hunter: Shinjuku Private Eyes</td><td>2558908</td><td>2019-02-14</td><td>Matt Schley</td><td>2.5/5</td><td>rotten</td><td>The film's out-of-touch attempts at humor may find them hunting for the reason the franchise was so popular in the first place.</td></tr></table>",
"languages": [
"eng"
],
"filetype": "text/csv",
"data_source": {
"record_locator": {
"document_id": "25b75f1d-a2ea-4c97-b75f-1da2eadc97f7"
},
"filesize_bytes": 326
}
}
}
]
Original file line number Diff line number Diff line change
@@ -0,0 +1,20 @@
[
{
"type": "Table",
"element_id": "b3b034c9f8fb0ab442599982063f0590",
"text": "_id title reviewid creationdate criticname originalscore reviewstate reviewtext 60297eea-73d7-4fca-a97e-ea73d7cfca62 City Hunter: Shinjuku Private Eyes 2590987 2019-05-28 Reuben Baron fresh The choreography is so precise and lifelike at points one might wonder whether the movie was rotoscoped, but no live-action reference footage was used. The quality is due to the skill of the animators and Kodama's love for professional wrestling.",
"metadata": {
"text_as_html": "<table><tr><td>_id</td><td>title</td><td>reviewid</td><td>creationdate</td><td>criticname</td><td>originalscore</td><td>reviewstate</td><td>reviewtext</td></tr><tr><td>60297eea-73d7-4fca-a97e-ea73d7cfca62</td><td>City Hunter: Shinjuku Private Eyes</td><td>2590987</td><td>2019-05-28</td><td>Reuben Baron</td><td/><td>fresh</td><td>The choreography is so precise and lifelike at points one might wonder whether the movie was rotoscoped, but no live-action reference footage was used. The quality is due to the skill of the animators and Kodama's love for professional wrestling.</td></tr></table>",
"languages": [
"eng"
],
"filetype": "text/csv",
"data_source": {
"record_locator": {
"document_id": "60297eea-73d7-4fca-a97e-ea73d7cfca62"
},
"filesize_bytes": 442
}
}
}
]
Loading

0 comments on commit df156eb

Please sign in to comment.